shank committed on
Commit Β·
b92ad01
1
Parent(s): 85f14d3
Add Gradio training monitor and fix subprocess python path
Browse files- app.py: Gradio Space that streams GRPO training log live
- requirements.txt: training deps (unsloth, trl, wandb, gradio, etc.)
- README.md: sdk switched to gradio for training Space
- calibrate.py: fix python → sys.executable for portability
- train_grpo.py: fix python subprocess + add shutil.which fallback
Made-with: Cursor
- README.md +6 -6
- app.py +106 -0
- calibrate.py +2 -1
- requirements.txt +9 -10
- training/train_grpo.py +129 -42
README.md
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
---
|
| 2 |
-
title: AgentDebugger-
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
---
|
|
|
|
| 1 |
---
|
| 2 |
+
title: AgentDebugger-Training π§
|
| 3 |
+
emoji: π§
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: app.py
|
| 8 |
pinned: true
|
| 9 |
license: mit
|
| 10 |
---
|
app.py
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AgentDebuggerEnv — Training Monitor
|
| 3 |
+
Gradio UI that boots GRPO training in a background process and streams live status.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import subprocess
|
| 7 |
+
import threading
|
| 8 |
+
import gradio as gr
|
| 9 |
+
import os
|
| 10 |
+
import json
|
| 11 |
+
import sys
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
# ββ Start training in background βββββββββββββββββββββββββββββββββββββββββββββββ
|
| 15 |
+
training_log: list[str] = []
|
| 16 |
+
training_proc: subprocess.Popen | None = None
|
| 17 |
+
training_started_at: float = time.time()
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def _stream_training() -> None:
    """Launch GRPO training as a subprocess and stream its output live.

    Runs ``training/train_grpo.py`` with the same interpreter as this app
    (``sys.executable``) and appends each output line to the shared
    ``training_log`` buffer, capped at 300 lines, so ``check_status()``
    can render a live tail in the UI.  Intended to run in a daemon thread.
    """
    global training_proc
    script = os.path.join(os.path.dirname(__file__), "training", "train_grpo.py")
    try:
        training_proc = subprocess.Popen(
            [sys.executable, script],
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # interleave stderr into the same stream
            text=True,
            bufsize=1,  # line-buffered: lines arrive as the trainer prints them
        )
    except OSError as exc:
        # Surface launch failures in the UI log instead of dying silently
        # inside the background thread.
        training_log.append(f"FAILED to start training subprocess: {exc}")
        return

    assert training_proc.stdout is not None  # guaranteed by stdout=PIPE
    for raw_line in training_proc.stdout:
        training_log.append(raw_line.rstrip())
        if len(training_log) > 300:
            # Trim in one slice-delete instead of repeated O(n) pop(0) calls.
            del training_log[: len(training_log) - 300]
    training_proc.wait()
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
# Start streaming immediately at import time; daemon=True lets the Space
# process exit without waiting on the log-reader thread.
training_thread = threading.Thread(target=_stream_training, daemon=True)
training_thread.start()
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
# ββ Status checker βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 43 |
+
def check_status() -> str:
    """Build a human-readable status report for the training run.

    Combines elapsed time, subprocess state, baseline metrics (if the
    trainer has written ``baseline_results.json``), the checkpoint
    inventory, and the last 40 lines of the live training log into one
    text blob for the Gradio status box.

    Returns:
        The multi-line status report as a single string.
    """
    import re  # local import: only needed for checkpoint-step parsing

    lines: list[str] = []
    elapsed = int(time.time() - training_started_at)
    lines.append(f"Elapsed: {elapsed // 60}m {elapsed % 60}s")

    if training_proc is None:
        lines.append("Status: starting up (give it ~2 minutes)...")
    else:
        code = training_proc.poll()
        if code is None:
            lines.append("Status: TRAINING RUNNING")
        else:
            lines.append(f"Status: {'COMPLETED' if code == 0 else f'EXITED (code {code})'}")

    if os.path.exists("baseline_results.json"):
        try:
            with open("baseline_results.json") as f:
                baseline = json.load(f)
            lines.append(f"\nBaseline solve rate : {baseline['solve_rate']:.1%}")
            lines.append(f"Baseline avg reward : {baseline['avg_reward']:.3f}")
        except Exception:
            # Best-effort: the file may be mid-write or malformed; skip metrics.
            pass

    if os.path.exists("checkpoints"):
        def _step(name: str) -> tuple[int, str]:
            # "checkpoint-90" must sort before "checkpoint-100": plain
            # lexicographic sort reports the wrong "latest" checkpoint, so
            # key on the trailing step number when present.
            m = re.search(r"(\d+)$", name)
            return (int(m.group(1)) if m else -1, name)

        ckpts = sorted(
            (d for d in os.listdir("checkpoints") if os.path.isdir(f"checkpoints/{d}")),
            key=_step,
        )
        if ckpts:
            lines.append(f"\nLatest checkpoint : {ckpts[-1]}")
            lines.append(f"Total checkpoints : {len(ckpts)}")

    if os.path.exists("final_model"):
        lines.append("\nFinal model saved — training complete!")

    lines.append("\n" + "─" * 50)
    lines.append("Recent log (last 40 lines):")
    lines.extend(training_log[-40:] if training_log else ["(no output yet)"])

    return "\n".join(lines)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ββ Gradio UI ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 84 |
+
# ── Gradio UI ──────────────────────────────────────────────────────────────────
with gr.Blocks(title="AgentDebuggerEnv Training Monitor") as demo:
    gr.Markdown(
        """
        # AgentDebuggerEnv — GRPO Training Monitor
        Training **Qwen2.5-Coder-7B-Instruct** on structured hypothesis-driven debugging.
        - Algorithm: GRPO (same as DeepSeek-R1)
        - Dataset: 90 hand-validated bugs across 3 difficulty tiers
        - Curriculum: Tier 1 (steps 0-300) → Tier 1+2 (300-600) → All tiers (600+)
        """
    )
    # Read-only text area tall enough to show the whole status report.
    status_box = gr.Textbox(
        label="Training Status",
        lines=50,
        max_lines=50,
        interactive=False,
    )
    refresh_btn = gr.Button("Refresh Status")
    refresh_btn.click(fn=check_status, outputs=status_box)

    # Poll automatically every 30s so the Space updates without user clicks.
    demo.load(fn=check_status, outputs=status_box, every=30)

# Guard the launch so importing this module (e.g. by a Spaces runner that
# serves `demo` itself) does not start a second server at import time.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|
calibrate.py
CHANGED
|
@@ -2,6 +2,7 @@ import json
|
|
| 2 |
import subprocess
|
| 3 |
import tempfile
|
| 4 |
import os
|
|
|
|
| 5 |
|
| 6 |
def test_passes(code, func, inp, expected):
|
| 7 |
if isinstance(inp, (list, tuple)):
|
|
@@ -23,7 +24,7 @@ except Exception as e:
|
|
| 23 |
f.write(script)
|
| 24 |
fname = f.name
|
| 25 |
r = subprocess.run(
|
| 26 |
-
[
|
| 27 |
capture_output=True, text=True, timeout=5
|
| 28 |
)
|
| 29 |
os.unlink(fname)
|
|
|
|
| 2 |
import subprocess
|
| 3 |
import tempfile
|
| 4 |
import os
|
| 5 |
+
import sys
|
| 6 |
|
| 7 |
def test_passes(code, func, inp, expected):
|
| 8 |
if isinstance(inp, (list, tuple)):
|
|
|
|
| 24 |
f.write(script)
|
| 25 |
fname = f.name
|
| 26 |
r = subprocess.run(
|
| 27 |
+
[sys.executable, fname],
|
| 28 |
capture_output=True, text=True, timeout=5
|
| 29 |
)
|
| 30 |
os.unlink(fname)
|
requirements.txt
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
openenv-core>=0.2.0
|
|
|
|
| 1 |
+
gradio>=4.0
|
| 2 |
+
pydantic>=2.0
|
| 3 |
+
wandb
|
| 4 |
+
datasets
|
| 5 |
+
transformers>=4.40
|
| 6 |
+
accelerate>=0.30
|
| 7 |
+
trl>=0.12
|
| 8 |
+
torch>=2.1
|
| 9 |
+
unsloth
|
|
|
training/train_grpo.py
CHANGED
|
@@ -5,7 +5,10 @@ Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
|
|
| 5 |
GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
|
| 6 |
|
| 7 |
Usage:
|
| 8 |
-
#
|
|
|
|
|
|
|
|
|
|
| 9 |
python training/train_grpo.py --test
|
| 10 |
|
| 11 |
# Full training run:
|
|
@@ -22,11 +25,13 @@ import argparse
|
|
| 22 |
import random
|
| 23 |
import subprocess
|
| 24 |
import tempfile
|
| 25 |
-
import
|
| 26 |
|
| 27 |
# ββ Parse args ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 28 |
parser = argparse.ArgumentParser()
|
| 29 |
-
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing")
|
|
|
|
|
|
|
| 30 |
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
|
| 31 |
parser.add_argument("--max_steps", type=int, default=1000)
|
| 32 |
args = parser.parse_args()
|
|
@@ -36,12 +41,14 @@ args = parser.parse_args()
|
|
| 36 |
if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
|
| 37 |
os.system("pip install -q unsloth trl wandb datasets")
|
| 38 |
|
| 39 |
-
# ββ
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
from
|
| 44 |
-
from
|
|
|
|
|
|
|
| 45 |
|
| 46 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 47 |
from server.reward_calculator import DebugRewardCalculator
|
|
@@ -54,7 +61,7 @@ MAX_STEPS = 10 if args.test else args.max_steps
|
|
| 54 |
CHECKPOINT_DIR = "./checkpoints"
|
| 55 |
|
| 56 |
# W&B β optional but strongly recommended for judging
|
| 57 |
-
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
|
| 58 |
if WANDB_API_KEY:
|
| 59 |
wandb.init(
|
| 60 |
project="AgentDebuggerEnv",
|
|
@@ -114,6 +121,118 @@ def bug_to_prompt(bug: dict) -> str:
|
|
| 114 |
f"<|im_start|>assistant\n"
|
| 115 |
)
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
# ββ Load model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 118 |
print(f"Loading {MODEL_NAME}...")
|
| 119 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
@@ -178,38 +297,6 @@ def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[floa
|
|
| 178 |
|
| 179 |
return rewards
|
| 180 |
|
| 181 |
-
def _run_fix(proposed_code: str, bug: dict) -> dict:
|
| 182 |
-
"""Safely run proposed fix with subprocess timeout."""
|
| 183 |
-
test_cases = bug.get("test_cases", [])
|
| 184 |
-
func_name = bug.get("function_name", "")
|
| 185 |
-
if not proposed_code or not test_cases or not func_name:
|
| 186 |
-
return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
|
| 187 |
-
|
| 188 |
-
passed = 0
|
| 189 |
-
for test in test_cases:
|
| 190 |
-
inp = test["input"]
|
| 191 |
-
args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
|
| 192 |
-
script = (
|
| 193 |
-
f"{proposed_code}\n"
|
| 194 |
-
f"try:\n"
|
| 195 |
-
f" r={func_name}({args_str})\n"
|
| 196 |
-
f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
|
| 197 |
-
f"except Exception as e:\n"
|
| 198 |
-
f" print(f'ERROR: {{e}}')\n"
|
| 199 |
-
)
|
| 200 |
-
try:
|
| 201 |
-
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 202 |
-
f.write(script)
|
| 203 |
-
fname = f.name
|
| 204 |
-
r = subprocess.run(["python", fname], capture_output=True, text=True, timeout=5)
|
| 205 |
-
os.unlink(fname)
|
| 206 |
-
if "PASS" in r.stdout:
|
| 207 |
-
passed += 1
|
| 208 |
-
except Exception:
|
| 209 |
-
pass
|
| 210 |
-
|
| 211 |
-
return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
|
| 212 |
-
|
| 213 |
# ββ Baseline evaluation (run BEFORE training) βββββββββββββββββββββββββββββββββ
|
| 214 |
def run_baseline(n: int = 20) -> dict:
|
| 215 |
print("\nRunning baseline evaluation on UNTRAINED model...")
|
|
|
|
| 5 |
GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
|
| 6 |
|
| 7 |
Usage:
|
| 8 |
+
# Local reward sanity-check (no GPU, no model loading):
|
| 9 |
+
python training/train_grpo.py --test-local
|
| 10 |
+
|
| 11 |
+
# Test run (Colab/GPU, 10 steps):
|
| 12 |
python training/train_grpo.py --test
|
| 13 |
|
| 14 |
# Full training run:
|
|
|
|
| 25 |
import random
|
| 26 |
import subprocess
|
| 27 |
import tempfile
|
| 28 |
+
import shutil
|
| 29 |
|
| 30 |
# ββ Parse args ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 31 |
parser = argparse.ArgumentParser()
|
| 32 |
+
parser.add_argument("--test", action="store_true", help="Run 10 steps for testing (Colab/GPU)")
|
| 33 |
+
parser.add_argument("--test-local", action="store_true", dest="test_local",
|
| 34 |
+
help="Sanity-check reward function locally without any model or GPU")
|
| 35 |
parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
|
| 36 |
parser.add_argument("--max_steps", type=int, default=1000)
|
| 37 |
args = parser.parse_args()
|
|
|
|
| 41 |
if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
|
| 42 |
os.system("pip install -q unsloth trl wandb datasets")
|
| 43 |
|
| 44 |
+
# ββ GPU/training imports (skipped in --test-local mode) βββββββββββββββββββββββ
|
| 45 |
+
if not args.test_local:
|
| 46 |
+
import torch
|
| 47 |
+
import wandb
|
| 48 |
+
from datasets import Dataset
|
| 49 |
+
from unsloth import FastLanguageModel
|
| 50 |
+
from trl import GRPOTrainer, GRPOConfig
|
| 51 |
+
from transformers import TrainerCallback
|
| 52 |
|
| 53 |
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 54 |
from server.reward_calculator import DebugRewardCalculator
|
|
|
|
| 61 |
CHECKPOINT_DIR = "./checkpoints"
|
| 62 |
|
| 63 |
# W&B β optional but strongly recommended for judging
|
| 64 |
+
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "") if not args.test_local else ""
|
| 65 |
if WANDB_API_KEY:
|
| 66 |
wandb.init(
|
| 67 |
project="AgentDebuggerEnv",
|
|
|
|
| 121 |
f"<|im_start|>assistant\n"
|
| 122 |
)
|
| 123 |
|
| 124 |
+
def _run_fix(proposed_code: str, bug: dict) -> dict:
|
| 125 |
+
"""Safely run proposed fix with subprocess timeout."""
|
| 126 |
+
test_cases = bug.get("test_cases", [])
|
| 127 |
+
func_name = bug.get("function_name", "")
|
| 128 |
+
if not proposed_code or not test_cases or not func_name:
|
| 129 |
+
return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
|
| 130 |
+
|
| 131 |
+
passed = 0
|
| 132 |
+
for test in test_cases:
|
| 133 |
+
inp = test["input"]
|
| 134 |
+
args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
|
| 135 |
+
script = (
|
| 136 |
+
f"{proposed_code}\n"
|
| 137 |
+
f"try:\n"
|
| 138 |
+
f" r={func_name}({args_str})\n"
|
| 139 |
+
f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
|
| 140 |
+
f"except Exception as e:\n"
|
| 141 |
+
f" print(f'ERROR: {{e}}')\n"
|
| 142 |
+
)
|
| 143 |
+
try:
|
| 144 |
+
with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
|
| 145 |
+
f.write(script)
|
| 146 |
+
fname = f.name
|
| 147 |
+
python = shutil.which("python3") or shutil.which("python") or sys.executable
|
| 148 |
+
r = subprocess.run([python, fname], capture_output=True, text=True, timeout=5)
|
| 149 |
+
os.unlink(fname)
|
| 150 |
+
if "PASS" in r.stdout:
|
| 151 |
+
passed += 1
|
| 152 |
+
except Exception:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
|
| 156 |
+
|
| 157 |
+
# ββ Mock completions for --test-local βββββββββββββββββββββββββββββββββββββββββ
|
| 158 |
+
MOCK_GOOD = """
|
| 159 |
+
OBSERVATION: The loop condition on line 4 uses <= instead of
|
| 160 |
+
HYPOTHESIS: This causes an off-by-one error because Python lists are
|
| 161 |
+
0-indexed, so the last valid index is len(arr)-1 not len(arr)
|
| 162 |
+
CONFIDENCE: high
|
| 163 |
+
ACTION: propose_fix
|
| 164 |
+
DETAIL: def binary_search(arr, target):
|
| 165 |
+
left, right = 0, len(arr) - 1
|
| 166 |
+
while left < right:
|
| 167 |
+
mid = (left + right) // 2
|
| 168 |
+
if arr[mid] == target:
|
| 169 |
+
return mid
|
| 170 |
+
elif arr[mid] < target:
|
| 171 |
+
left = mid + 1
|
| 172 |
+
else:
|
| 173 |
+
right = mid - 1
|
| 174 |
+
return -1
|
| 175 |
+
"""
|
| 176 |
+
|
| 177 |
+
MOCK_BAD = """
|
| 178 |
+
I think there might be a bug somewhere in the code.
|
| 179 |
+
Let me try fixing it.
|
| 180 |
+
"""
|
| 181 |
+
|
| 182 |
+
# ββ --test-local: reward sanity-check without any model βββββββββββββββββββββββ
|
| 183 |
+
if args.test_local:
|
| 184 |
+
print("=" * 60)
|
| 185 |
+
print("LOCAL TEST MODE β no model loaded, testing reward function only")
|
| 186 |
+
print("=" * 60)
|
| 187 |
+
|
| 188 |
+
bugs = load_bugs(1)
|
| 189 |
+
if not bugs:
|
| 190 |
+
print("ERROR: No bugs found in data/bugs_tier1.jsonl. Run data/generate_bugs.py first.")
|
| 191 |
+
sys.exit(1)
|
| 192 |
+
|
| 193 |
+
bug = bugs[0]
|
| 194 |
+
print(f"\nUsing bug: {bug.get('function_name', '?')} β {bug.get('bug_type', '?')}\n")
|
| 195 |
+
|
| 196 |
+
calculator_local = DebugRewardCalculator()
|
| 197 |
+
|
| 198 |
+
def _score(label: str, completion: str) -> float:
|
| 199 |
+
try:
|
| 200 |
+
agent_output = parse_agent_output(completion)
|
| 201 |
+
test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
|
| 202 |
+
if agent_output.action == "propose_fix":
|
| 203 |
+
test_results = _run_fix(agent_output.detail, bug)
|
| 204 |
+
breakdown = calculator_local.compute_turn_reward(
|
| 205 |
+
agent_output=agent_output,
|
| 206 |
+
ground_truth={
|
| 207 |
+
"bug_function": bug.get("bug_location", {}).get("function", ""),
|
| 208 |
+
"bug_line": bug.get("bug_location", {}).get("line_start", -1),
|
| 209 |
+
"bug_type": bug.get("bug_type", ""),
|
| 210 |
+
"canonical_fix_code": bug.get("original_code", ""),
|
| 211 |
+
},
|
| 212 |
+
test_results=test_results,
|
| 213 |
+
turn_number=0,
|
| 214 |
+
)
|
| 215 |
+
print(f"--- {label} reward breakdown ---")
|
| 216 |
+
for field, value in breakdown.__dict__.items():
|
| 217 |
+
print(f" {field}: {value}")
|
| 218 |
+
print(f" TOTAL: {breakdown.total}\n")
|
| 219 |
+
return breakdown.total
|
| 220 |
+
except Exception as e:
|
| 221 |
+
print(f"Reward error for {label}: {e}")
|
| 222 |
+
return -0.3
|
| 223 |
+
|
| 224 |
+
good_score = _score("MOCK_GOOD", MOCK_GOOD)
|
| 225 |
+
bad_score = _score("MOCK_BAD", MOCK_BAD)
|
| 226 |
+
|
| 227 |
+
print(f"MOCK_GOOD score: {good_score:.4f}")
|
| 228 |
+
print(f"MOCK_BAD score: {bad_score:.4f}")
|
| 229 |
+
|
| 230 |
+
assert good_score > bad_score, (
|
| 231 |
+
f"ASSERTION FAILED: MOCK_GOOD ({good_score:.4f}) should be > MOCK_BAD ({bad_score:.4f})"
|
| 232 |
+
)
|
| 233 |
+
print("\nLOCAL TEST PASSED")
|
| 234 |
+
sys.exit(0)
|
| 235 |
+
|
| 236 |
# ββ Load model ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 237 |
print(f"Loading {MODEL_NAME}...")
|
| 238 |
model, tokenizer = FastLanguageModel.from_pretrained(
|
|
|
|
| 297 |
|
| 298 |
return rewards
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
# ββ Baseline evaluation (run BEFORE training) βββββββββββββββββββββββββββββββββ
|
| 301 |
def run_baseline(n: int = 20) -> dict:
|
| 302 |
print("\nRunning baseline evaluation on UNTRAINED model...")
|