shank committed on
Commit
b92ad01
·
1 Parent(s): 85f14d3

Add Gradio training monitor and fix subprocess python path

Browse files

- app.py: Gradio Space that streams GRPO training log live
- requirements.txt: training deps (unsloth, trl, wandb, gradio, etc.)
- README.md: sdk switched to gradio for training Space
- calibrate.py: fix python → sys.executable for portability
- train_grpo.py: fix python subprocess + add shutil.which fallback

Made-with: Cursor

Files changed (5) hide show
  1. README.md +6 -6
  2. app.py +106 -0
  3. calibrate.py +2 -1
  4. requirements.txt +9 -10
  5. training/train_grpo.py +129 -42
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: AgentDebugger-Env πŸ›
3
- emoji: πŸ›
4
- colorFrom: red
5
- colorTo: yellow
6
- sdk: docker
7
- app_port: 8000
8
  pinned: true
9
  license: mit
10
  ---
 
1
  ---
2
+ title: AgentDebugger-Training 🧠
3
+ emoji: 🧠
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ app_file: app.py
8
  pinned: true
9
  license: mit
10
  ---
app.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AgentDebuggerEnv — Training Monitor
3
+ Gradio UI that boots GRPO training in a background process and streams live status.
4
+ """
5
+
6
+ import subprocess
7
+ import threading
8
+ import gradio as gr
9
+ import os
10
+ import json
11
+ import sys
12
+ import time
13
+
14
+ # ── Start training in background ───────────────────────────────────────────────
15
+ training_log: list[str] = []
16
+ training_proc: subprocess.Popen | None = None
17
+ training_started_at: float = time.time()
18
+
19
+
20
def _stream_training():
    """Launch train_grpo.py in a child process and mirror its output.

    Runs the GRPO training script with the current interpreter, merges
    stderr into stdout, and appends each line to the module-level
    ``training_log`` ring so the UI can display a live tail.
    """
    global training_proc
    script = os.path.join(os.path.dirname(__file__), "training", "train_grpo.py")
    training_proc = subprocess.Popen(
        [sys.executable, script],
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1,  # line-buffered so output reaches the monitor promptly
    )
    for raw in training_proc.stdout:
        training_log.append(raw.rstrip())
        # Bound memory: keep only the most recent 300 lines.
        while len(training_log) > 300:
            training_log.pop(0)
    training_proc.wait()


# Kick off training as soon as the Space boots; a daemon thread dies with the app.
training_thread = threading.Thread(target=_stream_training, daemon=True)
training_thread.start()
40
+
41
+
42
+ # ── Status checker ─────────────────────────────────────────────────────────────
43
# ── Status checker ─────────────────────────────────────────────────────────────
def check_status() -> str:
    """Assemble the plain-text status report shown in the Gradio textbox.

    Combines elapsed time, training-process state, baseline metrics (if
    ``baseline_results.json`` exists), checkpoint inventory, and the tail of
    the live training log into a single newline-joined string.
    """
    report: list[str] = []

    elapsed = int(time.time() - training_started_at)
    report.append(f"Elapsed: {elapsed // 60}m {elapsed % 60}s")

    if training_proc is None:
        report.append("Status: starting up (give it ~2 minutes)...")
    elif training_proc.poll() is None:
        report.append("Status: TRAINING RUNNING ✓")
    else:
        code = training_proc.poll()
        report.append(f"Status: {'COMPLETED ✓' if code == 0 else f'EXITED (code {code})'}")

    if os.path.exists("baseline_results.json"):
        try:
            with open("baseline_results.json") as fh:
                baseline = json.load(fh)
            report.append(f"\nBaseline solve rate : {baseline['solve_rate']:.1%}")
            report.append(f"Baseline avg reward : {baseline['avg_reward']:.3f}")
        except Exception:
            pass  # best-effort: a partially written file must not break the UI

    if os.path.exists("checkpoints"):
        ckpts = sorted(
            d for d in os.listdir("checkpoints") if os.path.isdir(f"checkpoints/{d}")
        )
        if ckpts:
            report.append(f"\nLatest checkpoint : {ckpts[-1]}")
            report.append(f"Total checkpoints : {len(ckpts)}")

    if os.path.exists("final_model"):
        report.append("\nFinal model saved ✓ — training complete!")

    report.append("\n" + "─" * 50)
    report.append("Recent log (last 40 lines):")
    report.extend(training_log[-40:] if training_log else ["(no output yet)"])

    return "\n".join(report)
81
+
82
+
83
+ # ── Gradio UI ──────────────────────────────────────────────────────────────────
84
# ── Gradio UI ──────────────────────────────────────────────────────────────────
_INTRO_MD = """
# AgentDebuggerEnv — GRPO Training Monitor
Training **Qwen2.5-Coder-7B-Instruct** on structured hypothesis-driven debugging.
- Algorithm: GRPO (same as DeepSeek-R1)
- Dataset: 90 hand-validated bugs across 3 difficulty tiers
- Curriculum: Tier 1 (steps 0–300) → Tier 1+2 (300–600) → All tiers (600+)
"""

with gr.Blocks(title="AgentDebuggerEnv Training Monitor") as demo:
    gr.Markdown(_INTRO_MD)

    # Single read-only textbox that holds the whole status report.
    status_box = gr.Textbox(
        label="Training Status",
        lines=50,
        max_lines=50,
        interactive=False,
    )

    refresh_btn = gr.Button("Refresh Status")
    refresh_btn.click(fn=check_status, outputs=status_box)

    # Poll every 30 s so the page stays current without manual refreshes.
    demo.load(fn=check_status, outputs=status_box, every=30)

demo.launch(server_name="0.0.0.0", server_port=7860)
calibrate.py CHANGED
@@ -2,6 +2,7 @@ import json
2
  import subprocess
3
  import tempfile
4
  import os
 
5
 
6
  def test_passes(code, func, inp, expected):
7
  if isinstance(inp, (list, tuple)):
@@ -23,7 +24,7 @@ except Exception as e:
23
  f.write(script)
24
  fname = f.name
25
  r = subprocess.run(
26
- ['python', fname],
27
  capture_output=True, text=True, timeout=5
28
  )
29
  os.unlink(fname)
 
2
  import subprocess
3
  import tempfile
4
  import os
5
+ import sys
6
 
7
  def test_passes(code, func, inp, expected):
8
  if isinstance(inp, (list, tuple)):
 
24
  f.write(script)
25
  fname = f.name
26
  r = subprocess.run(
27
+ [sys.executable, fname],
28
  capture_output=True, text=True, timeout=5
29
  )
30
  os.unlink(fname)
requirements.txt CHANGED
@@ -1,10 +1,9 @@
1
- fastapi==0.110.0
2
- uvicorn==0.29.0
3
- pydantic==2.6.4
4
- openai==2.7.2
5
- requests==2.31.0
6
- python-dotenv==1.0.1
7
- pytest==8.1.0
8
- httpx==0.27.0
9
- RestrictedPython==7.0
10
- openenv-core>=0.2.0
 
1
+ gradio>=4.0
2
+ pydantic>=2.0
3
+ wandb
4
+ datasets
5
+ transformers>=4.40
6
+ accelerate>=0.30
7
+ trl>=0.12
8
+ torch>=2.1
9
+ unsloth
 
training/train_grpo.py CHANGED
@@ -5,7 +5,10 @@ Algorithm: GRPO (Group Relative Policy Optimization) via HuggingFace TRL
5
  GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
6
 
7
  Usage:
8
- # Test run (no GPU needed, 10 steps):
 
 
 
9
  python training/train_grpo.py --test
10
 
11
  # Full training run:
@@ -22,11 +25,13 @@ import argparse
22
  import random
23
  import subprocess
24
  import tempfile
25
- import torch
26
 
27
  # ── Parse args ────────────────────────────────────────────────────────────────
28
  parser = argparse.ArgumentParser()
29
- parser.add_argument("--test", action="store_true", help="Run 10 steps for testing")
 
 
30
  parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
31
  parser.add_argument("--max_steps", type=int, default=1000)
32
  args = parser.parse_args()
@@ -36,12 +41,14 @@ args = parser.parse_args()
36
  if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
37
  os.system("pip install -q unsloth trl wandb datasets")
38
 
39
- # ── Imports ───────────────────────────────────────────────────────────────────
40
- import wandb
41
- from datasets import Dataset
42
- from unsloth import FastLanguageModel
43
- from trl import GRPOTrainer, GRPOConfig
44
- from transformers import TrainerCallback
 
 
45
 
46
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
47
  from server.reward_calculator import DebugRewardCalculator
@@ -54,7 +61,7 @@ MAX_STEPS = 10 if args.test else args.max_steps
54
  CHECKPOINT_DIR = "./checkpoints"
55
 
56
  # W&B β€” optional but strongly recommended for judging
57
- WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
58
  if WANDB_API_KEY:
59
  wandb.init(
60
  project="AgentDebuggerEnv",
@@ -114,6 +121,118 @@ def bug_to_prompt(bug: dict) -> str:
114
  f"<|im_start|>assistant\n"
115
  )
116
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  # ── Load model ────────────────────────────────────────────────────────────────
118
  print(f"Loading {MODEL_NAME}...")
119
  model, tokenizer = FastLanguageModel.from_pretrained(
@@ -178,38 +297,6 @@ def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[floa
178
 
179
  return rewards
180
 
181
- def _run_fix(proposed_code: str, bug: dict) -> dict:
182
- """Safely run proposed fix with subprocess timeout."""
183
- test_cases = bug.get("test_cases", [])
184
- func_name = bug.get("function_name", "")
185
- if not proposed_code or not test_cases or not func_name:
186
- return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
187
-
188
- passed = 0
189
- for test in test_cases:
190
- inp = test["input"]
191
- args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
192
- script = (
193
- f"{proposed_code}\n"
194
- f"try:\n"
195
- f" r={func_name}({args_str})\n"
196
- f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
197
- f"except Exception as e:\n"
198
- f" print(f'ERROR: {{e}}')\n"
199
- )
200
- try:
201
- with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
202
- f.write(script)
203
- fname = f.name
204
- r = subprocess.run(["python", fname], capture_output=True, text=True, timeout=5)
205
- os.unlink(fname)
206
- if "PASS" in r.stdout:
207
- passed += 1
208
- except Exception:
209
- pass
210
-
211
- return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
212
-
213
  # ── Baseline evaluation (run BEFORE training) ─────────────────────────────────
214
  def run_baseline(n: int = 20) -> dict:
215
  print("\nRunning baseline evaluation on UNTRAINED model...")
 
5
  GPU: HuggingFace ZeroGPU H200 (free) or paid HF Spaces A10G
6
 
7
  Usage:
8
+ # Local reward sanity-check (no GPU, no model loading):
9
+ python training/train_grpo.py --test-local
10
+
11
+ # Test run (Colab/GPU, 10 steps):
12
  python training/train_grpo.py --test
13
 
14
  # Full training run:
 
25
  import random
26
  import subprocess
27
  import tempfile
28
+ import shutil
29
 
30
  # ── Parse args ────────────────────────────────────────────────────────────────
31
  parser = argparse.ArgumentParser()
32
+ parser.add_argument("--test", action="store_true", help="Run 10 steps for testing (Colab/GPU)")
33
+ parser.add_argument("--test-local", action="store_true", dest="test_local",
34
+ help="Sanity-check reward function locally without any model or GPU")
35
  parser.add_argument("--resume", type=str, default=None, help="Path to checkpoint")
36
  parser.add_argument("--max_steps", type=int, default=1000)
37
  args = parser.parse_args()
 
41
  if os.environ.get("COLAB_RELEASE_TAG") or os.environ.get("SPACE_ID"):
42
  os.system("pip install -q unsloth trl wandb datasets")
43
 
44
+ # ── GPU/training imports (skipped in --test-local mode) ───────────────────────
45
+ if not args.test_local:
46
+ import torch
47
+ import wandb
48
+ from datasets import Dataset
49
+ from unsloth import FastLanguageModel
50
+ from trl import GRPOTrainer, GRPOConfig
51
+ from transformers import TrainerCallback
52
 
53
  sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
54
  from server.reward_calculator import DebugRewardCalculator
 
61
  CHECKPOINT_DIR = "./checkpoints"
62
 
63
  # W&B β€” optional but strongly recommended for judging
64
+ WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "") if not args.test_local else ""
65
  if WANDB_API_KEY:
66
  wandb.init(
67
  project="AgentDebuggerEnv",
 
121
  f"<|im_start|>assistant\n"
122
  )
123
 
124
+ def _run_fix(proposed_code: str, bug: dict) -> dict:
125
+ """Safely run proposed fix with subprocess timeout."""
126
+ test_cases = bug.get("test_cases", [])
127
+ func_name = bug.get("function_name", "")
128
+ if not proposed_code or not test_cases or not func_name:
129
+ return {"passed": 0, "failed": 0, "total": len(test_cases), "newly_broken": 0}
130
+
131
+ passed = 0
132
+ for test in test_cases:
133
+ inp = test["input"]
134
+ args_str = ", ".join(repr(x) for x in inp) if isinstance(inp, (list, tuple)) else repr(inp)
135
+ script = (
136
+ f"{proposed_code}\n"
137
+ f"try:\n"
138
+ f" r={func_name}({args_str})\n"
139
+ f" print('PASS' if r=={repr(test['expected_output'])} else 'FAIL')\n"
140
+ f"except Exception as e:\n"
141
+ f" print(f'ERROR: {{e}}')\n"
142
+ )
143
+ try:
144
+ with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False) as f:
145
+ f.write(script)
146
+ fname = f.name
147
+ python = shutil.which("python3") or shutil.which("python") or sys.executable
148
+ r = subprocess.run([python, fname], capture_output=True, text=True, timeout=5)
149
+ os.unlink(fname)
150
+ if "PASS" in r.stdout:
151
+ passed += 1
152
+ except Exception:
153
+ pass
154
+
155
+ return {"passed": passed, "failed": len(test_cases) - passed, "total": len(test_cases), "newly_broken": 0}
156
+
157
# ── Mock completions for --test-local ─────────────────────────────────────────
# One well-structured completion and one vague one; the reward function must
# rank MOCK_GOOD above MOCK_BAD for the sanity check to pass.
MOCK_GOOD = """
OBSERVATION: The loop condition on line 4 uses <= instead of
HYPOTHESIS: This causes an off-by-one error because Python lists are
0-indexed, so the last valid index is len(arr)-1 not len(arr)
CONFIDENCE: high
ACTION: propose_fix
DETAIL: def binary_search(arr, target):
    left, right = 0, len(arr) - 1
    while left < right:
        mid = (left + right) // 2
        if arr[mid] == target:
            return mid
        elif arr[mid] < target:
            left = mid + 1
        else:
            right = mid - 1
    return -1
"""

MOCK_BAD = """
I think there might be a bug somewhere in the code.
Let me try fixing it.
"""

# ── --test-local: reward sanity-check without any model ───────────────────────
if args.test_local:
    banner = "=" * 60
    print(banner)
    print("LOCAL TEST MODE — no model loaded, testing reward function only")
    print(banner)

    bugs = load_bugs(1)
    if not bugs:
        print("ERROR: No bugs found in data/bugs_tier1.jsonl. Run data/generate_bugs.py first.")
        sys.exit(1)

    bug = bugs[0]
    print(f"\nUsing bug: {bug.get('function_name', '?')} — {bug.get('bug_type', '?')}\n")

    calculator_local = DebugRewardCalculator()

    def _score(label: str, completion: str) -> float:
        """Score one mock completion and print its full reward breakdown."""
        try:
            agent_output = parse_agent_output(completion)
            test_results = {"passed": 0, "failed": 0, "total": 0, "newly_broken": 0}
            if agent_output.action == "propose_fix":
                test_results = _run_fix(agent_output.detail, bug)
            breakdown = calculator_local.compute_turn_reward(
                agent_output=agent_output,
                ground_truth={
                    "bug_function": bug.get("bug_location", {}).get("function", ""),
                    "bug_line": bug.get("bug_location", {}).get("line_start", -1),
                    "bug_type": bug.get("bug_type", ""),
                    "canonical_fix_code": bug.get("original_code", ""),
                },
                test_results=test_results,
                turn_number=0,
            )
            print(f"--- {label} reward breakdown ---")
            for field, value in breakdown.__dict__.items():
                print(f"  {field}: {value}")
            print(f"  TOTAL: {breakdown.total}\n")
            return breakdown.total
        except Exception as e:
            # Any parse/reward failure scores like a bad completion.
            print(f"Reward error for {label}: {e}")
            return -0.3

    good_score = _score("MOCK_GOOD", MOCK_GOOD)
    bad_score = _score("MOCK_BAD", MOCK_BAD)

    print(f"MOCK_GOOD score: {good_score:.4f}")
    print(f"MOCK_BAD score: {bad_score:.4f}")

    assert good_score > bad_score, (
        f"ASSERTION FAILED: MOCK_GOOD ({good_score:.4f}) should be > MOCK_BAD ({bad_score:.4f})"
    )
    print("\nLOCAL TEST PASSED")
    sys.exit(0)
235
+
236
  # ── Load model ────────────────────────────────────────────────────────────────
237
  print(f"Loading {MODEL_NAME}...")
238
  model, tokenizer = FastLanguageModel.from_pretrained(
 
297
 
298
  return rewards
299
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  # ── Baseline evaluation (run BEFORE training) ─────────────────────────────────
301
  def run_baseline(n: int = 20) -> dict:
302
  print("\nRunning baseline evaluation on UNTRAINED model...")