How to Use AgentTrove: Streaming 1.7M Agentic Traces and Building a Clean ShareGPT SFT Dataset in Python


def is_success(row):
    """Return True when a trace row is marked as successfully completed.

    A row is successful when its ``"result"`` field, compared
    case-insensitively and ignoring surrounding whitespace, is one of the
    accepted labels. Otherwise the ``"reward"`` field decides: it must
    convert to a float that is at least 1.0.

    Args:
        row: Mapping-like object exposing ``.get``; the keys read are
            ``"result"`` and ``"reward"``.

    Returns:
        bool: True for a successful trace, False otherwise (including when
        both fields are missing or the reward cannot be parsed as a number).
    """
    result = row.get("result")
    # Go through str() so a truthy non-string label (int, bool, ...) falls
    # through to the reward check instead of raising AttributeError on
    # .lower(); strip() tolerates labels padded with whitespace/newlines.
    label = str(result).strip().lower() if result else ""
    if label in ("resolved", "success", "pass", "passed", "correct"):
        return True

    reward = row.get("reward")
    try:
        return float(reward) >= 1.0
    except (TypeError, ValueError):
        # Missing (None) or non-numeric reward: treat as not successful.
        return False
# Stream a bounded prefix of the dataset and export the successful traces as
# JSONL records with "conversations" / "source" / "teacher" fields.
out_path = "agenttrove_clean_sft.jsonl"
SCAN, KEEP = 1500, 200
kept = scanned = 0
print(f"\n⏳ Scanning up to {SCAN} rows, keeping up to {KEEP} successful traces…")
with open(out_path, "w") as f:
    rows = itertools.islice(load_dataset(REPO, split="train", streaming=True), SCAN)
    # enumerate(start=1) keeps `scanned` equal to the number of rows pulled so
    # far, including the row on which the loop breaks.
    for scanned, row in enumerate(rows, start=1):
        if not is_success(row):
            continue
        # Drop turns whose content is empty or whitespace-only.
        conv = [
            {"from": role, "value": text}
            for role, text in normalize_turns(row[TRACE_KEY])
            if text.strip()
        ]
        if len(conv) < 2:
            # Fewer than two non-empty turns: skip the trace.
            continue
        record = {
            "conversations": conv,
            "source": row.get("original_source"),
            "teacher": row.get("original_teacher"),
        }
        f.write(json.dumps(record) + "\n")
        kept += 1
        if kept >= KEEP:
            break
print(f"✅ Scanned {scanned} rows → wrote {kept} clean traces to '{out_path}'")
def search_traces(keyword=None, source=None, limit=3, scan=3000):
    """Stream the dataset and print the first traces that match the filters.

    Args:
        keyword: Optional substring, matched case-insensitively against the
            space-joined content of every turn returned by
            ``normalize_turns`` for the row. A falsy value disables the
            keyword filter.
        source: Optional value the row's ``"original_source"`` must equal.
            A falsy value disables the source filter.
        limit: Maximum number of matching traces to render. A non-positive
            value renders nothing and returns without opening the stream.
        scan: Maximum number of rows pulled from the stream.

    Returns:
        None. Each match is passed to ``render_trace(row, max_chars=300)``;
        a hint is printed when the scanned window contains no match.
    """
    if limit <= 0:
        # The loop only checks the limit *after* rendering a hit, so without
        # this guard a limit of 0 (or below) would still print one trace.
        return

    # Lower-case the keyword once instead of once per scanned row.
    needle = keyword.lower() if keyword else None

    hits = 0
    for row in itertools.islice(load_dataset(REPO, split="train", streaming=True), scan):
        if source and row.get("original_source") != source:
            continue
        if needle is not None:
            blob = " ".join(c for _, c in normalize_turns(row[TRACE_KEY]))
            if needle not in blob.lower():
                continue
        render_trace(row, max_chars=300)
        hits += 1
        if hits >= limit:
            break
    if hits == 0:
        print("No matches in the scanned window — try increasing `scan`.")
# Demo: show up to two traces whose original_source is "nl2bash".
print("\n🔍 Searching for 'nl2bash' source traces:")
search_traces(source="nl2bash", limit=2, scan=4000)

# Closing message: one line per suggestion, emitted by a single print call.
print(
    "\n🎉 Tutorial complete! Next ideas:",
    "   • Increase N / SCAN for bigger analyses.",
    "   • Filter by original_source (swesmith, codeforces, r2egym…) for a domain SFT set.",
    "   • Feed agenttrove_clean_sft.jsonl into Axolotl / LLaMA-Factory for fine-tuning.",
    sep="\n",
)



Source link

  • Related Posts

    NVIDIA Introduces X-Token: Projection-Guided Cross-Tokenizer KD That Outperforms GOLD by +3.82 Average Points on Llama-3.2-1B

    Knowledge distillation (KD) transfers “dark knowledge” from a large teacher model to a smaller student. The student learns from the teacher’s full output probability distribution over tokens, not just correct…

    StepFun Releases Step 3.7 Flash: A 198B MoE Vision-Language Model for Coding Agents and Search Workflows

    StepFun today released Step 3.7 Flash, a multimodal Mixture-of-Experts model targeting agentic use cases. It adds native vision input and improved tool-use reliability over Step 3.5 Flash. What is Step…

    Leave a Reply

    Your email address will not be published. Required fields are marked *