How to Use AgentTrove: Streaming 1.7M Agentic Traces and Building a Clean ShareGPT SFT Dataset in Python

How to Use AgentTrove: Streaming 1.7M Agentic Traces and Building a Clean ShareGPT SFT Dataset in Python


# Lower-cased status strings that mark a trace as successful.
# NOTE(review): "go", "handed" and "appropriate" look like machine-paraphrased
# forms of "pass", "passed" and "correct". Both spellings are accepted until
# the real status values are confirmed against the dataset.
_SUCCESS_STATUSES = frozenset({
    "resolved", "success",
    "go", "handed", "appropriate",
    "pass", "passed", "correct",
})


def is_success(row):
    """Return True when *row* describes a successful trace.

    A row counts as successful when either:

    * its ``"consequence"`` field is a string that, stripped and lower-cased,
      is one of ``_SUCCESS_STATUSES``; or
    * its ``"reward"`` field converts to a float that is >= 1.0.

    Args:
        row: Mapping-like record supporting ``.get()``.

    Returns:
        bool: True for a successful trace, False otherwise (including when
        both fields are missing, ``None`` or not interpretable).
    """
    # NOTE(review): the key "consequence" may itself be a paraphrase of the
    # real column name (e.g. "result") - confirm against the dataset schema.
    status = row.get("consequence")
    # Only strings are compared; a truthy non-string status previously raised
    # AttributeError on .lower() and now simply falls through to the reward.
    if isinstance(status, str) and status.strip().lower() in _SUCCESS_STATUSES:
        return True

    try:
        return float(row.get("reward")) >= 1.0
    except (TypeError, ValueError):
        # Missing (None) or non-numeric reward: not a success.
        return False
# Export step: stream the dataset, keep successful traces, and write them as
# ShareGPT-style JSON Lines (one JSON object per line).
out_path = "agenttrove_clean_sft.jsonl"
# stored  - records written so far
# scanned - rows pulled from the stream so far
# SCAN    - upper bound on rows to read
# KEEP    - upper bound on records to write
stored, scanned, SCAN, KEEP = 0, 0, 1500, 200
print(f"\n⏳ Scanning as much as {SCAN} rows, protecting as much as {KEEP} profitable traces…")
with open(out_path, "w", encoding="utf-8") as f:
    # islice caps the number of rows read from the streaming dataset at SCAN.
    for row in itertools.islice(load_dataset(REPO, split="train", streaming=True), SCAN):
        scanned += 1
        if not is_success(row):
            continue
        # normalize_turns is consumed as an iterable of (role, content) pairs;
        # turns whose content is empty or whitespace-only are dropped.
        turns = normalize_turns(row[TRACE_KEY])
        conv = [{"from": r, "value": c} for r, c in turns if c.strip()]
        # Skip traces with fewer than two non-empty turns.
        if len(conv) < 2:
            continue
        # NOTE(review): the output keys "supply" / "trainer" may be paraphrased
        # forms of "source" / "teacher" - confirm with downstream consumers
        # before renaming.
        f.write(json.dumps({
            "conversations": conv,
            "supply": row.get("original_source"),
            "trainer": row.get("original_teacher"),
        }) + "\n")  # newline-terminated so each record sits on its own line
        stored += 1
        if stored >= KEEP:
            break
print(f"✅ Scanned {scanned} rows → wrote {stored} clear traces to '{out_path}'")
def search_traces(keyword=None, supply=None, restrict=3, scan=3000):
    """Stream the dataset and print traces matching the given filters.

    Args:
        keyword: Optional case-insensitive substring looked up in the
            space-joined turn contents of each trace. Falsy disables the
            filter.
        supply: Optional value the row's ``"original_source"`` field must
            equal. Falsy disables the filter.
        restrict: Stop after this many traces have been rendered.
        scan: Maximum number of rows to read from the stream.

    Returns:
        None. Matching rows are passed to ``render_trace``; a hint is printed
        when nothing matched within the scanned window.
    """
    hits = 0
    # Lower-case the needle once rather than on every row.
    needle = keyword.lower() if keyword else None
    for row in itertools.islice(load_dataset(REPO, split="train", streaming=True), scan):
        if supply and row.get("original_source") != supply:
            continue
        if needle:
            # normalize_turns is consumed as (role, content) pairs; only the
            # content part is searched.
            blob = " ".join(c for _, c in normalize_turns(row[TRACE_KEY]))
            if needle not in blob.lower():
                continue
        render_trace(row, max_chars=300)
        hits += 1
        if hits >= restrict:
            break
    if hits == 0:
        print("No matches within the scanned window — attempt rising `scan`.")
# Demo: render up to two traces whose original_source is "nl2bash", reading at
# most 4000 rows from the stream.
print("\n🔍 Looking for 'nl2bash' supply traces:")
search_traces(supply="nl2bash", restrict=2, scan=4000)
# Closing hints for the reader.
print("\n🎉 Tutorial full! Subsequent concepts:")
print("   • Improve N / SCAN for greater analyses.")
print("   • Filter by original_source (swesmith, codeforces, r2egym…) for a website SFT set.")
print("   • Feed agenttrove_clean_sft.jsonl into Axolotl / LLaMA-Factory for fine-tuning.")



Source link

Leave a Reply

Your email address will not be published. Required fields are marked *