AI & Tech

Construct a Full Langfuse Observability and Evaluation Pipeline for Tracing, Prompt Management, Scoring, and Experiments

Naveed Ahmad · 3 weeks ago · 3 weeks ago · 0 · 2 mins

# Part 5: create a small capital-city QA dataset in Langfuse for the experiment.
print("\nPART 5 ── Datasets & experiments --------------------------------------")
DATASET = "capital-cities-tutorial"
langfuse.create_dataset(name=DATASET, description="Capital-city QA benchmark")
# (question, expected answer) pairs uploaded as dataset items below.
_items = [
    ("What is the capital of France?", "Paris"),
    ("What is the capital of Germany?", "Berlin"),
    ("What is the capital of Japan?", "Tokyo"),
    ("What is the capital of Italy?", "Rome"),
]
for i, (q, a) in enumerate(_items):
    langfuse.create_dataset_item(
        dataset_name=DATASET,
        id=f"cap-{i}",  # explicit ids: cap-0, cap-1, ...
        input={"question": q},  # "question" is the key capital_task reads back
        expected_output=a,
    )
def capital_task(*, item, **kwargs):
    """Answer one dataset item's question with ``llm_chat``.

    The Langfuse experiment runner invokes the task with the keyword
    ``item``, so the parameter must carry exactly that name.

    Args:
        item: Dataset item. ``item.input`` is either a dict holding the
            question under the ``"question"`` key, or the question itself.
        **kwargs: Any additional keywords from the runner; ignored.

    Returns:
        Whatever ``llm_chat`` returns for the single-turn user message.
    """
    payload = item.input
    question = payload["question"] if isinstance(payload, dict) else payload
    # NOTE(review): llm_chat is defined outside this block; its ``identify``
    # keyword is kept unchanged here -- confirm it matches llm_chat's signature.
    return llm_chat([{"role": "user", "content": question}], identify="experiment-answer")
def accuracy(*, output, expected_output, metadata=None, **kwargs):
    """Score 1.0 when the expected answer occurs in the output, else 0.0.

    The comparison is a case-insensitive substring test. A missing or empty
    ``expected_output`` always scores 0.0, and a ``None`` output is treated
    as an empty string.

    The experiment runner passes its arguments by keyword; the item's
    ``input`` is not needed for this check and is absorbed by ``**kwargs``.

    Args:
        output: Text produced by the task (may be ``None``).
        expected_output: Reference answer for the item.
        metadata: Unused; accepted for signature compatibility.
        **kwargs: Remaining runner keywords (e.g. ``input``); ignored.

    Returns:
        An ``Evaluation`` named ``"accuracy"`` with value 1.0 or 0.0.
    """
    # Local import so this function does not rely on a module-level import.
    from langfuse import Evaluation

    hit = bool(expected_output) and expected_output.lower() in (output or "").lower()
    return Evaluation(
        name="accuracy",
        value=1.0 if hit else 0.0,
        comment="case-insensitive contains check",
    )
def conciseness(*, output, **kwargs):
    """Report the character length of the output as a score.

    The experiment runner passes its arguments by keyword; everything except
    ``output`` (e.g. ``input``, ``expected_output``) is absorbed by
    ``**kwargs`` because it is not needed here.

    Args:
        output: Text produced by the task; ``None`` counts as length 0.
        **kwargs: Remaining runner keywords; ignored.

    Returns:
        An ``Evaluation`` named ``"char_length"`` whose value is the length
        of ``output`` as a float.
    """
    # Local import so this function does not rely on a module-level import.
    from langfuse import Evaluation

    return Evaluation(name="char_length", value=float(len(output or "")))
def mean_accuracy(*, item_results, **kwargs):
    """Aggregate per-item ``"accuracy"`` scores into a run-level mean.

    Args:
        item_results: Iterable of per-item results; each exposes an
            ``evaluations`` iterable whose entries have ``name`` and
            ``value`` attributes.
        **kwargs: Any additional runner keywords; ignored.

    Returns:
        An ``Evaluation`` named ``"mean_accuracy"`` holding the average of
        all ``"accuracy"`` values, or 0.0 when there are none.
    """
    # Local import so this function does not rely on a module-level import.
    from langfuse import Evaluation

    vals = [
        e.value
        for r in item_results
        for e in r.evaluations
        if e.name == "accuracy"
    ]
    # Guard against division by zero when no accuracy evaluations exist.
    avg = sum(vals) / len(vals) if vals else 0.0
    return Evaluation(name="mean_accuracy", value=avg, comment=f"{avg:.0%} correct")
# Fetch the dataset created above and run the baseline experiment over it:
# capital_task produces each answer, the item-level evaluators score every
# output, and mean_accuracy summarises the whole run.
dataset = langfuse.get_dataset(DATASET)
outcome = dataset.run_experiment(
    name="capitals-baseline",
    description="Baseline run from the Colab tutorial",
    task=capital_task,
    evaluators=[accuracy, conciseness],
    run_evaluators=[mean_accuracy],
    max_concurrency=4,
)
print(outcome.format())

Leave a Reply Cancel reply