Create your first merit

Create a file named merit_store_chatbot.py:
import merit
from merit.predicates import has_unsupported_facts, follows_policy
@merit.sut
def store_chatbot(prompt: str) -> str:
    """System under test: forward the user prompt to the LLM and return its reply.

    NOTE(review): `call_llm` is assumed to be defined elsewhere in the project.
    """
    return call_llm(prompt)
async def merit_chatbot_no_hallucinations(store_chatbot):
    """Check one answer is grounded in store facts and obeys the pricing policy."""
    context = "Store hours: 9 AM - 6 PM, Monday-Saturday. Closed Sundays."
    policy = "Agent provides prices only by calling an 'offer_product' tool"

    response = store_chatbot("When are you open?")

    # The answer must not introduce facts absent from the reference context.
    grounded = not await has_unsupported_facts(response, context)
    assert grounded
    # It must also respect the prices-via-tool policy.
    assert await follows_policy(response, policy)
Cover more questions

Add cases to test multiple scenarios:
import merit
from merit import Case
from merit.predicates import has_unsupported_facts, follows_policy
@merit.sut
def store_chatbot(prompt: str) -> str:
    """System under test: forward the user prompt to the LLM and return its reply.

    NOTE(review): `call_llm` is assumed to be defined elsewhere in the project.
    """
    return call_llm(prompt)
# (prompt, reference context) pairs covering the main support topics.
_scenarios = [
    (
        "When are you open?",
        "Store hours:\n9 AM - 6 PM, Monday-Saturday.\nClosed Sundays.",
    ),
    ("Return policy?", "30-day returns with receipt."),
    ("Shipping cost?", "Free shipping over $50."),
]
cases = [
    Case(sut_input_values={"prompt": prompt}, references={"context": context})
    for prompt, context in _scenarios
]
@merit.iter_cases(*cases)
async def merit_chatbot_no_hallucinations(case: Case, store_chatbot):
    """Run one case: check groundedness plus pricing-policy conformance."""
    policy = "Agent provides prices only by calling an 'offer_product' tool"
    response = store_chatbot(**case.sut_input_values)

    reference = case.references["context"]
    # The answer must not add facts beyond the case's reference context.
    assert not await has_unsupported_facts(response, reference)
    assert await follows_policy(response, policy)
Track quality with metrics

Add a metric to enforce 80% accuracy across all cases:
import merit
from merit import Case, Metric, metrics
from merit.predicates import has_unsupported_facts, follows_policy
@merit.sut
def store_chatbot(prompt: str) -> str:
    """System under test: forward the user prompt to the LLM and return its reply.

    NOTE(review): `call_llm` is assumed to be defined elsewhere in the project.
    """
    return call_llm(prompt)
@merit.metric
def accuracy():
    """Aggregate per-case scores; require the mean to exceed 0.8 after the run."""
    acc = Metric()
    yield acc  # scores accumulate here while the cases execute
    assert acc.mean > 0.8
# Each scenario is a (prompt, reference context) pair.
_case_specs = [
    (
        "When are you open?",
        "Store hours:\n9 AM - 6 PM, Monday-Saturday.\nClosed Sundays.",
    ),
    ("Return policy?", "30-day returns with receipt."),
    ("Shipping cost?", "Free shipping over $50."),
]
cases = [
    Case(sut_input_values={"prompt": p}, references={"context": ctx})
    for p, ctx in _case_specs
]
@merit.iter_cases(*cases)
async def merit_chatbot_no_hallucinations(
    case: Case,
    store_chatbot,
    accuracy: Metric,
):
    """Run one case, scoring the groundedness check into the accuracy metric."""
    response = store_chatbot(**case.sut_input_values)

    # Only the hallucination check contributes to the accuracy metric.
    with metrics(accuracy):
        reference = case.references["context"]
        assert not await has_unsupported_facts(response, reference)

    # Policy conformance is a hard requirement, tracked outside the metric.
    policy = "Agent provides prices only by calling an 'offer_product' tool"
    assert await follows_policy(response, policy)
Assert on trace spans

Inject trace_context to verify the chatbot actually called the right tools:
import merit
from merit import Case, Metric, metrics
from merit.predicates import has_unsupported_facts, follows_policy
@merit.sut
def store_chatbot(prompt: str) -> str:
    """System under test: forward the user prompt to the LLM and return its reply.

    NOTE(review): `call_llm` is assumed to be defined elsewhere in the project.
    """
    return call_llm(prompt)
@merit.metric
def accuracy():
    """Collect per-case scores and assert a mean strictly above 0.8 at teardown."""
    collector = Metric()
    yield collector  # filled in while cases run
    assert collector.mean > 0.8
cases = [
    Case(
        sut_input_values={"prompt": "When are you open?"},
        references={
            "context": "Store hours:\n9 AM - 6 PM, Monday-Saturday.\nClosed Sundays.",
        },
    ),
    Case(
        sut_input_values={"prompt": "Return policy?"},
        references={"context": "30-day returns with receipt."},
    ),
    # Pricing question: the agent is expected to answer via the
    # 'offer_product' tool, so this case also declares an expected tool.
    Case(
        sut_input_values={"prompt": "How much for the Nike Air Max?"},
        references={
            "context": "Nike Air Max: $129.99",
            "expected_tool": "offer_product",
        },
    ),
]
@merit.iter_cases(*cases)
async def merit_chatbot_no_hallucinations(
    case: Case,
    store_chatbot,
    accuracy: Metric,
    trace_context,
):
    """Run one case: score groundedness into `accuracy`, then verify tool usage.

    When a case declares an `expected_tool` reference, the traced LLM calls
    made by the SUT must show that tool being offered to the model.
    """
    response = store_chatbot(**case.sut_input_values)

    # Only the hallucination check contributes to the accuracy metric.
    with metrics(accuracy):
        assert not await has_unsupported_facts(
            response,
            case.references["context"],
        )

    # Verify the expected tool appears on the traced LLM calls.
    # (Fixed: the original also fetched `trace_context.get_sut_spans(...)`
    # into a variable that was never used — dead code removed.)
    # NOTE(review): this reads only the first declared function per call
    # ("...functions.0.name") — confirm calls never declare multiple tools.
    if expected_tool := case.references.get("expected_tool"):
        tool_names = [
            s.attributes.get("llm.request.functions.0.name")
            for s in trace_context.get_llm_calls()
            if s.attributes
        ]
        assert expected_tool in tool_names