import asyncio
import weave
from verdict import Pipeline
from verdict.common.judge import JudgeUnit
from verdict.schema import Schema

# Initialize Weave to trace Verdict calls in your project
# highlight-next-line
weave.init("verdict_demo")

# Define a Weave Model that wraps a Verdict judge pipeline
class SentimentEvaluator(weave.Model):
    @weave.op()
    async def predict(self, text: str) -> dict:
        # Build a one-step Verdict pipeline with a single judge unit;
        # {source.text} refers to the "text" field of the input Schema
        pipeline = Pipeline()
        pipeline = pipeline >> JudgeUnit().prompt(
            "Classify sentiment as positive, negative, or neutral: {source.text}"
        )

        # Wrap the input text in a Verdict Schema and run the pipeline
        data = Schema.of(text=text)
        result = pipeline.run(data)

        return {"sentiment": result}
# Test data
texts = [
    "I love this product, it's amazing!",
    "This is terrible, worst purchase ever.",
    "The weather is okay today."
]
labels = ["positive", "negative", "neutral"]
examples = [
    {"id": str(i), "text": texts[i], "target": labels[i]}
    for i in range(len(texts))
]
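
# e.g. examples[0] == {"id": "0", "text": "I love this product, it's amazing!", "target": "positive"}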

# Scoring function: checks whether the expected label appears in the judge's output
@weave.op()
def sentiment_accuracy(target: str, output: dict) -> dict:
    # Coerce to str so the substring check works even if the pipeline result isn't a plain string
    predicted = str(output.get("sentiment", "")).lower()
    return {"correct": target.lower() in predicted}

# Create the model and the Weave evaluation
model = SentimentEvaluator()
evaluation = weave.Evaluation(
    dataset=examples,
    scorers=[sentiment_accuracy],
)
scores = asyncio.run(evaluation.evaluate(model))
# if you're in a Jupyter Notebook, run:
# scores = await evaluation.evaluate(model)
print(scores)
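# scores is an aggregate summary keyed by scorer name
# (for a boolean field like "correct", roughly the count and fraction of True results)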