Decorate your function with @osmosis_rubric and provide a rubric description; the platform handles the rest.
Basic Example
File: reward_rubric/reward_rubric_openai.py
from osmosis_ai import evaluate_rubric, osmosis_rubric
import os

RUBRIC = "Reward based on whether the predicted numerical value matches the ground truth."
SCORE_MIN = 0.0
SCORE_MAX = 1.0
PROVIDER = "openai"
MODEL = "gpt-5.2"
API_KEY = os.getenv("OPENAI_API_KEY")

@osmosis_rubric
def compute_rubric_score_openai(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs
) -> float:
    """
    Delegate rubric scoring to OpenAI GPT model.
    """
    model_info = {
        "provider": PROVIDER,
        "model": MODEL,
        "api_key": API_KEY
    }

    result = evaluate_rubric(
        rubric=RUBRIC,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=SCORE_MIN,
        score_max=SCORE_MAX,
        return_details=False,
    )
    return float(result)
The evaluate_rubric Function
The evaluate_rubric() function handles the LLM-based evaluation. See the API Reference for complete parameter documentation.
Supported Providers
The example above uses OpenAI. For other providers (Anthropic, Google, xAI, OpenRouter, Cerebras), change the provider and model fields in the model_info dictionary, as sketched below. See Supported Providers for a complete list of available providers and models.
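For instance, a minimal sketch of switching to Anthropic, reusing the model name and environment variable that appear in the advanced examples later on this page (your provider's model name and API key variable may differ):

import os

# Sketch only: swap the provider/model pair and point api_key at that provider's key.
model_info = {
    "provider": "anthropic",
    "model": "claude-sonnet-4-5",
    "api_key": os.getenv("ANTHROPIC_API_KEY")
}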
Writing Effective Rubrics
Be Specific
# Vague
rubric = "Score the answer quality"
# Specific
rubric = """
Evaluate the solution based on:
1. Correctness: Does it match the ground truth? (50%)
2. Explanation: Is the reasoning clear? (30%)
3. Formatting: Is it well-structured? (20%)
Return a score from 0.0 to 1.0.
"""
Include Scoring Guidelines
rubric = """
Score the code quality from 0.0 to 1.0 based on:
- 1.0: Perfect - Correct, efficient, well-documented
- 0.7-0.9: Good - Correct with minor style issues
- 0.4-0.6: Fair - Works but has problems
- 0.0-0.3: Poor - Incorrect or seriously flawed
Ground truth: {ground_truth}
"""
Provide Examples
rubric = """
Evaluate if the SQL query correctly answers the question.
Examples:
- "SELECT * FROM users WHERE age > 18" for "users over 18" → 1.0
- "SELECT name FROM users WHERE age >= 18" for "users over 18" → 0.8 (missing users exactly 18)
- "SELECT * FROM products" for "users over 18" → 0.0 (wrong table)
Score from 0.0 (completely wrong) to 1.0 (perfect).
"""
Advanced Patterns
Multi-Dimensional Evaluation
@osmosis_rubric
def comprehensive_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs
) -> float:
    rubric = """
    Evaluate the solution across multiple dimensions:
    1. Factual Accuracy (40%): Is the information correct?
    2. Completeness (30%): Does it address all parts of the question?
    3. Clarity (20%): Is it easy to understand?
    4. Conciseness (10%): Is it appropriately brief?
    Compare against ground truth: {ground_truth}
    Return a weighted average score from 0.0 to 1.0.
    """

    model_info = {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "api_key": os.getenv("ANTHROPIC_API_KEY")
    }

    return evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=0.0,
        score_max=1.0
    )
Context-Aware Rubrics
@osmosis_rubric
def context_aware_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs
) -> float:
    # Extract context from extra_info
    difficulty = extra_info.get("metadata", {}).get("difficulty", "medium")

    rubric = f"""
    Evaluate the solution for a {difficulty} difficulty problem.
    Criteria:
    - Correctness: Must match ground truth logic
    - Approach: Should be appropriate for {difficulty} level
    - Efficiency: Expected to be {get_efficiency_requirement(difficulty)}
    Ground truth: {{ground_truth}}
    Score from 0.0 to 1.0.
    """

    model_info = {
        "provider": "openai",
        "model": "gpt-5.2",
        "api_key": os.getenv("OPENAI_API_KEY")
    }

    return evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        metadata=extra_info.get("metadata"),
        score_min=0.0,
        score_max=1.0
    )
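The example above calls get_efficiency_requirement, which is not defined in the snippet and is not part of osmosis_ai. A hypothetical sketch of such a helper, which you would adapt to your own domain:

# Hypothetical helper assumed by the context-aware example above; not part of osmosis_ai.
def get_efficiency_requirement(difficulty: str) -> str:
    expectations = {
        "easy": "any working approach",
        "medium": "reasonable time and space complexity",
        "hard": "near-optimal time and space complexity",
    }
    return expectations.get(difficulty, "reasonable time and space complexity")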
Getting Detailed Feedback
@osmosis_rubric
def detailed_rubric(
    solution_str: str,
    ground_truth: str,
    extra_info: dict,
    **kwargs
) -> float:
    rubric = "Evaluate the solution quality and provide detailed feedback."

    model_info = {
        "provider": "anthropic",
        "model": "claude-sonnet-4-5",
        "api_key": os.getenv("ANTHROPIC_API_KEY")
    }

    # Get detailed result
    result = evaluate_rubric(
        rubric=rubric,
        solution_str=solution_str,
        model_info=model_info,
        ground_truth=ground_truth,
        score_min=0.0,
        score_max=1.0,
        return_details=True  # Returns dict with score and explanation
    )

    # Log the explanation for debugging
    if isinstance(result, dict):
        print(f"Score: {result['score']}")
        print(f"Reasoning: {result['explanation']}")
        return float(result['score'])
    return float(result)