Reward Functions

Reward functions provide deterministic, numeric scoring for LLM outputs. They use the @osmosis_reward decorator and return a float value representing the quality of the output.

Basic Example

File: reward_fn/compute_reward.py
import re
from osmosis_ai import osmosis_reward

def extract_solution(solution_str):
    """Extract the number following '####' from the output, or None."""
    solution = re.search(r'####\s*([-+]?\d*\.?\d+)', solution_str)
    if not solution:
        return None
    return solution.group(1)

@osmosis_reward
def numbers_match_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """
    Reward function that checks if extracted number matches ground truth.
    Returns 1.0 for match, 0.0 otherwise.
    """
    extracted = extract_solution(solution_str)
    if extracted is None:
        return 0.0
    try:
        sol_val = float(extracted)
        gt_val = float(ground_truth)
        return 1.0 if abs(gt_val - sol_val) < 1e-7 else 0.0
    except ValueError:
        return 0.0

Function Signature

All reward functions must follow this signature:
@osmosis_reward
def my_reward_function(
    solution_str: str,      # The LLM output to evaluate
    ground_truth: str,      # The expected correct answer
    extra_info: dict = None,  # Additional context/metadata
    **kwargs                # Future extensibility
) -> float:                 # Return score (typically 0.0 to 1.0)
    pass

Parameters

solution_str: str

The output generated by the LLM that you want to evaluate. Example:
solution_str = "The answer is 42. #### 42"

ground_truth: str

The expected correct answer or reference solution. Example:
ground_truth = "42"

extra_info: dict

Optional dictionary containing additional metadata or context. Example:
extra_info = {
    "metadata": {
        "problem_type": "arithmetic",
        "difficulty": "easy"
    },
    "context": "This is a simple addition problem"
}

**kwargs

Captures any additional keyword arguments for future compatibility.
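For example, a function that declares **kwargs keeps working even if the platform later passes new arguments (the training_step name below is hypothetical, purely for illustration):
@osmosis_reward
def forward_compatible_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Ignores unknown keyword arguments instead of raising TypeError."""
    # kwargs absorbs any future arguments; a hypothetical example:
    # kwargs.get("training_step") simply returns None today.
    return 1.0 if solution_str.strip() == ground_truth.strip() else 0.0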

Return Value

Return a float value representing the score:
  • Typically between 0.0 (worst) and 1.0 (best)
  • Can use other ranges if appropriate for your use case
  • Should be deterministic (same inputs → same output)
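For instance, a minimal sketch of a reward that uses a symmetric -1.0 to 1.0 range instead (a penalty-style scheme; nothing requires this range, it is just one alternative):
@osmosis_reward
def signed_reward(solution_str: str, ground_truth: str, **kwargs) -> float:
    """Returns 1.0 for an exact match, -1.0 otherwise (custom range)."""
    return 1.0 if solution_str.strip() == ground_truth.strip() else -1.0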

Common Patterns

Exact Match

@osmosis_reward
def exact_match_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Returns 1.0 for exact match, 0.0 otherwise"""
    return 1.0 if solution_str.strip() == ground_truth.strip() else 0.0

Partial Credit

@osmosis_reward
def partial_credit_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Returns partial credit based on keyword matches"""
    keywords = ground_truth.lower().split()
    solution = solution_str.lower()

    matches = sum(1 for keyword in keywords if keyword in solution)
    return matches / len(keywords) if keywords else 0.0

Threshold-Based

from difflib import SequenceMatcher

@osmosis_reward
def similarity_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Returns score based on string similarity"""
    ratio = SequenceMatcher(None, solution_str, ground_truth).ratio()

    # Apply threshold: below 0.5 = 0.0, above 0.8 = 1.0
    if ratio < 0.5:
        return 0.0
    elif ratio > 0.8:
        return 1.0
    else:
        # Linear interpolation between thresholds
        return (ratio - 0.5) / 0.3

Multi-Criteria

import json

@osmosis_reward
def multi_criteria_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Evaluates multiple criteria and returns weighted average"""
    try:
        solution = json.loads(solution_str)
        expected = json.loads(ground_truth)

        scores = []

        # Criterion 1: Correctness (weight: 0.5)
        correctness = 1.0 if solution["answer"] == expected["answer"] else 0.0
        scores.append(correctness * 0.5)

        # Criterion 2: Explanation quality (weight: 0.3)
        explanation_length = len(solution.get("explanation", ""))
        explanation_score = min(explanation_length / 100, 1.0)
        scores.append(explanation_score * 0.3)

        # Criterion 3: Code validity (weight: 0.2)
        has_code = bool(solution.get("code"))
        scores.append(0.2 if has_code else 0.0)

        return sum(scores)
    except (json.JSONDecodeError, KeyError, TypeError):
        return 0.0

Error Handling

Always handle errors gracefully:
@osmosis_reward
def robust_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Reward function with comprehensive error handling"""
    try:
        # Validate inputs
        if not solution_str or not ground_truth:
            return 0.0

        # Process (compute_score stands in for your own scoring logic)
        result = compute_score(solution_str, ground_truth)

        # Validate output
        if not isinstance(result, (int, float)):
            return 0.0

        # Clamp to valid range
        return max(0.0, min(1.0, float(result)))

    except ValueError as e:
        print(f"Value error: {e}")
        return 0.0
    except Exception as e:
        print(f"Unexpected error: {e}")
        return 0.0

Testing Locally

Test your reward functions before pushing:
# test_rewards.py
from reward_fn.compute_reward import numbers_match_reward

def test_exact_match():
    score = numbers_match_reward("#### 42", "42")
    assert score == 1.0

def test_mismatch():
    score = numbers_match_reward("#### 43", "42")
    assert score == 0.0

def test_invalid_format():
    score = numbers_match_reward("no number here", "42")
    assert score == 0.0

if __name__ == "__main__":
    test_exact_match()
    test_mismatch()
    test_invalid_format()
    print("All tests passed!")
Run the test:
python test_rewards.py
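The same file also runs unchanged under pytest, if you prefer it (pytest discovers the plain test_* functions automatically):
pytest test_rewards.py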

Best Practices

1. Be Deterministic

Reward functions should always return the same score for the same inputs:
# Good - deterministic
@osmosis_reward
def deterministic_reward(solution_str: str, ground_truth: str, **kwargs) -> float:
    # Guard against empty ground_truth to avoid ZeroDivisionError
    gt_chars = set(ground_truth)
    return len(set(solution_str) & gt_chars) / len(gt_chars) if gt_chars else 0.0

# Avoid - non-deterministic
import random
@osmosis_reward
def random_reward(solution_str: str, ground_truth: str, **kwargs) -> float:
    return random.random()  # Don't do this!

2. Normalize Scores

Keep scores in a consistent range:
@osmosis_reward
def normalized_reward(solution_str: str, ground_truth: str, **kwargs) -> float:
    # compute_raw_score and max_possible_score stand in for your own logic
    raw_score = compute_raw_score(solution_str, ground_truth)
    # Normalize to [0, 1]
    return max(0.0, min(1.0, raw_score / max_possible_score))

3. Document Scoring Logic

Clearly explain how scores are calculated:
@osmosis_reward
def documented_reward(solution_str: str, ground_truth: str, **kwargs) -> float:
    """
    Evaluates code quality based on multiple factors.

    Scoring breakdown:
    - Syntax correctness: 0.4
    - Functionality: 0.4
    - Code style: 0.2

    Returns:
        float: Score between 0.0 and 1.0
    """
    # Implementation
    pass

4. Use extra_info When Appropriate

Leverage the extra_info parameter for context:
@osmosis_reward
def context_aware_reward(
    solution_str: str,
    ground_truth: str,
    extra_info: dict = None,
    **kwargs
) -> float:
    """Uses metadata to adjust scoring thresholds"""
    # compute_base_score stands in for your own scoring logic
    base_score = compute_base_score(solution_str, ground_truth)

    # Adjust based on difficulty
    if extra_info and "difficulty" in extra_info.get("metadata", {}):
        difficulty = extra_info["metadata"]["difficulty"]
        if difficulty == "hard":
            # More lenient for hard problems; cap the boosted score at 1.0
            return min(base_score * 1.2, 1.0)

    return base_score

Next Steps