This vignette provides practical recipes for integrating dsprrr modules with vitals evaluation. Each recipe is self-contained and demonstrates a specific pattern.

library(dsprrr)
#> 
#> Attaching package: 'dsprrr'
#> The following object is masked from 'package:methods':
#> 
#>     signature
library(vitals)
library(ellmer)
library(tibble)

Recipe 1: Quick Start

The fastest way to evaluate a dsprrr module with vitals:

# 1. Create a simple module
classifier <- signature("input -> label: enum('positive', 'negative')") |>
  module()

# 2. Prepare test data (vitals expects 'input' and 'target' columns)
test_data <- tibble(
  input = c(
    "I love this product!",
    "Terrible experience, waste of money"
  ),
  target = c("positive", "negative")
)

# 3. Create a vitals Task with dsprrr's helper
task <- as_vitals_task(
  module = classifier,
  dataset = test_data,
  scorer = detect_match(),
  name = "sentiment",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

# 4. Run evaluation
task$eval()
#>  Solving
#>  Solving [655ms]
#> 
#>  Scoring
#>  Scoring [78ms]
#> 

# 5. View results
task$get_samples()
#> # A tibble: 2 × 9
#>   input              target    id result       solver_chat score scorer_metadata
#>   <chr>              <chr>  <int> <list>       <list>      <ord> <list>         
#> 1 I love this produ… posit…     1 <named list> <Chat>      C     <named list>   
#> 2 Terrible experien… negat…     2 <named list> <Chat>      C     <named list>   
#> # ℹ 2 more variables: scorer_explanation <chr>, scorer <chr>

That’s it! The as_vitals_task() helper handles the integration details.
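
If you want to browse individual samples interactively, vitals also ships a log viewer. A minimal sketch, assuming your installed vitals exports vitals_view() and that it accepts the log directory used above (here tempdir()); check ?vitals_view if in doubt:

# Open the interactive log viewer for the logs written by task$eval().
# Assumption: vitals_view() takes the log directory as its first argument.
vitals_view(tempdir())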

Recipe 2: Train-Test Split Workflow

A common pattern: optimize on training data, evaluate on held-out test data.

# Full dataset with ground truth
qa_data <- tibble(
  question = c(
    "What is the capital of France?",
    "Who wrote Romeo and Juliet?",
    "What is 2 + 2?",
    "What color is the sky?",
    "Who painted the Mona Lisa?",
    "What is the largest planet?"
  ),
  target = c(
    "Paris",
    "Shakespeare",
    "4",
    "Blue",
    "Leonardo da Vinci",
    "Jupiter"
  )
)

# Split into train/test
set.seed(42)
train_idx <- sample(nrow(qa_data), 4)
trainset <- qa_data[train_idx, ]
testset <- qa_data[-train_idx, ]

# Create and optimize module
qa_module <- signature("question -> answer") |>
  module()

# Optimize with few-shot examples from training set
optimized <- compile(
  LabeledFewShot(k = 2L),
  qa_module,
  trainset = trainset
)

# Evaluate on held-out test set
eval_task <- as_vitals_task(
  module = optimized,
  dataset = testset,
  scorer = detect_match(),
  name = "qa-test",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

eval_task$eval()
#>  Solving
#>  Solving [483ms]
#> 
#>  Scoring
#>  Scoring [51ms]
#> 

# Held-out test accuracy
cat("Test accuracy:", mean(eval_task$get_samples()$score == "C"), "\n")
#> Test accuracy: 1
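
To confirm that the few-shot optimization actually helped, evaluate the unoptimized baseline on the same held-out set and compare. A sketch that reuses only the pieces defined above:

# Evaluate the unoptimized module on the same test set
baseline_task <- as_vitals_task(
  module = qa_module,
  dataset = testset,
  scorer = detect_match(),
  name = "qa-test-baseline",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)
baseline_task$eval()

cat(
  "Baseline accuracy:",
  mean(baseline_task$get_samples()$score == "C"),
  "\n"
)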

Recipe 3: Using Different Vitals Scorers

Vitals provides several scorers for different use cases:

Exact Match (detect_match)

Best for factual questions with precise answers:

factual <- signature("question -> answer") |> module()

task <- as_vitals_task(
  module = factual,
  dataset = tibble(
    question = c("What is 5 * 7?", "What year did WWII end?"),
    target = c("35", "1945")
  ),
  scorer = detect_match(),
  name = "factual-qa",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

task$eval()
#>  Solving
#>  Solving [308ms]
#> 
#>  Scoring
#>  Scoring [57ms]
#> 
task$get_samples()
#> # A tibble: 2 × 10
#>   question     target input    id result       solver_chat score scorer_metadata
#>   <chr>        <chr>  <chr> <int> <list>       <list>      <ord> <list>         
#> 1 What is 5 *… 35     What…     1 <named list> <Chat>      C     <named list>   
#> 2 What year d… 1945   What…     2 <named list> <Chat>      C     <named list>   
#> # ℹ 2 more variables: scorer_explanation <chr>, scorer <chr>

Model-Graded QA (model_graded_qa)

Best for open-ended questions where multiple phrasings are correct:

explainer <- signature("topic -> explanation") |>
  module(type = "chain_of_thought")

task <- as_vitals_task(
  module = explainer,
  dataset = tibble(
    topic = c("Why is the sky blue?", "How do plants make food?"),
    target = c(
      "Light scattering in atmosphere",
      "Photosynthesis using sunlight"
    )
  ),
  scorer = model_graded_qa(),
  name = "explanations",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

task$eval()
#>  Solving
#>  Solving [309ms]
#> 
#>  Scoring
#>  Scoring [305ms]
#> 
task$get_samples()
#> # A tibble: 2 × 10
#>   topic            target input    id result       solver_chat score scorer_chat
#>   <chr>            <chr>  <chr> <int> <list>       <list>      <ord> <list>     
#> 1 Why is the sky … Light… Why …     1 <named list> <Chat>      C     <Chat>     
#> 2 How do plants m… Photo… How …     2 <named list> <Chat>      C     <Chat>     
#> # ℹ 2 more variables: scorer_metadata <list>, scorer <chr>
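
Model-graded scoring keeps the grader conversation next to the solver conversation: the scorer_chat column above holds one ellmer Chat object per sample. When a grade looks surprising, printing that object shows the grading exchange. A quick sketch:

# Inspect the grader's conversation for the first sample
samples <- task$get_samples()
samples$scorer_chat[[1]]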

Substring Match (detect_includes)

Best for checking if specific text appears anywhere in output:

coder <- signature("task -> code") |> module()

task <- as_vitals_task(
  module = coder,
  dataset = tibble(
    task = c(
      "Write a function to add two numbers",
      "Create a loop that prints 1 to 5"
    ),
    target = c("function", "for")
  ),
  scorer = detect_includes(),
  name = "code-gen",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

task$eval()
#>  Solving
#>  Solving [308ms]
#> 
#>  Scoring
#>  Scoring [50ms]
#> 
task$get_samples()
#> # A tibble: 2 × 10
#>   task         target input    id result       solver_chat score scorer_metadata
#>   <chr>        <chr>  <chr> <int> <list>       <list>      <ord> <list>         
#> 1 Write a fun… funct… Writ…     1 <named list> <Chat>      I     <named list>   
#> 2 Create a lo… for    Crea…     2 <named list> <Chat>      C     <named list>   
#> # ℹ 2 more variables: scorer_explanation <chr>, scorer <chr>
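
Note that the first sample above scored I: detect_includes() looks for the literal target string ("function") in the output, and it was not found. The scorer_explanation column is the natural place to look when that happens:

# Review explanations for incorrectly scored samples
samples <- task$get_samples()
samples[samples$score == "I", c("task", "target", "scorer_explanation")]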

Recipe 4: Analyzing Evaluation Results

After running evaluation, dig into the results:

# Run an evaluation
sentiment <- signature(
  "text -> sentiment: enum('positive', 'negative', 'neutral')"
) |>
  module()

dataset <- tibble(
  text = c(
    "Best purchase ever!",
    "Complete garbage",
    "It's okay I guess",
    "Amazing quality!",
    "Never buying again"
  ),
  target = c("positive", "negative", "neutral", "positive", "negative")
)

task <- as_vitals_task(
  module = sentiment,
  dataset = dataset,
  scorer = detect_match(),
  name = "sentiment-analysis",
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

task$eval()
#>  Solving
#>  Solving [780ms]
#> 
#>  Scoring
#>  Scoring [109ms]
#> 

# Get detailed scores
scores <- task$get_samples()

# Find failures
failures <- scores[scores$score == "I", ]
cat("Failed on", nrow(failures), "of", nrow(scores), "examples\n")
#> Failed on 0 of 5 examples

# Examine what went wrong
if (nrow(failures) > 0) {
  cat("\nFailure analysis:\n")
  for (i in seq_len(nrow(failures))) {
    cat("Input:", failures$input[i], "\n")
    cat("Expected:", failures$target[i], "\n")
    cat("Got:", failures$answer[i], "\n\n")
  }
}

# Calculate accuracy
accuracy <- mean(scores$score == "C")
cat("Overall accuracy:", scales::percent(accuracy), "\n")
#> Overall accuracy: 100%
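
Beyond a single accuracy number, a per-label breakdown shows which categories the classifier struggles with. A sketch using base R on the samples tibble:

# Accuracy broken down by expected label
per_label <- tapply(scores$score == "C", scores$target, mean)
print(per_label)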

Recipe 5: Comparing Module Variants

Test different module configurations side-by-side:

# Same signature, different approaches
sig <- signature("question -> answer")

# Variant 1: Basic prediction
basic <- module(sig)

# Variant 2: Chain-of-thought reasoning
cot <- module(sig, type = "chain_of_thought")

# Test dataset
test_data <- tibble(
  question = c(
    "If a train travels 60 mph for 2 hours, how far does it go?",
    "What is 15% of 80?"
  ),
  target = c("120 miles", "12")
)

llm <- chat_openai(model = "gpt-4o-mini")

# Evaluate both
results <- list()
for (name in c("basic", "cot")) {
  mod <- if (name == "basic") basic else cot

  task <- as_vitals_task(
    module = mod,
    dataset = test_data,
    scorer = model_graded_qa(),
    name = paste0("math-", name),
    dir = tempdir(),
    .llm = llm
  )
  task$eval()

  results[[name]] <- mean(task$get_samples()$score == "C")
}
#>  Solving
#>  Solving [305ms]
#> 
#>  Scoring
#>  Scoring [192ms]
#> 
#>  Solving
#>  Solving [328ms]
#> 
#>  Scoring
#>  Scoring [198ms]
#> 

# Compare
cat("Basic accuracy:", scales::percent(results$basic), "\n")
#> Basic accuracy: 100%
cat("CoT accuracy:", scales::percent(results$cot), "\n")
#> CoT accuracy: 100%
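
With more than two variants, it helps to collect the accuracies into a small summary table and sort it. A sketch building on the results list above:

# Summarise variant accuracies in one tibble, best first
comparison <- tibble(
  variant = names(results),
  accuracy = unlist(results)
)
comparison[order(-comparison$accuracy), ]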

Recipe 6: Multiple Epochs for Confidence

Run multiple evaluation passes for more reliable metrics:

classifier <- signature(
  "text -> category: enum('tech', 'sports', 'politics')"
) |>
  module()

# Small dataset - need multiple epochs for confidence
test_data <- tibble(
  text = c(
    "New iPhone released today",
    "Lakers win championship",
    "Senate passes new bill"
  ),
  target = c("tech", "sports", "politics")
)

# Run 3 epochs (each example evaluated 3 times)
task <- as_vitals_task(
  module = classifier,
  dataset = test_data,
  scorer = detect_match(),
  name = "news-classification",
  epochs = 3L,
  dir = tempdir(),
  .llm = chat_openai(model = "gpt-4o-mini")
)

task$eval()
#>  Solving
#>  Solving [1.6s]
#> 
#>  Scoring
#>  Scoring [214ms]
#> 

# Aggregate scores across epochs
scores <- task$get_samples()
cat(
  "Total evaluations:",
  nrow(scores),
  "(3 epochs x",
  nrow(test_data),
  "examples)\n"
)
#> Total evaluations: 9 (3 epochs x 3 examples)
cat("Overall accuracy:", scales::percent(mean(scores$score == "C")), "\n")
#> Overall accuracy: 100%
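
Multiple epochs also let you see which examples are answered consistently and which flip between runs. A sketch that groups the samples by input:

# Proportion of epochs in which each example was scored correct
stability <- tapply(scores$score == "C", scores$input, mean)
print(stability)

# Examples that were not correct in every epoch deserve a closer look
names(stability)[stability < 1]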

Recipe 7: Custom Evaluation Pipeline

For complex evaluation logic, build a custom pipeline:

# Custom evaluation function: predict, then score by exact match
evaluate_module <- function(module, dataset, llm) {
  # Run predictions on every column except the ground-truth 'target'
  input_cols <- setdiff(names(dataset), "target")
  predictions <- run(
    module,
    !!!as.list(dataset[input_cols]),
    .llm = llm
  )

  # Combine with targets (show the first input column alongside the scores)
  results <- tibble(
    input = dataset[[input_cols[1]]],
    target = dataset$target,
    prediction = predictions[[1]] # First output column
  )

  # Score by exact string match (swap in any scoring logic you like here)
  results$correct <- results$prediction == results$target

  # Summary
  list(
    accuracy = mean(results$correct),
    n = nrow(results),
    failures = results[!results$correct, ],
    all = results
  )
}

# Use it with the sentiment module and dataset from Recipe 4
results <- evaluate_module(
  module = sentiment,
  dataset = dataset,
  llm = chat_openai(model = "gpt-4o-mini")
)

cat("Accuracy:", scales::percent(results$accuracy), "\n")
cat("Failures:", nrow(results$failures), "\n")

Summary

These recipes cover the most common integration patterns:

Recipe               Use When
Quick Start          Getting started, simple evaluation
Train-Test Split     Validating optimization results
Different Scorers    Matching scorer to task type
Analyzing Results    Understanding failures
Comparing Variants   A/B testing modules
Multiple Epochs      Small datasets, need confidence
Custom Pipeline      Complex evaluation logic

For more details on the underlying APIs, see vignette("vitals-integration").