Tutorial 3: Extracting Structured Data

In Tutorial 2, you built a classifier that returns a single value. But real extraction tasks need multiple fields: names and dates, sentiment and confidence, entities and relationships.

In this tutorial, you’ll extract complex, structured data from text.

Time: 25-30 minutes

What You’ll Build

An entity extractor that pulls structured information from news articles and emails.

Prerequisites

Completed Tutorial 2
OPENAI_API_KEY set in your environment

library(dsprrr)
#> 
#> Attaching package: 'dsprrr'
#> The following object is masked from 'package:stats':
#> 
#>     step
#> The following object is masked from 'package:methods':
#> 
#>     signature
library(ellmer)
library(tibble)

chat <- chat_openai()
#> Using model = "gpt-4.1".

Step 1: Multiple Output Fields

So far you’ve seen single outputs like -> answer or -> sentiment. Add more outputs with commas:

# String shorthand: the input is left of `->`, comma-separated outputs are on
# the right. `confidence: number` attaches a type to that output; `sentiment`
# is left untyped.
sig <- signature("text -> sentiment, confidence: number")

# Wrap the signature in a "predict" module so it can be passed to run().
extractor <- module(sig, type = "predict")

run(extractor, text = "This product is absolutely fantastic!", .llm = chat)
#> $sentiment
#> [1] "positive"
#> 
#> $confidence
#> [1] 0.98

You get back both sentiment and confidence in a structured result.

Try another:

run(extractor, text = "It was okay, nothing special.", .llm = chat)
#> $sentiment
#> [1] "neutral"
#> 
#> $confidence
#> [1] 0.85

Notice the confidence is lower for ambiguous text. Treat this number as the model's own self-reported estimate rather than a calibrated probability.

Step 2: Typed Multiple Outputs

Add types to each output field:

# Every output carries an explicit type: `sentiment` is limited to three
# labels via enum(...), `stars` is an int, and `summary` is a string.
sig <- signature(
  "review -> sentiment: enum('positive', 'negative', 'neutral'), stars: int, summary: string"
)

analyzer <- module(sig, type = "predict")

result <- run(
  analyzer,
  review = "I've been using this blender for 6 months now. It's incredibly powerful and easy to clean. The only downside is it's quite loud. Overall, I'm very happy with it.",
  .llm = chat
)

result
#> $sentiment
#> [1] "positive"
#> 
#> $stars
#> [1] 4
#> 
#> $summary
#> [1] "Powerful and easy-to-clean blender, but a bit loud."

You get sentiment, a star rating, and a summary—all typed correctly.

Step 3: Complex Structures with type_object()

For nested or complex data, use ellmer’s type system directly:

# Long form of signature(): a list of described inputs, an ellmer
# type_object() for the output, and free-text instructions.
sig <- signature(
  inputs = list(
    input("article", description = "News article to analyze")
  ),
  output_type = type_object(
    # The string given to type_string() is that field's description.
    headline = type_string("A concise headline"),
    sentiment = type_enum(values = c("positive", "negative", "neutral")),
    word_count = type_integer()
  ),
  instructions = "Analyze the news article."
)

article_analyzer <- module(sig, type = "predict")

Test it with a news snippet:

article <- "
Scientists at MIT announced a breakthrough in solar panel efficiency today.
The new panels can convert 47% of sunlight to electricity, nearly double
the current commercial standard. The technology uses a novel layered
approach that captures more of the light spectrum. Researchers expect
commercial applications within 3-5 years.
"

run(article_analyzer, article = article, .llm = chat)
#> $headline
#> [1] "MIT Scientists Achieve Breakthrough in Solar Panel Efficiency"
#> 
#> $sentiment
#> [1] "positive"
#> 
#> $word_count
#> [1] 54

Step 4: Arrays of Values

Extract lists of items with type_array():

# Each output field is an array of strings; the per-field description says
# which kind of entity belongs in it.
sig <- signature(
  inputs = list(
    input("text", description = "Text to extract entities from")
  ),
  output_type = type_object(
    people = type_array(type_string(), description = "Names of people mentioned"),
    organizations = type_array(type_string(), description = "Organizations mentioned"),
    locations = type_array(type_string(), description = "Places mentioned")
  ),
  instructions = "Extract named entities from the text."
)

entity_extractor <- module(sig, type = "predict")

Test with a news article:

news <- "
Apple CEO Tim Cook met with President Biden at the White House yesterday
to discuss manufacturing jobs. Cook announced that Apple will invest
$430 billion in the United States over the next five years, creating
20,000 new jobs. The meeting also included Treasury Secretary Janet Yellen
and Commerce Secretary Gina Raimondo.
"

result <- run(entity_extractor, text = news, .llm = chat)
result
#> $people
#> [1] "Tim Cook"        "President Biden" "Janet Yellen"    "Gina Raimondo"  
#> 
#> $organizations
#> [1] "Apple"       "White House" "Treasury"    "Commerce"   
#> 
#> $locations
#> [1] "United States"

Access the arrays directly:

result$people
#> [1] "Tim Cook"        "President Biden" "Janet Yellen"    "Gina Raimondo"
result$organizations
#> [1] "Apple"       "White House" "Treasury"    "Commerce"

Step 5: Nested Objects

For hierarchical data, nest type_object() calls:

# `sender` is itself a type_object(), so the result nests one level deeper
# (result$sender$name, result$sender$email). The remaining fields are flat:
# a string, an enum, an array of strings, and a boolean.
sig <- signature(
  inputs = list(
    input("email", description = "Email message to parse")
  ),
  output_type = type_object(
    sender = type_object(
      name = type_string(),
      email = type_string()
    ),
    subject = type_string(),
    priority = type_enum(values = c("low", "normal", "high", "urgent")),
    action_items = type_array(type_string()),
    requires_response = type_boolean()
  ),
  instructions = "Parse the email and extract key information."
)

email_parser <- module(sig, type = "predict")

Test with an email:

email <- "
From: Sarah Johnson <sarah.johnson@techcorp.com>
Subject: Q4 Budget Review - Action Required

Hi team,

Please review the attached Q4 budget proposal by Friday. We need to:
1. Confirm department allocations
2. Identify any cost-saving opportunities
3. Submit final numbers to finance

This is time-sensitive as the board meeting is next Monday.

Thanks,
Sarah
"

result <- run(email_parser, email = email, .llm = chat)
result
#> $sender
#> $sender$name
#> [1] "Sarah Johnson"
#> 
#> $sender$email
#> [1] "sarah.johnson@techcorp.com"
#> 
#> 
#> $subject
#> [1] "Q4 Budget Review - Action Required"
#> 
#> $priority
#> [1] "urgent"
#> 
#> $action_items
#> [1] "Review the attached Q4 budget proposal by Friday"
#> [2] "Confirm department allocations"                  
#> [3] "Identify any cost-saving opportunities"          
#> [4] "Submit final numbers to finance"                 
#> 
#> $requires_response
#> [1] TRUE

Access nested fields:

result$sender$name
#> [1] "Sarah Johnson"
result$action_items
#> [1] "Review the attached Q4 budget proposal by Friday"
#> [2] "Confirm department allocations"                  
#> [3] "Identify any cost-saving opportunities"          
#> [4] "Submit final numbers to finance"
result$priority
#> [1] "urgent"

Step 6: Building an Email Triage System

Let’s combine what you’ve learned into a practical system:

# Triage schema: two enum fields restrict the model to fixed label sets,
# alongside a one-sentence summary string and a boolean flag.
sig <- signature(
  inputs = list(
    input("email", description = "Email to triage")
  ),
  output_type = type_object(
    category = type_enum(
      values = c("meeting", "task", "fyi", "urgent", "spam"),
      description = "Email category"
    ),
    summary = type_string("One-sentence summary"),
    action_required = type_boolean(),
    suggested_response = type_enum(
      values = c("reply_now", "reply_later", "forward", "archive", "delete"),
      description = "Recommended action"
    )
  ),
  instructions = "Triage the email for inbox management."
)

triage <- module(sig, type = "predict")

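Process a batch of emails with run_dataset(), which returns the input tibble plus a result list-column holding one named list per email: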

emails <- tibble(
  id = 1:3,
  email = c(
    "Meeting tomorrow at 3pm to discuss Q1 results. Please confirm attendance.",
    "FYI - The office will be closed on Monday for the holiday.",
    "URGENT: Server down! Need immediate assistance to restore services."
  )
)

results <- run_dataset(triage, emails, .llm = chat)
results
#> # A tibble: 3 × 3
#>      id email                                                       result      
#>   <int> <chr>                                                       <list>      
#> 1     1 Meeting tomorrow at 3pm to discuss Q1 results. Please conf… <named list>
#> 2     2 FYI - The office will be closed on Monday for the holiday.  <named list>
#> 3     3 URGENT: Server down! Need immediate assistance to restore … <named list>

Step 7: Handling Optional Fields

Some fields might not always be present. Make them nullable:

# `date` is the optional field. ellmer type constructors default to
# `required = TRUE`, so it must be marked `required = FALSE` explicitly;
# otherwise the model is forced to supply a value even when the text contains
# no date. The field description and the instructions both ask for null so the
# model receives a single, consistent signal.
sig <- signature(
  inputs = list(
    input("text", description = "Text that may mention a date")
  ),
  output_type = type_object(
    has_date = type_boolean(),
    date = type_string(
      "Date in YYYY-MM-DD format, or null if no date mentioned",
      required = FALSE
    ),
    confidence = type_number()
  ),
  instructions = "Extract date information if present. Set date to null if no date."
)

date_extractor <- module(sig, type = "predict")

# One input that mentions a date and one that does not.
run(date_extractor, text = "Let's meet next Tuesday", .llm = chat)
run(date_extractor, text = "Great weather today!", .llm = chat)

What You Learned

In this tutorial, you:

Extracted multiple output fields with comma notation
Added types to each field
Built complex structures with type_object()
Extracted lists with type_array()
Created nested/hierarchical outputs
Built a practical email triage system
Handled optional fields

The Structure Advantage

Structured extraction is powerful because:

Guaranteed types: Numbers come back as numbers, not strings
Predictable shape: Your downstream code knows exactly what to expect
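Validation: The LLM must conform to your schema (this constrains the shape and types of the output, not the accuracy of the values, so still spot-check fields such as counts)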
Composability: Results plug directly into R data structures

Next Steps

Your extractor works, but can it be improved? Continue to:

Tutorial 4: Improving with Examples — Add demonstrations to improve accuracy
Quick Reference — All output types at a glance
How Optimization Works — Why examples improve LLM performance