5  String Functions

6 Learning Objectives

By the end of this chapter, you will be able to:

  • Master fundamental string manipulation functions in R
  • Apply string functions to pharmaceutical data cleaning and processing
  • Use regular expressions for complex pattern matching in drug names and codes
  • Handle medication names, dosage forms, and clinical terminology
  • Clean and standardize pharmaceutical datasets using string operations

7 Introduction

String manipulation is crucial in pharmaceutical data analysis. You’ll frequently work with:

  • Drug names and generic/brand name mapping
  • Dosage forms and strengths
  • Medical terminology and coding systems (ICD, MedDRA, ATC codes)
  • Patient identifiers and study codes
  • Adverse event descriptions
  • Clinical trial data with text fields

8 Setup

# Load required packages
library(stringr)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(purrr)
library(tools)

# Suppress warnings for cleaner rendered output.
# NOTE(review): warn = -1 makes R ignore every warning for the rest of the
# session, including the warning() issued by the error handler in
# extract_dosage_safe() later in this chapter. The previous value is not
# saved or restored here.
options(warn = -1)

9 Basic String Functions

9.1 1. String Length and Basic Information

# Basic string functions with pharmaceutical examples
drug_names <- c("Aspirin", "Ibuprofen", "Acetaminophen", "Warfarin", "Metformin")

# Get string lengths
cat("String lengths:\n")
String lengths:
print(nchar(drug_names))
[1]  7  9 13  8  9
# Check if strings are empty
medical_terms <- c("Hypertension", "", "Diabetes", NA, "Pneumonia")
cat("\nEmpty string check:\n")

Empty string check:
print(nzchar(medical_terms))  # Returns FALSE for empty strings
[1]  TRUE FALSE  TRUE  TRUE  TRUE
# Handle NA values properly
cat("\nNA value check:\n")

NA value check:
print(is.na(medical_terms))
[1] FALSE FALSE FALSE  TRUE FALSE

9.2 2. Case Conversion

# Case conversion for drug names
brand_names <- c("Tylenol", "Advil", "Lipitor", "Nexium")

cat("Original names:", paste(brand_names, collapse = ", "), "\n")
Original names: Tylenol, Advil, Lipitor, Nexium 
cat("Uppercase:", paste(toupper(brand_names), collapse = ", "), "\n")
Uppercase: TYLENOL, ADVIL, LIPITOR, NEXIUM 
cat("Lowercase:", paste(tolower(brand_names), collapse = ", "), "\n")
Lowercase: tylenol, advil, lipitor, nexium 
cat("Title Case:", paste(toTitleCase(tolower(brand_names)), collapse = ", "), "\n")
Title Case: Tylenol, Advil, Lipitor, Nexium 

9.3 3. String Concatenation

# Combining drug information
drug_name <- c("Aspirin", "Ibuprofen", "Acetaminophen")
strength <- c("81mg", "200mg", "500mg")
form <- c("Tablet", "Capsule", "Tablet")

# Basic concatenation
cat("Basic concatenation:\n")
Basic concatenation:
print(paste(drug_name, strength, form))
[1] "Aspirin 81mg Tablet"        "Ibuprofen 200mg Capsule"   
[3] "Acetaminophen 500mg Tablet"
# Concatenation with custom separator
cat("\nWith custom separator:\n")

With custom separator:
print(paste(drug_name, strength, form, sep = " | "))
[1] "Aspirin | 81mg | Tablet"        "Ibuprofen | 200mg | Capsule"   
[3] "Acetaminophen | 500mg | Tablet"
# Concatenation without separator
cat("\nWithout separator:\n")

Without separator:
print(paste0(drug_name, "_", strength))
[1] "Aspirin_81mg"        "Ibuprofen_200mg"     "Acetaminophen_500mg"

10 Substring Operations

10.1 1. Extracting Substrings

# Working with NDC (National Drug Code) numbers
ndc_codes <- c("12345-678-90", "98765-432-10", "11111-222-33")

cat("NDC Codes:", paste(ndc_codes, collapse = ", "), "\n")
NDC Codes: 12345-678-90, 98765-432-10, 11111-222-33 
# Extract manufacturer code (first part)
cat("Manufacturer codes (base R):", paste(substr(ndc_codes, 1, 5), collapse = ", "), "\n")
Manufacturer codes (base R): 12345, 98765, 11111 
# Extract product code (middle part)
cat("Product codes:", paste(substr(ndc_codes, 7, 9), collapse = ", "), "\n")
Product codes: 678, 432, 222 
# Using stringr for more intuitive operations
cat("First 5 chars (stringr):", paste(str_sub(ndc_codes, 1, 5), collapse = ", "), "\n")
First 5 chars (stringr): 12345, 98765, 11111 
cat("Last 2 chars:", paste(str_sub(ndc_codes, -2, -1), collapse = ", "), "\n")
Last 2 chars: 90, 10, 33 

10.2 2. Finding and Replacing Text

# Standardizing drug names
messy_drug_names <- c("aspirin 81 mg", "IBUPROFEN-200MG", "Acetaminophen/500mg")

cat("Original messy names:\n")
Original messy names:
print(messy_drug_names)
[1] "aspirin 81 mg"       "IBUPROFEN-200MG"     "Acetaminophen/500mg"
# Replace hyphens and slashes with spaces
cleaned_names <- str_replace_all(messy_drug_names, "[-/]", " ")
cat("\nAfter replacing separators:\n")

After replacing separators:
print(cleaned_names)
[1] "aspirin 81 mg"       "IBUPROFEN 200MG"     "Acetaminophen 500mg"
# Remove extra whitespace
cat("\nAfter removing extra whitespace:\n")

After removing extra whitespace:
print(str_squish(cleaned_names))
[1] "aspirin 81 mg"       "IBUPROFEN 200MG"     "Acetaminophen 500mg"
# Find positions of patterns
drug_text <- "Patient took Aspirin 81mg twice daily"
cat("\nPosition of 'Aspirin' in text:\n")

Position of 'Aspirin' in text:
print(str_locate(drug_text, "Aspirin"))
     start end
[1,]    14  20
cat("\nPositions of all numbers:\n")

Positions of all numbers:
print(str_locate_all(drug_text, "[0-9]+"))
[[1]]
     start end
[1,]    22  23

11 Pattern Matching and Regular Expressions

11.1 1. Basic Pattern Detection

# Detect patterns in medication descriptions
medications <- c(
  "Aspirin 81mg tablet once daily",
  "Metformin 500mg twice daily",
  "Insulin injection as needed",
  "Lisinopril 10mg daily",
  "Warfarin 5mg as directed"
)

cat("Medications containing 'mg':\n")
Medications containing 'mg':
print(medications[str_detect(medications, "mg")])
[1] "Aspirin 81mg tablet once daily" "Metformin 500mg twice daily"   
[3] "Lisinopril 10mg daily"          "Warfarin 5mg as directed"      
cat("\nMedications starting with 'Aspirin':\n")

Medications starting with 'Aspirin':
print(medications[str_detect(medications, "^Aspirin")])
[1] "Aspirin 81mg tablet once daily"
cat("\nMedications ending with 'daily':\n")

Medications ending with 'daily':
print(medications[str_detect(medications, "daily$")])
[1] "Aspirin 81mg tablet once daily" "Metformin 500mg twice daily"   
[3] "Lisinopril 10mg daily"         

11.2 2. Advanced Regular Expressions

# Extract dosage information
dosage_pattern <- "[0-9]+\\.?[0-9]*mg"
extracted_dosages <- str_extract(medications, dosage_pattern)

cat("Extracted dosages:\n")
Extracted dosages:
for(i in seq_along(medications)) {
  cat(sprintf("%s -> %s\n", medications[i], 
              ifelse(is.na(extracted_dosages[i]), "No dosage found", extracted_dosages[i])))
}
Aspirin 81mg tablet once daily -> 81mg
Metformin 500mg twice daily -> 500mg
Insulin injection as needed -> No dosage found
Lisinopril 10mg daily -> 10mg
Warfarin 5mg as directed -> 5mg
# Complex pattern:  Extract drug name and dosage
drug_dosage_pattern <- "^([A-Za-z]+)\\s+([0-9]+\\.?[0-9]*mg)"
matches <- str_match(medications, drug_dosage_pattern)

cat("\nDrug name and dosage extraction:\n")

Drug name and dosage extraction:
for(i in seq_along(medications)) {
  if(! is.na(matches[i, 1])) {
    cat(sprintf("Drug: %s, Dosage:  %s\n", matches[i, 2], matches[i, 3]))
  } else {
    cat(sprintf("No match for:  %s\n", medications[i]))
  }
}
Drug: Aspirin, Dosage:  81mg
Drug: Metformin, Dosage:  500mg
No match for:  Insulin injection as needed
Drug: Lisinopril, Dosage:  10mg
Drug: Warfarin, Dosage:  5mg

12 Working with coded terms

12.1 1. ICD-10 Code Processing

# Working with ICD-10 diagnostic codes
icd_codes <- c("E11.9", "I10", "J44.1", "N18.6", "M79.3")
icd_descriptions <- c(
  "Type 2 diabetes mellitus without complications",
  "Essential hypertension",
  "Chronic obstructive pulmonary disease with acute exacerbation",
  "End stage renal disease",
  "Panniculitis, unspecified"
)

# Create a lookup function
# Build a named lookup vector for ICD-10 codes.
#
# Args:
#   codes: Vector of codes; becomes the names of the result.
#   descriptions: Vector of descriptions, parallel to `codes`.
# Returns:
#   `descriptions` with its names set to `codes`, so a description can be
#   retrieved by indexing with a code.
create_icd_lookup <- function(codes, descriptions) {
  names(descriptions) <- codes
  descriptions
}

icd_lookup <- create_icd_lookup(icd_codes, icd_descriptions)

# Function to get description from code
# Look up the description for one or more codes.
#
# Args:
#   code: Character vector of codes to look up.
#   lookup_table: Named vector whose names are codes and whose values are
#     descriptions (as built by create_icd_lookup()).
# Returns:
#   A vector the same length as `code` holding the matching description, or
#   "Code not found" where the lookup yields NA.
get_icd_description <- function(code, lookup_table) {
  matched <- lookup_table[code]
  # Indexing by an unknown name yields NA, which marks a missing code
  ifelse(!is.na(matched), matched, "Code not found")
}

# Test the function
patient_codes <- c("E11.9", "I10", "Z99.9")  # Last one doesn't exist
cat("ICD-10 Code Lookup Results:\n")
ICD-10 Code Lookup Results:
for(i in seq_along(patient_codes)) {
  cat(sprintf("%s:  %s\n", patient_codes[i], get_icd_description(patient_codes[i], icd_lookup)))
}
E11.9:  Type 2 diabetes mellitus without complications
I10:  Essential hypertension
Z99.9:  Code not found

12.2 2. ATC Code Analysis

# Anatomical Therapeutic Chemical (ATC) codes
atc_codes <- c("A02BC01", "C09AA02", "A10BA02", "B01AA03", "N02BA01")
drug_names_atc <- c("Omeprazole", "Enalapril", "Metformin", "Warfarin", "Aspirin")

# Extract anatomical group (first character)
anatomical_groups <- str_sub(atc_codes, 1, 1)

# Create descriptive labels
anatomical_labels <- c(
  "A" = "Alimentary tract and metabolism",
  "B" = "Blood and blood forming organs",
  "C" = "Cardiovascular system",
  "D" = "Dermatologicals",
  "G" = "Genito urinary system and sex hormones",
  "H" = "Systemic hormonal preparations",
  "J" = "Antiinfectives for systemic use",
  "L" = "Antineoplastic and immunomodulating agents",
  "M" = "Musculo-skeletal system",
  "N" = "Nervous system",
  "P" = "Antiparasitic products",
  "R" = "Respiratory system",
  "S" = "Sensory organs",
  "V" = "Various"
)

# Map codes to descriptions
atc_mapping <- data.frame(
  Drug = drug_names_atc,
  ATC_Code = atc_codes,
  Anatomical_Group = anatomical_labels[anatomical_groups],
  stringsAsFactors = FALSE
)

cat("ATC Code Analysis:\n")
ATC Code Analysis:
print(atc_mapping)
        Drug ATC_Code                Anatomical_Group
1 Omeprazole  A02BC01 Alimentary tract and metabolism
2  Enalapril  C09AA02           Cardiovascular system
3  Metformin  A10BA02 Alimentary tract and metabolism
4   Warfarin  B01AA03  Blood and blood forming organs
5    Aspirin  N02BA01                  Nervous system

13 Data Cleaning Applications

13.1 1. Standardizing Drug Names

# Real-world messy drug data
messy_drug_data <- c(
  "ASPIRIN 81 MG TABLET",
  "aspirin 81mg tab",
  "Aspirin, 81 mg, tablet",
  "ASA 81MG TABS",
  "aspirin_81_mg_tablet",
  "Aspirin (81mg) - Tablet"
)

# Function to standardize drug names
# Normalize free-text drug descriptions to a common lowercase format.
#
# Args:
#   drug_names: Character vector of drug descriptions.
# Returns:
#   Character vector of the same length, transformed by these steps in order:
#   lowercase; underscores, commas, hyphens and parentheses turned into
#   spaces; "tab"/"tabs" expanded to "tablet"; "asa" expanded to "aspirin";
#   whitespace squished; and the gap between a number and "mg" removed.
standardize_drug_name <- function(drug_names) {
  drug_names |>
    tolower() |>
    # Separators become spaces so the word-boundary patterns below can match
    str_replace_all("[_,\\-()]+", " ") |>
    str_replace_all("\\btab[s]?\\b", "tablet") |>
    str_replace_all("\\basa\\b", "aspirin") |>
    str_squish() |>
    # "81 mg" -> "81mg"
    str_replace_all("([0-9]+)\\s*mg", "\\1mg")
}

standardized_names <- standardize_drug_name(messy_drug_data)

cat("Drug Name Standardization Results:\n")
Drug Name Standardization Results:
for(i in seq_along(messy_drug_data)) {
  cat(sprintf("Original: %s\n", messy_drug_data[i]))
  cat(sprintf("Standardized: %s\n\n", standardized_names[i]))
}
Original: ASPIRIN 81 MG TABLET
Standardized: aspirin 81mg tablet

Original: aspirin 81mg tab
Standardized: aspirin 81mg tablet

Original: Aspirin, 81 mg, tablet
Standardized: aspirin 81mg tablet

Original: ASA 81MG TABS
Standardized: aspirin 81mg tablet

Original: aspirin_81_mg_tablet
Standardized: aspirin 81mg tablet

Original: Aspirin (81mg) - Tablet
Standardized: aspirin 81mg tablet

13.2 2. Parsing Dosage Information

# Extract structured information from prescription strings
prescriptions <- c(
  "Metformin 500mg twice daily with meals",
  "Lisinopril 10mg once daily in morning",
  "Warfarin 5mg daily as directed by INR",
  "Aspirin 81mg once daily for cardioprotection",
  "Insulin 10 units subcutaneous before meals"
)

# Function to parse prescription information
# Parse free-text prescriptions into structured fields.
#
# Args:
#   prescription_text: Character vector of prescription strings such as
#     "Metformin 500mg twice daily with meals".
# Returns:
#   A data.frame with one row per input string and the columns Drug, Dosage,
#   Frequency and Instructions. Drug and Dosage are NA when nothing matches;
#   Frequency falls back to "As directed" and Instructions to
#   "No special instructions".
parse_prescription <- function(prescription_text) {
  # Drug name is assumed to be the leading run of letters
  drug_name <- str_extract(prescription_text, "^[A-Za-z]+")

  # Number (optionally decimal) followed by a unit
  dosage <- str_extract(
    prescription_text,
    "[0-9]+\\.?[0-9]*\\s?(mg|units|mcg|g)"
  )

  # case_when() returns the first branch that matches, so the specific
  # frequencies must be tested before the generic "once|daily" branch;
  # otherwise "three times daily" would be labelled "Once daily".
  frequency <- case_when(
    str_detect(prescription_text, "twice|bid") ~ "Twice daily",
    str_detect(prescription_text, "three times|tid") ~ "Three times daily",
    str_detect(prescription_text, "four times|qid") ~ "Four times daily",
    str_detect(prescription_text, "once|daily") ~ "Once daily",
    TRUE ~ "As directed"
  )

  # Administration instructions; the first matching phrase wins
  special_instructions <- case_when(
    str_detect(prescription_text, "with meals") ~ "With meals",
    str_detect(prescription_text, "before meals") ~ "Before meals",
    str_detect(prescription_text, "morning") ~ "In morning",
    str_detect(prescription_text, "bedtime") ~ "At bedtime",
    TRUE ~ "No special instructions"
  )

  data.frame(
    Drug = drug_name,
    Dosage = dosage,
    Frequency = frequency,
    Instructions = special_instructions,
    stringsAsFactors = FALSE
  )
}

# Parse all prescriptions
prescription_data <- map_dfr(prescriptions, parse_prescription)

cat("Parsed Prescription Information:\n")
Parsed Prescription Information:
print(prescription_data)
        Drug   Dosage   Frequency            Instructions
1  Metformin    500mg Twice daily              With meals
2 Lisinopril     10mg  Once daily              In morning
3   Warfarin      5mg  Once daily No special instructions
4    Aspirin     81mg  Once daily No special instructions
5    Insulin 10 units As directed            Before meals

14 Advanced String Operations

14.1 1. Working with Patient Identifiers

# Function to generate patient IDs
# Build patient IDs of the form <site_code>-P-<number>, e.g. "NYC-P-0001".
#
# Args:
#   site_code: Site code(s) placed in front of "-P-".
#   patient_number: Patient number(s); left-padded with zeros to width 4.
# Returns:
#   Character vector of patient IDs.
generate_patient_id <- function(site_code, patient_number) {
  paste0(
    site_code,
    "-P-",
    str_pad(patient_number, width = 4, side = "left", pad = "0")
  )
}

# Generate some patient IDs
site_codes <- c("NYC", "LAX", "CHI")
patient_numbers <- 1:3

# Create patient IDs for demonstration: every site is paired with every
# patient number, with all numbers for one site listed before the next site.
# generate_patient_id() is built from vectorized calls, so a single call on
# the expanded vectors replaces a nested loop that would grow the result
# with c() on every iteration.
sample_ids <- generate_patient_id(
  rep(site_codes, each = length(patient_numbers)),
  rep(patient_numbers, times = length(site_codes))
)

cat("Generated Patient IDs:\n")
Generated Patient IDs:
print(sample_ids[1:6])  # Show first 6
[1] "NYC-P-0001" "NYC-P-0002" "NYC-P-0003" "LAX-P-0001" "LAX-P-0002"
[6] "LAX-P-0003"
# Function to validate patient ID format
# Check whether patient IDs follow the expected format: three uppercase
# letters, "-P-", then exactly four digits (e.g. "NYC-P-0001").
#
# Args:
#   patient_id: Character vector of IDs to check.
# Returns:
#   Logical vector, TRUE where the whole string matches the format.
validate_patient_id <- function(patient_id) {
  # ^ and $ anchor the match so extra leading/trailing characters fail
  str_detect(patient_id, "^[A-Z]{3}-P-[0-9]{4}$")
}

# Test validation
test_ids <- c("NYC-P-0001", "INVALID-ID", "LAX-P-99", "CHI-P-0123")
cat("\nValidation Results:\n")

Validation Results:
for(i in seq_along(test_ids)) {
  cat(sprintf("%s: %s\n", test_ids[i], 
              ifelse(validate_patient_id(test_ids[i]), "Valid", "Invalid")))
}
NYC-P-0001: Valid
INVALID-ID: Invalid
LAX-P-99: Invalid
CHI-P-0123: Valid

14.2 2. Processing Adverse Event Reports

# Working with MedDRA terms
adverse_events <- c(
  "Nausea and vomiting",
  "Severe headache",
  "Skin rash and itching",
  "Diarrhea, mild",
  "Fatigue and weakness",
  "Allergic reaction - hives"
)

# Function to standardize adverse event terms
# Standardize adverse event terms for consistent reporting.
#
# Args:
#   ae_terms: Character vector of adverse event descriptions.
# Returns:
#   Character vector of the same length in title case, with "and"/"of" kept
#   lowercase, severity words (mild, moderate, severe, serious) removed
#   together with the comma or dash that attaches them, free-standing dashes
#   turned into a single space, and whitespace squished.
standardize_ae_terms <- function(ae_terms) {
  # Convert to title case
  clean_terms <- str_to_title(ae_terms)

  # Keep small connecting words lowercase
  clean_terms <- str_replace_all(clean_terms, "\\bAnd\\b", "and")
  clean_terms <- str_replace_all(clean_terms, "\\bOf\\b", "of")

  # Remove severity indicators wherever they appear. The optional comma/dash
  # on either side covers "Severe headache", "Diarrhea, mild" and
  # "Headache - severe"; the trailing \\b keeps words such as "Mildew" intact.
  # Replacing with a space (not "") prevents neighbouring words from fusing.
  severity_pattern <- regex(
    "\\s*[,-]?\\s*\\b(mild|moderate|severe|serious)\\b\\s*[,-]?\\s*",
    ignore_case = TRUE
  )
  clean_terms <- str_replace_all(clean_terms, severity_pattern, " ")

  # A dash surrounded by whitespace is a separator ("Reaction - Hives");
  # hyphens inside a word are left alone.
  clean_terms <- str_replace_all(clean_terms, "\\s+-\\s+", " ")

  # Remove extra whitespace left behind by the replacements
  str_squish(clean_terms)
}

standardized_ae <- standardize_ae_terms(adverse_events)

cat("Adverse Event Standardization:\n")
Adverse Event Standardization:
for(i in seq_along(adverse_events)) {
  cat(sprintf("Original: %s\n", adverse_events[i]))
  cat(sprintf("Standardized: %s\n\n", standardized_ae[i]))
}
Original: Nausea and vomiting
Standardized: Nausea and Vomiting

Original: Severe headache
Standardized: Headache

Original: Skin rash and itching
Standardized: Skin Rash and Itching

Original: Diarrhea, mild
Standardized: Diarrhea, Mild

Original: Fatigue and weakness
Standardized: Fatigue and Weakness

Original: Allergic reaction - hives
Standardized: Allergic ReactionHives

14.3 3. Clinical Trial Data Processing Example

library(purrr)
# Comprehensive example:  Processing clinical trial medication data
clinical_trial_data <- data.frame(
  subject_id = c("001-001", "001-002", "001-003"),
  visit = c("Baseline", "Week 4", "Week 8"),
  concomitant_medications = c(
    "Metformin 500mg BID, Lisinopril 10mg QD",
    "Metformin 500mg BID, Lisinopril 10mg QD, Aspirin 81mg QD",
    "Metformin 1000mg BID, Lisinopril 20mg QD"
  ),
  stringsAsFactors = FALSE
)

# Function to parse medication strings into structured data
# Parse a comma-separated medication string into one row per medication.
#
# Args:
#   med_string: A single string such as
#     "Metformin 500mg BID, Lisinopril 10mg QD".
#   subject_id: Subject identifier attached to every returned row.
#   visit: Visit label attached to every returned row.
# Returns:
#   A data.frame with the columns drug_name, dosage, frequency, subject_id
#   and visit. drug_name and dosage are NA when their pattern does not match.
parse_medications <- function(med_string, subject_id, visit) {
  # Split by comma to get individual medications
  individual_meds <- str_split(med_string, ",\\s*")[[1]]

  # Extract information for each medication
  med_data <- map_dfr(individual_meds, function(med) {
    # Drug name: leading letters/spaces that are followed by whitespace and a
    # digit. The look-ahead must be written "(?=" with no space; "(? =" is
    # not valid regex syntax.
    drug_name <- str_extract(med, "^[A-Za-z\\s]+(?=\\s+[0-9])")
    drug_name <- str_trim(drug_name)

    # Extract dosage
    dosage <- str_extract(med, "[0-9]+\\.?[0-9]*\\s*(mg|mcg|g|units)")

    # case_when() returns the first branch that matches, so the specific
    # frequencies are tested before the generic "daily" branch; otherwise
    # "three times daily" would be labelled "Once daily".
    frequency <- case_when(
      str_detect(med, "BID|twice") ~ "Twice daily",
      str_detect(med, "TID|three") ~ "Three times daily",
      str_detect(med, "QID|four") ~ "Four times daily",
      str_detect(med, "QD|once|daily") ~ "Once daily",
      TRUE ~ "As directed"
    )

    data.frame(
      drug_name = drug_name,
      dosage = dosage,
      frequency = frequency,
      stringsAsFactors = FALSE
    )
  })

  # Add subject and visit information
  med_data$subject_id <- subject_id
  med_data$visit <- visit

  med_data
}

# Alternative function that's more robust (doesn't rely on lookahead)
# Parse a comma-separated medication string into one row per medication,
# taking the drug name as everything before the first digit.
#
# Args:
#   med_string: A single string such as
#     "Metformin 500mg BID, Lisinopril 10mg QD".
#   subject_id: Subject identifier attached to every returned row.
#   visit: Visit label attached to every returned row.
# Returns:
#   A data.frame with the columns drug_name, dosage, frequency, subject_id
#   and visit. dosage is NA when no dosage pattern matches.
parse_medications_robust <- function(med_string, subject_id, visit) {
  # Split by comma to get individual medications
  individual_meds <- str_split(med_string, ",\\s*")[[1]]

  # Extract information for each medication
  med_data <- map_dfr(individual_meds, function(med) {
    # Method 1: take everything before the first digit as the drug name.
    # For an entry with no digits this is the whole entry.
    drug_name <- str_extract(med, "^[^0-9]+")
    drug_name <- str_trim(drug_name)

    # Method 2: Alternative approach using str_match
    # match <- str_match(med, "^([A-Za-z\\s]+)\\s+([0-9]+\\.?[0-9]*\\s*(mg|mcg|g|units))")
    # drug_name <- str_trim(match[, 2])
    # dosage <- match[, 3]

    # Extract dosage
    dosage <- str_extract(med, "[0-9]+\\.?[0-9]*\\s*(mg|mcg|g|units)")

    # case_when() returns the first branch that matches, so the specific
    # frequencies are tested before the generic "daily" branch; otherwise
    # "three times daily" would be labelled "Once daily".
    frequency <- case_when(
      str_detect(med, "BID|twice") ~ "Twice daily",
      str_detect(med, "TID|three") ~ "Three times daily",
      str_detect(med, "QID|four") ~ "Four times daily",
      str_detect(med, "QD|once|daily") ~ "Once daily",
      TRUE ~ "As directed"
    )

    data.frame(
      drug_name = drug_name,
      dosage = dosage,
      frequency = frequency,
      stringsAsFactors = FALSE
    )
  })

  # Add subject and visit information
  med_data$subject_id <- subject_id
  med_data$visit <- visit

  med_data
}

# Process all medication data using the robust function
processed_meds <- pmap_dfr(
  list(
    clinical_trial_data$concomitant_medications,
    clinical_trial_data$subject_id,
    clinical_trial_data$visit
  ),
  parse_medications_robust
)

# Display the structured data
cat("Processed Clinical Trial Medication Data:\n")
Processed Clinical Trial Medication Data:
print(processed_meds[, c("subject_id", "visit", "drug_name", "dosage", "frequency")])
  subject_id    visit  drug_name dosage   frequency
1    001-001 Baseline  Metformin  500mg Twice daily
2    001-001 Baseline Lisinopril   10mg  Once daily
3    001-002   Week 4  Metformin  500mg Twice daily
4    001-002   Week 4 Lisinopril   10mg  Once daily
5    001-002   Week 4    Aspirin   81mg  Once daily
6    001-003   Week 8  Metformin 1000mg Twice daily
7    001-003   Week 8 Lisinopril   20mg  Once daily

15 Performance Tips and Best Practices

15.1 1. Efficient String Operations

# Create a moderately sized dataset for demonstration
large_drug_list <- rep(c("Aspirin", "Ibuprofen", "Acetaminophen"), 1000)

# Vectorized operations are more efficient
cat("Efficient approach - vectorized operation:\n")
Efficient approach - vectorized operation:
start_time <- Sys.time()
drug_doses_vectorized <- paste(large_drug_list, "100mg")
end_time <- Sys.time()
cat(sprintf("Time taken: %s seconds\n", round(end_time - start_time, 4)))
Time taken: 0.001 seconds
# Avoid loops for simple operations
cat("\nLess efficient approach - loop-based:\n")

Less efficient approach - loop-based:
start_time <- Sys.time()
drug_doses_loop <- character(length(large_drug_list))
for (i in seq_along(large_drug_list)) {
  drug_doses_loop[i] <- paste(large_drug_list[i], "100mg")
}
end_time <- Sys.time()
cat(sprintf("Time taken: %s seconds\n", round(end_time - start_time, 4)))
Time taken: 0.0093 seconds
cat("\nFirst 5 results are identical:")

First 5 results are identical:
cat(sprintf("Vectorized: %s\n", paste(drug_doses_vectorized[1:5], collapse = ", ")))
Vectorized: Aspirin 100mg, Ibuprofen 100mg, Acetaminophen 100mg, Aspirin 100mg, Ibuprofen 100mg
cat(sprintf("Loop-based: %s\n", paste(drug_doses_loop[1:5], collapse = ", ")))
Loop-based: Aspirin 100mg, Ibuprofen 100mg, Acetaminophen 100mg, Aspirin 100mg, Ibuprofen 100mg

15.2 2. Error Handling in String Operations

# Robust function for extracting dosage information
# Extract a standardized dosage (e.g. "500mg") from medication text without
# letting bad input abort the caller.
#
# Args:
#   medication_string: Medication description(s), coerced with
#     as.character(). May be a single value or a vector.
# Returns:
#   A character vector with one element per input element: the lower-cased
#   dosage with its first run of whitespace removed, "No dosage found" when
#   no dosage pattern matches, or NA where the input is NA. NULL input
#   yields a single NA. If processing raises an error, a warning is issued
#   and "Error in processing" is returned.
extract_dosage_safe <- function(medication_string) {
  tryCatch({
    # NULL has no elements to parse
    if (is.null(medication_string)) {
      return(NA_character_)
    }

    # Convert to character if not already
    med_str <- as.character(medication_string)

    # Number (optionally decimal), optional whitespace, then a unit
    dosage_pattern <- "[0-9]+\\.?[0-9]*\\s*(mg|mcg|g|units)"
    dosage <- str_extract(med_str, regex(dosage_pattern, ignore_case = TRUE))

    # Standardized format: "500 MG" -> "500mg"
    result <- str_to_lower(str_replace(dosage, "\\s+", ""))

    # Element-wise handling replaces the scalar `||`/`if` checks, which fail
    # on zero-length input and on vectors longer than one.
    result[is.na(dosage)] <- "No dosage found"
    # Missing input stays missing rather than being reported as "not found"
    result[is.na(med_str)] <- NA_character_

    result
  }, error = function(e) {
    warning(
      paste(
        "Error processing medication string:",
        paste(medication_string, collapse = ", "),
        "-",
        conditionMessage(e)
      ),
      call. = FALSE
    )
    "Error in processing"
  })
}

# Test with various inputs
test_medications <- c(
  "Aspirin 81mg",
  "Invalid format",
  NA,
  "Metformin 500 MG twice daily",
  ""
)

cat("Safe Dosage Extraction Results:\n")
Safe Dosage Extraction Results:
for(i in seq_along(test_medications)) {
  result <- extract_dosage_safe(test_medications[i])
  cat(sprintf("Input: '%s' -> Output: '%s'\n", 
              ifelse(is.na(test_medications[i]), "NA", test_medications[i]), 
              result))
}
Input: 'Aspirin 81mg' -> Output: '81mg'
Input: 'Invalid format' -> Output: 'No dosage found'
Input: 'NA' -> Output: 'NA'
Input: 'Metformin 500 MG twice daily' -> Output: '500mg'
Input: '' -> Output: 'No dosage found'

16 Summary and Key Takeaways

16.1 Essential String Functions for Pharmaceutical Analysis

The key functions you should master include:

  1. Basic Operations: nchar(), toupper(), tolower(), paste(), paste0()
  2. Substring Operations: substr(), str_sub(), str_extract()
  3. Pattern Matching: str_detect(), str_locate(), str_match()
  4. Text Replacement: str_replace(), str_replace_all(), str_remove()
  5. Text Cleaning: str_squish(), str_trim()

16.2 Best Practices

# Create a summary of best practices
best_practices <- data.frame(
  Practice = c(
    "Standardization",
    "Error Handling", 
    "Vectorization",
    "Regular Expressions",
    "Documentation"
  ),
  Description = c(
    "Always clean and standardize data before analysis",
    "Use tryCatch() for robust string processing",
    "Use vectorized operations for better performance", 
    "Learn regex patterns for complex text matching",
    "Document your string processing logic clearly"
  ),
  Example = c(
    "toupper(), str_squish()",
    "tryCatch(), is.na() checks",
    "str_replace_all() vs loops", 
    "str_extract() with patterns",
    "Comments and function names"
  ),
  stringsAsFactors = FALSE
)

cat("String Processing Best Practices:\n")
String Processing Best Practices:
print(best_practices)
             Practice                                       Description
1     Standardization Always clean and standardize data before analysis
2      Error Handling       Use tryCatch() for robust string processing
3       Vectorization  Use vectorized operations for better performance
4 Regular Expressions    Learn regex patterns for complex text matching
5       Documentation     Document your string processing logic clearly
                      Example
1     toupper(), str_squish()
2  tryCatch(), is.na() checks
3  str_replace_all() vs loops
4 str_extract() with patterns
5 Comments and function names

16.3 Common Pharmaceutical Applications

  • Drug name standardization and mapping: Cleaning messy drug names from different sources
  • Dosage extraction and validation: Parsing prescription information
  • Medical coding: Working with ICD-10, ATC, MedDRA codes
  • Patient identifier processing: Generating and validating study IDs
  • Adverse event text mining: Standardizing AE terms and descriptions
  • Clinical trial data parsing: Processing complex medication strings

16.4 Next Steps

  1. Practice these techniques with your own pharmaceutical datasets
  2. Explore more advanced text mining packages (tm, quanteda, tidytext)
  3. Learn about natural language processing for clinical notes
  4. Study regulatory requirements for data standardization (CDISC standards)
  5. Implement automated data quality checks using string validation functions

This comprehensive guide provides the foundation for effective string manipulation in pharmaceutical R programming. Remember to adapt these techniques to your specific use cases and always validate your results with domain experts.

17 Comparing stringr and base R string functions

We’ll begin with a lookup table between the most important stringr functions and their base R equivalents.

library(stringr)
data_stringr_base_diff <- tibble::tribble(
  ~stringr,                                        ~base_r,
  "str_detect(string, pattern)",                   "grepl(pattern, x)",
  "str_dup(string, times)",                        "strrep(x, times)",
  "str_extract(string, pattern)",                  "regmatches(x, m = regexpr(pattern, text))",
  "str_extract_all(string, pattern)",              "regmatches(x, m = gregexpr(pattern, text))",
  "str_length(string)",                            "nchar(x)",
  "str_locate(string, pattern)",                   "regexpr(pattern, text)",
  "str_locate_all(string, pattern)",               "gregexpr(pattern, text)",
  "str_match(string, pattern)",                    "regmatches(x, m = regexec(pattern, text))",
  "str_order(string)",                             "order(...)",
  "str_replace(string, pattern, replacement)",     "sub(pattern, replacement, x)",
  "str_replace_all(string, pattern, replacement)", "gsub(pattern, replacement, x)",
  "str_sort(string)",                              "sort(x)",
  "str_split(string, pattern)",                    "strsplit(x, split)",
  "str_sub(string, start, end)",                   "substr(x, start, stop)",
  "str_subset(string, pattern)",                   "grep(pattern, x, value = TRUE)",
  "str_to_lower(string)",                          "tolower(x)",
  "str_to_title(string)",                          "tools::toTitleCase(text)",
  "str_to_upper(string)",                          "toupper(x)",
  "str_trim(string)",                              "trimws(x)",
  "str_which(string, pattern)",                    "grep(pattern, x)",
  "str_wrap(string)",                              "strwrap(x)"
)

# create MD table, arranged alphabetically by stringr fn name
data_stringr_base_diff |> 
  dplyr::mutate(dplyr::across(
      .cols = everything(),
      .fns = ~ paste0("`", .x, "`"))
  ) |> 
  dplyr::arrange(stringr) |> 
  dplyr::rename(`base R` = base_r) |> 
  gt::gt() |> 
  gt::fmt_markdown(columns = everything()) |> 
  gt::tab_options(column_labels.font.weight = "bold")
stringr base R
str_detect(string, pattern) grepl(pattern, x)
str_dup(string, times) strrep(x, times)
str_extract(string, pattern) regmatches(x, m = regexpr(pattern, text))
str_extract_all(string, pattern) regmatches(x, m = gregexpr(pattern, text))
str_length(string) nchar(x)
str_locate(string, pattern) regexpr(pattern, text)
str_locate_all(string, pattern) gregexpr(pattern, text)
str_match(string, pattern) regmatches(x, m = regexec(pattern, text))
str_order(string) order(...)
str_replace(string, pattern, replacement) sub(pattern, replacement, x)
str_replace_all(string, pattern, replacement) gsub(pattern, replacement, x)
str_sort(string) sort(x)
str_split(string, pattern) strsplit(x, split)
str_sub(string, start, end) substr(x, start, stop)
str_subset(string, pattern) grep(pattern, x, value = TRUE)
str_to_lower(string) tolower(x)
str_to_title(string) tools::toTitleCase(text)
str_to_upper(string) toupper(x)
str_trim(string) trimws(x)
str_which(string, pattern) grep(pattern, x)
str_wrap(string) strwrap(x)

Overall the main differences between base R and stringr are:

  1. stringr functions start with str_ prefix; base R string functions have no consistent naming scheme.

  2. The order of inputs is usually different between base R and stringr. In base R, the pattern to match usually comes first; in stringr, the string to manipulate always comes first. This makes stringr easier to use in pipes, and with lapply() or purrr::map().

  3. Functions in stringr tend to do less, where many of the string processing functions in base R have multiple purposes.

  4. The output and input of stringr functions have been carefully designed. For example, the output of str_locate() can be fed directly into str_sub(); the same is not true of regexpr() and substr().

  5. Base functions use arguments (like perl, fixed, and ignore.case) to control how the pattern is interpreted. To avoid dependence between arguments, stringr instead uses helper functions (like fixed(), regex(), and coll()).

Next we’ll walk through each of the functions, noting the similarities and important differences. These examples are adapted from the stringr documentation and here they are contrasted with the analogous base R operations.

library(dplyr)
# Read the adverse-event dataset from a SAS file and keep the three columns
# used below (presumably an SDTM AE domain, going by the file path - confirm).
# Each example that follows starts again from ae1 and overwrites ae2.
ae <- haven::read_sas("./data/sdtm/ae.sas7bdat")
ae1 <- ae |> 
  select (USUBJID,AEDECOD,AEBODSYS)

# SAS find() equivalent: str_detect() keeps rows whose AEDECOD matches the
# pattern

# Match "HER" anywhere in the string
ae2 <- ae1 |> 
  filter(str_detect(AEDECOD,"HER"))

ae2 <- ae1 |> 
  filter(str_detect(AEDECOD,"^HIATUS ")) # start of the string

ae2 <- ae1 |> 
  filter(str_detect(AEDECOD,"HERNIA$")) # end of the string

# SAS substr() equivalent: str_sub() extracts by character position
# (newvar1: characters 1-6, newvar2: characters 2-6, newvar3: from the third
# character from the end onwards); newvar4 is the string length

ae2 <- ae1 |> 
  mutate(newvar1=str_sub(AEDECOD,1,6),
         newvar2=str_sub(AEDECOD,2,6),
         newvar3=str_sub(AEDECOD,-3),
         newvar4=str_length(AEDECOD))

# SAS cat functions equivalent: str_c() (stringr) and paste() (base R) both
# join the two columns with "/" as the separator

ae2 <- ae1 |> 
  mutate(newvar1=str_c(AEDECOD,AEBODSYS,sep="/"),
         newvar2=paste(AEDECOD,AEBODSYS,sep="/"))

# SAS scan() equivalent: word() extracts the first and the second word of
# AEDECOD

ae2 <- ae1 |> 
  mutate(newvar1=word(AEDECOD,1),
         newvar2=word(AEDECOD,2))

# Case conversion: lower, upper, title and sentence case

ae2 <- ae1 |> 
  mutate(newvar1=str_to_lower(AEDECOD),
         newvar2=str_to_upper(AEDECOD),
         newvar3=str_to_title(AEDECOD),
         newvar4=str_to_sentence(AEDECOD))

# Whitespace handling: str_trim() vs. collapsing spaces vs. str_squish()

a <- "  this is    my String   "
# b: leading and trailing whitespace removed, inner spacing untouched
b <- str_trim(a)
# c: every run of spaces collapsed to one space, without trimming the ends.
# (Replacing " " with " " would leave the string unchanged.)
c <- str_replace_all(a, " +", " ")
# d: trimmed at both ends and inner whitespace collapsed
d <- str_squish(a)