4  Manipulating Vectors, Data Frames, and Lists

# Load essential packages for pharmaceutical data analysis
library(haven)    # For reading SAS datasets (SDTM, ADaM)
library(dplyr)    # For data manipulation
library(tidyr)    # For data reshaping
library(stringr)  # For string manipulation

5 Chapter 1: Vector Operations in Pharmaceutical Context

Vectors are the fundamental building blocks of R. In pharmaceutical programming, vectors are used to store patient IDs, treatment codes, lab values, and categorical variables like adverse event terms.

5.1 1.1 Creating Vectors - The Foundation

# Creating vectors - essential for pharmaceutical data
# Patient IDs (sequential numbering)
patient_ids <- 1:10
patient_ids_desc <- 10:1
patient_ids_range <- -5:10

# Using c() function for explicit vector creation
dose_levels <- c(1, 5, 10, 25, 50, 100)  # Dose escalation study
safety_population <- c(1:254)            # Safety population patient IDs

# Pharmaceutical example: Creating treatment codes
treatment_codes <- c("PLA", "LOW", "MED", "HIGH")

5.1.1 Understanding Vector Naming in Clinical Data

# Naming vectors is crucial for clinical data interpretation
dose_groups <- c(0, 5, 10, 25)
names(dose_groups) <- c("Placebo", "Low_Dose", "Medium_Dose", "High_Dose")
print(dose_groups)
    Placebo    Low_Dose Medium_Dose   High_Dose 
          0           5          10          25 
# Clinical example: Lab normal ranges
lab_ranges <- c(70, 100, 140)
names(lab_ranges) <- c("Glucose_Low", "Glucose_Normal", "Glucose_High")
print(lab_ranges)
   Glucose_Low Glucose_Normal   Glucose_High 
            70            100            140 

5.2 1.2 Vector Indexing - Accessing Patient Data

Vector indexing is critical when working with patient subsets, safety populations, or specific treatment arms.

# Sample adverse event severity scores
ae_severity <- c(1, 3, 5, 2, 4, 1, 3)

# Accessing specific patient data
ae_severity[2]                    # Second patient's AE severity
[1] 3
ae_severity[c(1, 3, 5)]          # Patients 1, 3, and 5
[1] 1 5 4
ae_severity[-2]                  # Exclude patient 2
[1] 1 5 2 4 1 3
ae_severity[-c(1, 3)]            # Exclude patients 1 and 3
[1] 3 2 4 1 3
# Important: Accessing non-existent indices
missing_patient <- ae_severity[10]  # Returns NA
class(missing_patient)              # Still numeric class
[1] "numeric"
# Pharmaceutical application: Safety population subset
safety_ids <- 1:100
efficacy_subset <- safety_ids[1:85]  # Efficacy population

5.3 1.3 Vector Modification - Updating Clinical Data

# Laboratory values that need updating
lab_values <- c(95, 87, 110, 92)

# Single value modification (data correction)
lab_values[2] <- 89  # Correcting second patient's value

# Multiple value updates (batch correction)
lab_values[c(1, 4)] <- c(96, 94)

# Extending vectors (new patient enrollment)
lab_values[8] <- lab_values[2]  # Copying value
lab_values[9] <- 105            # New patient value

# Clinical scenario: Updating multiple positions
visit_scores <- c(10, 15, 20, 25)
visit_scores[c(2, 5)] <- c(16, 30)  # Update visits 2 and 5

5.4 1.4 Arithmetic Operations - Statistical Calculations

# Biomarker values for statistical analysis
biomarker_values <- c(12.5, 15.3, 18.7, 14.2)

# Basic transformations common in clinical analysis
biomarker_values + 2.5          # Adding baseline adjustment
[1] 15.0 17.8 21.2 16.7
biomarker_values - 10           # Subtracting control value
[1] 2.5 5.3 8.7 4.2
biomarker_values * 1.2          # Scaling factor
[1] 15.00 18.36 22.44 17.04
biomarker_values / 2            # Half-life calculation
[1] 6.25 7.65 9.35 7.10
# Advanced arithmetic operations
biomarker_values %/% 2          # Integer division
[1] 6 7 9 7
biomarker_values %% 3           # Modulo operation
[1] 0.5 0.3 0.7 2.2
# Essential statistical functions for clinical data
min(biomarker_values)           # Minimum value
[1] 12.5
max(biomarker_values)           # Maximum value
[1] 18.7
median(biomarker_values)        # Median (robust to outliers)
[1] 14.75
mean(biomarker_values)          # Mean value
[1] 15.175
range(biomarker_values)         # Range (min, max)
[1] 12.5 18.7
var(biomarker_values)           # Variance
[1] 6.849167
sd(biomarker_values)            # Standard deviation
[1] 2.617091
quantile(biomarker_values)      # Quartiles
    0%    25%    50%    75%   100% 
12.500 13.775 14.750 16.150 18.700 
IQR(biomarker_values)           # Interquartile range
[1] 2.375
# Custom quantiles for clinical cutoffs
quantile(biomarker_values, probs = c(0.25, 0.5, 0.75, 0.95))
   25%    50%    75%    95% 
13.775 14.750 16.150 18.190 

5.5 1.5 The WHICH Function - Clinical Data Subsetting

The which() function is essential for identifying patients meeting specific clinical criteria.

# Patient vital signs data
systolic_bp <- c(120, 145, 135, 160, 110, 180, 125, 155)

# Identifying hypertensive patients (>140 mmHg)
hypertensive_logical <- systolic_bp > 140
hypertensive_positions <- which(systolic_bp > 140)  # Returns positions
hypertensive_values <- systolic_bp[which(systolic_bp > 140)]  # Returns values
hypertensive_values_alt <- systolic_bp[systolic_bp > 140]     # Alternative method

# Finding extreme values (safety signals)
min_bp <- min(systolic_bp)
min_position <- which(systolic_bp == min_bp)  # Position of minimum
min_value <- systolic_bp[which(systolic_bp == min_bp)]  # Actual minimum value

# Using which. min() and which.max() for efficiency
which.min(systolic_bp)  # Position only
[1] 5
which.max(systolic_bp)  # Position only
[1] 6
# Complex clinical criteria (combination conditions)
# Patients with moderate hypertension (140-160 mmHg)
moderate_hypertension <- which(systolic_bp > 140 & systolic_bp <= 160)
moderate_bp_values <- systolic_bp[moderate_hypertension]

# Patients with either low (<110) or high (>170) BP
extreme_bp_positions <- which(systolic_bp < 110 | systolic_bp > 170)
extreme_bp_values <- systolic_bp[extreme_bp_positions]

5.6 1.6 The REP Function - Creating Repetitive Clinical Data

The rep() function is valuable for creating balanced study designs and simulating clinical data.

# Creating balanced treatment allocation
treatment_sequence <- rep(1:4, times = 25)  # 25 patients per treatment (4 treatments)
placebo_only <- rep("Placebo", times = 50)  # 50 placebo patients

# Alternating treatment pattern
alternating_trt <- rep(c("Active", "Placebo"), times = 10)  # 10 cycles

# Creating patient identifiers with repetition
patient_labels <- rep("Patient", times = 100)
center_codes <- rep(c("Site_001", "Site_002", "Site_003"), times = c(30, 40, 50))

# Complex repetition patterns for study design
dose_levels <- rep(1:4, c(10, 15, 20, 25))  # Unequal allocation
treatment_blocks <- rep(1:3, each = 20)      # 20 patients per block

# Multi-level repetition (nested study design)
nested_design <- rep(1:4, each = 5, times = 6)  # 4 treatments, 5 reps, 6 cycles

5.7 1.7 The SEQ Function - Creating Clinical Sequences

The seq() function generates sequences critical for visit schedules, dose escalations, and time series analysis.

# Visit schedule creation (days from baseline)
visit_days <- seq(from = 0, to = 365, by = 28)  # Monthly visits for 1 year
dose_escalation <- seq(from = 1, to = 100, by = 10)  # Dose escalation steps

# Time-based sequences
hourly_timepoints <- seq(from = 0, to = 24, by = 2)  # PK sampling every 2 hours
weekly_visits <- seq(from = 7, to = 84, by = 7)      # Weekly visits for 12 weeks

# Creating equal intervals
pk_timepoints <- seq(from = 0, to = 24, length = 13)  # 13 timepoints over 24 hours
biomarker_schedule <- seq(from = 1, by = 14, length = 8)  # Bi-weekly for 16 weeks

# Combining sequences for complex study designs
baseline_visits <- seq(from = -14, to = 0, by = 7)     # Screening period
treatment_visits <- seq(from = 1, to = 168, by = 14)   # Treatment period
followup_visits <- seq(from = 182, to = 365, by = 28)  # Follow-up period

complete_schedule <- c(baseline_visits, treatment_visits, followup_visits)

5.8 1.8 Sequence Helper Functions

# Working with patient vectors
patient_vitals <- c(120, 135, 142, 128, 156, 118, 145, 139)

# seq_len() - creates sequence from 1 to n
patient_count <- length(patient_vitals)
patient_indices <- seq_len(patient_count)  # 1, 2, 3, ..., 8

# seq_along() - creates sequence matching vector length
visit_numbers <- seq_along(patient_vitals)  # Same as seq_len(length(x))

5.9 1.9 Missing Data Handling - Critical in Clinical Studies

# Clinical lab values with missing data (common in real studies)
hemoglobin_values <- c(12.5, NA, 13.2, 11.8, NA, 14.1)

# Detecting missing values
missing_pattern <- is.na(hemoglobin_values)  # Logical vector
complete_values <- hemoglobin_values[!is.na(hemoglobin_values)]  # Complete cases only
complete_data <- na.omit(hemoglobin_values)  # Alternative method

# Important: Never use == with NA
# hemoglobin_values == NA      # WRONG - always returns NA
# hemoglobin_values[hemoglobin_values == NA]  # WRONG

# Correct approach for missing data analysis
has_missing <- any(is.na(hemoglobin_values))
missing_count <- sum(is.na(hemoglobin_values))
complete_count <- sum(!is.na(hemoglobin_values))

5.10 1.10 Vector Element Membership - Clinical Criteria Checking

# Safety population and efficacy population patient IDs
safety_population <- 1:200
efficacy_population <- 1:150
per_protocol_population <- c(1:50, 55:120, 125:148)

# Checking patient membership in populations
patient_in_safety <- 175 %in% safety_population      # TRUE
patient_in_efficacy <- 175 %in% efficacy_population  # FALSE

# Population overlap analysis
safety_in_efficacy <- safety_population %in% efficacy_population
efficacy_in_pp <- efficacy_population %in% per_protocol_population

# Alternative using is.element()
element_check1 <- is.element(safety_population, efficacy_population)
element_check2 <- is.element(efficacy_population, per_protocol_population)

# Clinical application: Treatment compliance checking
compliant_patients <- c(1, 5, 8, 12, 15, 18, 22, 25, 30)
all_patients <- 1:50
compliance_status <- all_patients %in% compliant_patients

5.11 1.11 String Operations - Essential for Clinical Data

# Basic string output
print("Clinical Study Report")
[1] "Clinical Study Report"
# String concatenation for patient identifiers
patient_prefix <- "STUDY001-"
patient_numbers <- sprintf("%03d", 1:10)  # Zero-padded numbers
patient_ids <- paste(patient_prefix, patient_numbers, sep = "")

# Creating visit labels
visit_labels <- paste("Visit", 1:10, sep = "_")
timepoint_labels <- paste("Day", c(1, 7, 14, 28, 56), sep = "_")

# Handling vectors of different lengths
site_codes <- c("001", "002", "003")
patient_ranges <- 1:30
full_patient_ids <- paste(site_codes, patient_ranges, sep = "-")

# Collapsing for summary output
treatment_arms <- c("Placebo", "Low_Dose", "High_Dose")
treatment_summary <- paste(treatment_arms, collapse = ", ")

# paste0() - efficient concatenation
adverse_event_ids <- paste0("AE", sprintf("%04d", 1:100))
lab_test_codes <- paste0("LAB_", c("HEM", "CHEM", "URN"), "_", 1:5)

# Using cat() for formatted output
cat("Study Status:", "Ongoing", "\n")
Study Status: Ongoing 
cat("Enrolled Patients:", 156, "/", 200, "\n")
Enrolled Patients: 156 / 200 

5.12 1.12 Advanced String Manipulation for Clinical Data

# Character counting for data validation
max_adverse_event_length <- max(nchar(c("Headache", "Nausea", "Fatigue", "Dizziness")))

# Case conversion for standardization
raw_adverse_events <- c("headache", "NAUSEA", "Fatigue", "dizziness")
standardized_upper <- toupper(raw_adverse_events)
standardized_lower <- tolower(raw_adverse_events)

# Using casefold() for flexible case conversion
ae_upper <- casefold(raw_adverse_events, upper = TRUE)
ae_lower <- casefold(raw_adverse_events, upper = FALSE)

# Character translation for data cleaning
lab_values_dirty <- "O. 5mg/dL"  # O instead of 0
lab_values_clean <- chartr(old = "O", new = "0", x = lab_values_dirty)

# Sorting clinical terms
medical_terms <- c("Zyrtec", "Aspirin", "Benadryl", "Claritin")
terms_ascending <- sort(medical_terms, decreasing = FALSE)
terms_descending <- sort(medical_terms, decreasing = TRUE)

# Substring extraction for coding
study_id <- "PROTO-2024-ONCOLOGY-001"
protocol_year <- substr(study_id, start = 7, stop = 10)
therapeutic_area <- substr(study_id, start = 12, stop = 19)

5.13 1.13 Regular Expressions for Clinical Data Cleaning

# Load country data for site analysis
library(countrycode)
countries <- as.vector(countrycode::codelist$country.name.en)

# Countries starting with "A" (common site locations)
countries_a <- grep(pattern = "^A", x = countries, value = TRUE)
countries_a_positions <- grep(pattern = "^A", x = countries)

# Countries ending with "y"
countries_ending_y <- countries[grep(pattern = "y$", x = countries)]

# Multi-word country names (important for site coding)
multiword_countries <- countries[grep(pattern = "\\w\\s\\w", x = countries)]

# Countries ending with "e" OR "i"
countries_e_or_i <- countries[grep(pattern = "e$|i$", x = countries)]

# Countries containing "gin" (pattern matching)
countries_with_gin <- countries[grep(pattern = "gin", x = countries)]

# Metacharacter handling in clinical data
financial_data <- c("cost $1000", "budget", "expense $500")

# Escaping special characters
dollar_entries <- financial_data[grep(pattern = "\\$", x = financial_data)]

# Finding decimal numbers in lab results
lab_results <- c("12.5 mg/dL", "normal", "15.7 mg/dL")
decimal_results <- lab_results[grep(pattern = "\\.", x = lab_results)]

# Anchoring patterns
medication_codes <- c("MED123", "ADMIN456", "MED789")
med_codes_only <- medication_codes[grep("^MED", medication_codes)]
admin_codes_only <- medication_codes[grep("456$", medication_codes)]

# Character classes for validation
study_codes <- c("ST123", "STUDY", "ST456")
numeric_codes <- study_codes[grep("\\d", study_codes)]      # Contains digits
alpha_codes <- study_codes[grep("\\D", study_codes)]        # Contains non-digits

# Vowel detection in adverse event terms
ae_terms <- c("headache", "nausea", "rash")
vowel_containing <- ae_terms[grep("[aeiou]", ae_terms)]
numeric_containing <- ae_terms[grep("[0-9]", ae_terms)]

5.14 1.14 String Replacement - Data Standardization

# Clinical data standardization
report_text <- "Patient received Drug A, and Patient showed improvement with Drug A."

# Single replacement vs. global replacement
single_replace <- sub(pattern = "Drug A", replacement = "Investigational Product", x = report_text)
global_replace <- gsub(pattern = "Drug A", replacement = "Investigational Product", x = report_text)

# Removing whitespace for data cleaning
messy_data <- "Patient   ID:    12345   Status:   Active"
clean_data <- gsub(pattern = "\\s+", replacement = " ", x = messy_data)  # Multiple spaces to single
no_spaces <- gsub(pattern = "\\s", replacement = "", x = messy_data)      # Remove all spaces

# String splitting for data parsing
patient_info <- "LastName,FirstName,DOB,Gender"
info_components <- strsplit(x = patient_info, split = ",")

# Parsing structured clinical data
phone_numbers <- c("555-123-4567", "555-987-6543")
phone_parts <- strsplit(x = phone_numbers, split = "-")

6 Chapter 2: Data Frame Manipulation with dplyr - Clinical Data Analysis

Data frames are the primary data structure for clinical datasets (SDTM, ADaM). The dplyr package provides a grammar of data manipulation essential for clinical programming.

6.1 2.1 Creating Data Frames - Study Data Structure

# Basic clinical data frame
patient_demographics <- data.frame(
  USUBJID = paste0("STUDY001-", sprintf("%03d", 1:5)),
  AGE = c(45, 67, 23, 55, 34),
  SEX = c("M", "F", "M", "F", "M"),
  RACE = c("WHITE", "BLACK", "ASIAN", "WHITE", "OTHER"),
  ARM = c("Placebo", "Treatment", "Placebo", "Treatment", "Treatment"),
  RANDDT = as.Date(c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19"))
)

# More complex data frame with vectors
visit_dates <- seq(as.Date("2024-01-01"), length = 10, by = "weeks")
visit_numbers <- 1:10
study_status <- rep(c("Ongoing", "Completed"), each = 5)

visit_schedule <- data.frame(
  VISITNUM = visit_numbers,
  VISITDT = visit_dates,
  STATUS = study_status,
  stringsAsFactors = FALSE  # Important for character data
)

# Converting matrix to data frame (rare but useful)
lab_matrix <- matrix(data = rnorm(50, mean = 100, sd = 15), nrow = 10, ncol = 5)
rownames(lab_matrix) <- paste("Patient", 1:10, sep = "_")
colnames(lab_matrix) <- paste("Lab", LETTERS[1:5], sep = "_")
lab_dataframe <- as.data.frame(lab_matrix)

# Data frame inspection
dim(patient_demographics)     # Dimensions:  rows x columns
[1] 5 6
nrow(patient_demographics)    # Number of patients
[1] 5
ncol(patient_demographics)    # Number of variables
[1] 6
class(patient_demographics)   # Object class
[1] "data.frame"
str(patient_demographics)     # Structure summary
'data.frame':   5 obs. of  6 variables:
 $ USUBJID: chr  "STUDY001-001" "STUDY001-002" "STUDY001-003" "STUDY001-004" ...
 $ AGE    : num  45 67 23 55 34
 $ SEX    : chr  "M" "F" "M" "F" ...
 $ RACE   : chr  "WHITE" "BLACK" "ASIAN" "WHITE" ...
 $ ARM    : chr  "Placebo" "Treatment" "Placebo" "Treatment" ...
 $ RANDDT : Date, format: "2024-01-15" "2024-01-16" ...

6.2 2.2 Accessing Data Frame Elements - Patient Data Extraction

# Single column extraction (returns data frame)
age_df <- patient_demographics["AGE"]
class(age_df)  # Still a data frame
[1] "data.frame"
# Single column extraction (returns vector)
age_vector <- patient_demographics[["AGE"]]
age_vector_alt <- patient_demographics$AGE
class(age_vector)  # Vector
[1] "numeric"
# Multiple column extraction
demographics_subset <- patient_demographics[c("USUBJID", "AGE", "SEX")]

# Row and column slicing (critical for patient subsets)
# First patient's age
first_patient_age <- patient_demographics[1, "AGE"]

# First 3 patients, last 3 variables
subset_data <- patient_demographics[1:3, 4:6]

# All male patients (logical indexing)
male_patients <- patient_demographics[patient_demographics$SEX == "M", ]

# Treatment arm patients only
treatment_patients <- patient_demographics[patient_demographics$ARM == "Treatment", ]

6.3 2.3 Modifying Data Frames - Clinical Data Updates

# Create working copy
clinical_data <- patient_demographics

# Adding new columns (common in clinical programming)
clinical_data$AGEGR1 <- ifelse(clinical_data$AGE < 65, "<65", ">=65")
clinical_data$STUDY_DAY <- as.numeric(Sys.Date() - clinical_data$RANDDT)

# Adding multiple patients (enrollment updates)
new_patient <- data.frame(
  USUBJID = "STUDY001-006",
  AGE = 42,
  SEX = "F",
  RACE = "HISPANIC",
  ARM = "Placebo",
  RANDDT = as.Date("2024-01-20"),
  AGEGR1 = "<65",
  STUDY_DAY = 1
)

# Proper way to add rows (rbind)
clinical_data <- rbind(clinical_data, new_patient)

# Adding site information (column-wise merge)
site_info <- data.frame(
  SITEID = c("001", "002", "001", "002", "001", "003"),
  COUNTRY = c("USA", "USA", "USA", "USA", "USA", "CAN")
)

clinical_data <- cbind(clinical_data, site_info)

# Removing columns (data cleanup)
clinical_data$STUDY_DAY <- NULL

# Removing rows (patient withdrawal)
# Remove last patient (index 6)
clinical_data <- clinical_data[-6, ]

6.4 2.4 Data Frame Summary and Subsetting

# Comprehensive data summary
summary(clinical_data)
   USUBJID               AGE           SEX                RACE          
 Length:5           Min.   :23.0   Length:5           Length:5          
 Class :character   1st Qu.:34.0   Class :character   Class :character  
 Mode  :character   Median :45.0   Mode  :character   Mode  :character  
                    Mean   :44.8                                        
                    3rd Qu.:55.0                                        
                    Max.   :67.0                                        
     ARM                RANDDT              AGEGR1             SITEID         
 Length:5           Min.   :2024-01-15   Length:5           Length:5          
 Class :character   1st Qu.:2024-01-16   Class :character   Class :character  
 Mode  :character   Median :2024-01-17   Mode  :character   Mode  :character  
                    Mean   :2024-01-17                                        
                    3rd Qu.:2024-01-18                                        
                    Max.   :2024-01-19                                        
   COUNTRY         
 Length:5          
 Class :character  
 Mode  :character  
                   
                   
                   
# Subsetting with conditions (safety population)
male_patients <- subset(clinical_data, SEX == "M")
elderly_female <- subset(clinical_data, SEX == "F" & AGE >= 65)
treatment_elderly <- subset(clinical_data, ARM == "Treatment" & AGE >= 65)

# Using which() for row indices
male_indices <- which(clinical_data$SEX == "M")
male_patients_alt <- clinical_data[male_indices, ]

# Statistical calculations
n_managers <- sum(clinical_data$ARM == "Treatment")  # Treatment group size
mean_age <- mean(clinical_data$AGE)
age_by_treatment <- aggregate(AGE ~ ARM, data = clinical_data, FUN = mean)

# Creating composite variables
clinical_data$PATIENT_LABEL <- paste(clinical_data$USUBJID, clinical_data$ARM, sep = "_")

# Apply functions for calculations
numeric_columns <- c("AGE")
apply(clinical_data[numeric_columns], 2, function(x) c(mean = mean(x), sd = sd(x)))
          AGE
mean 44.80000
sd   17.23949

7 Chapter 3: Reading SAS Datasets - SDTM and ADaM

# Reading SDTM datasets (Study Data Tabulation Model)
# Commented out as files may not exist in all environments
# dm <- haven::read_sas("data/sdtm/dm.sas7bdat")  # Demographics
# ae <- haven::read_sas("data/sdtm/ae. sas7bdat")  # Adverse Events
# vs <- haven::read_sas("data/sdtm/vs. sas7bdat")  # Vital Signs
# lb <- haven::read_sas("data/sdtm/lb. sas7bdat")  # Laboratory

# Reading ADaM datasets (Analysis Data Model)  
# adsl <- haven::read_sas("data/adam/adsl.sas7bdat")  # Subject Level Analysis Dataset
# adae <- haven::read_sas("data/adam/adae.sas7bdat")  # Adverse Events Analysis Dataset
# adlb <- haven::read_sas("data/adam/adlb.sas7bdat")  # Laboratory Analysis Dataset

# For demonstration, create sample datasets
dm <- data.frame(
  USUBJID = paste0("STUDY001-", sprintf("%03d", 1:100)),
  ARM = rep(c("Placebo", "Treatment A", "Treatment B"), length.out = 100),
  AGE = sample(18:80, 100, replace = TRUE),
  SEX = sample(c("M", "F"), 100, replace = TRUE),
  RACE = sample(c("WHITE", "BLACK", "ASIAN", "OTHER"), 100, replace = TRUE),
  RFSTDTC = sample(seq(as.Date("2024-01-01"), as.Date("2024-06-01"), by = "day"), 100),
  stringsAsFactors = FALSE
)

ae <- data.frame(
  USUBJID = sample(dm$USUBJID, 200, replace = TRUE),
  AETERM = sample(c("Headache", "Nausea", "Fatigue", "Dizziness", "Rash"), 200, replace = TRUE),
  AESEV = sample(c("MILD", "MODERATE", "SEVERE"), 200, replace = TRUE),
  AEOUT = sample(c("RECOVERED", "ONGOING", "NOT RECOVERED"), 200, replace = TRUE),
  stringsAsFactors = FALSE
)

8 Chapter 4: Advanced dplyr Operations - Clinical Data Manipulation

8.1 4.1 Column Selection - Choosing Clinical Variables

The select() function is fundamental for creating analysis datasets from raw SDTM data.

# Basic column selection for demographics analysis
dm_analysis <- dm %>% 
  dplyr::select(USUBJID, ARM, AGE, SEX)

# Range selection (consecutive columns)
dm_basic <- dm %>% 
  dplyr::select(USUBJID: ARM)  # From USUBJID to ARM

# Numeric position selection
dm_first_three <- dm %>% 
  dplyr::select(1:3)

# Helper functions for clinical programming
treatment_vars <- dm %>% 
  dplyr::select(starts_with("ARM"))     # Treatment-related variables

date_vars <- dm %>% 
  dplyr::select(contains("DT"))         # Date variables

baseline_vars <- dm %>% 
  dplyr::select(ends_with("BL"))        # Baseline variables

# Combining selections
key_demographics <- dm %>% 
  dplyr:: select(USUBJID, AGE, SEX, starts_with("ARM"))

# Alternative syntax (less preferred in pipelines)
dm_subset <- dplyr::select(dm, USUBJID, ARM)

8.2 4.2 Dropping Columns - Data Cleanup

# Dropping unnecessary columns for analysis
# Create sample ADSL with many columns
adsl <- dm %>% 
  dplyr::mutate(
    STUDYID = "STUDY001",
    TRT01A = ARM,
    TRT01P = ARM,
    TRT01AN = case_when(
      ARM == "Placebo" ~ 0,
      ARM == "Treatment A" ~ 1,
      ARM == "Treatment B" ~ 2
    ),
    SAFFL = "Y",
    ITTFL = "Y",
    RANDDT = RFSTDTC
  )

# Drop individual columns
adsl_clean <- adsl %>% 
  dplyr::select(-c(STUDYID, RFSTDTC))

# Drop column ranges
adsl_minimal <- adsl %>% 
  dplyr::select(-c(TRT01A: TRT01AN))

# Drop by position
adsl_no_first_col <- adsl %>% 
  dplyr::select(-1)

# Drop multiple non-consecutive columns
adsl_focused <- adsl %>% 
  dplyr::select(-c(1, 3, 5:7))

# Drop using helper functions
adsl_no_treatment <- adsl %>% 
  dplyr::select(-starts_with("TRT"))

adsl_no_dates <- adsl %>% 
  dplyr::select(-ends_with("DT"))

adsl_no_flags <- adsl %>% 
  dplyr::select(-contains("FL"))

8.3 4.3 Filtering Rows - Patient Population Selection

Row filtering is critical for defining analysis populations (Safety, ITT, Per-Protocol).

# Safety population (patients who received at least one dose)
safety_population <- adsl %>% 
  dplyr::filter(SAFFL == "Y")

# Treatment group analysis
placebo_patients <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(TRT01A == "Placebo")

# Complex filtering (multiple conditions with AND)
elderly_male_treatment <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(TRT01A == "Treatment A" & SEX == "M" & AGE >= 65)

# OR conditions (either condition can be true)
special_population <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(TRT01A == "Placebo" & (SEX == "M" | AGE >= 70))

# Age range filtering
adult_population <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(AGE >= 18 & AGE <= 65)

# Multiple value selection using %in%
active_treatments <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(TRT01A %in% c("Treatment A", "Treatment B"))

# Exclusion filtering using !  (NOT operator)
non_placebo <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(! TRT01A == "Placebo")

# Excluding multiple values
excluded_ages <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(! AGE %in% c(70, 75, 80))

# Combined inclusion and exclusion
analysis_population <- adsl %>% 
  dplyr::select(USUBJID, TRT01A, SEX, AGE) %>% 
  dplyr::filter(TRT01A %in% c("Treatment A", "Treatment B") & 
                ! SEX == "F" & 
                AGE >= 18)

8.4 4.4 Variable Renaming - Standardization

# Single variable rename
adsl_renamed <- adsl %>% 
  dplyr::rename(SUBJECT_ID = USUBJID)

# Multiple variable renames
adsl_standard <- adsl %>% 
  dplyr::rename(
    SUBJECT_ID = USUBJID,
    TREATMENT = TRT01P,
    PLANNED_TREATMENT = TRT01A
  )

# Converting to lowercase (common requirement)
adsl_lower <- adsl
names(adsl_lower) <- tolower(names(adsl_lower))

8.5 4.5 Sorting Data - Clinical Data Ordering

# Single variable sorting
adsl_by_age <- adsl %>% 
  dplyr::select(USUBJID, SEX, AGE, TRT01A) %>% 
  dplyr::arrange(AGE)

# Multiple variable sorting (treatment, then age)
adsl_trt_age <- adsl %>% 
  dplyr::select(USUBJID, SEX, AGE, TRT01A) %>% 
  dplyr:: arrange(TRT01A, AGE)

# Descending order
adsl_age_desc <- adsl %>% 
  dplyr::select(USUBJID, SEX, AGE, TRT01A) %>% 
  dplyr::arrange(desc(AGE))

# Mixed ascending and descending
adsl_mixed_sort <- adsl %>% 
  dplyr::select(USUBJID, SEX, AGE, TRT01A) %>% 
  dplyr::arrange(AGE, desc(TRT01A))

8.6 4.6 Creating New Variables - Derived Variables

Variable creation is essential for analysis datasets, creating flags, categorizations, and derived endpoints.

# Simple variable creation
adsl_enhanced <- adsl %>% 
  dplyr::mutate(STUDY_NAME = "Phase III Efficacy Study")

# Conditional variable creation using if_else()
adsl_derived <- adsl %>%
  dplyr::mutate(
    AGEGR1 = dplyr::if_else(AGE < 65, "<65", ">=65"),                    # Age group
    SAFFL_DERIVED = dplyr::if_else(! is.na(RANDDT), "Y", "N"),           # Safety flag
    SEX_FULL = dplyr::if_else(SEX == "M", "Male", "Female")             # Full gender
  ) %>% 
  dplyr::select(USUBJID, AGE, AGEGR1, SEX, SEX_FULL, RANDDT, SAFFL_DERIVED)

# Complex conditional logic with case_when()
adsl_complex <- adsl %>% 
  dplyr::mutate(
    AGE_GROUP = dplyr::case_when(
      AGE < 18 ~ "Pediatric",
      AGE >= 18 & AGE < 65 ~ "Adult",
      AGE >= 65 & AGE < 75 ~ "Elderly",
      AGE >= 75 ~ "Very Elderly",
      TRUE ~ "Unknown"  # Default case for missing values
    ),
    RISK_CATEGORY = dplyr::case_when(
      AGE >= 65 & SEX == "F" ~ "High Risk",
      AGE >= 70 ~ "High Risk", 
      AGE < 30 ~ "Low Risk",
      TRUE ~ "Standard Risk"
    )
  ) %>% 
  dplyr::select(USUBJID, AGE, SEX, AGE_GROUP, RISK_CATEGORY)

8.7 4.7 Data Binding - Combining Datasets

# Create sample datasets for binding examples
dataset_a <- data.frame(
  USUBJID = c("001", "002", "003"),
  VISIT = c("V1", "V1", "V1"),
  LBTEST = c("Hemoglobin", "Hemoglobin", "Hemoglobin"),
  LBSTRESN = c(12.5, 13.2, 11.8)
)

dataset_b <- data.frame(
  USUBJID = c("004", "005", "006"),
  VISIT = c("V1", "V1", "V1"), 
  LBTEST = c("Hemoglobin", "Hemoglobin", "Hemoglobin"),
  LBSTRESN = c(14.1, 12.9, 13.5)
)

# Row binding (adding more patients/observations)
combined_lab <- dplyr::bind_rows(dataset_a, dataset_b)

# Row binding with source identification
combined_with_source <- dplyr::bind_rows(
  "Batch_1" = dataset_a,
  "Batch_2" = dataset_b,
  .id = "DATA_SOURCE"
)

# Binding with repeated datasets (useful for simulation)
repeated_data <- dplyr::bind_rows(dataset_a, dataset_b, dataset_a, .id = "ITERATION")

# Creating total rows for summary tables
adsl_with_total <- dplyr::bind_rows(
  adsl %>% 
    dplyr::select(USUBJID, TRT01P, TRT01AN) %>% 
    dplyr::slice_head(n = 10),  # First 10 for example
  
  adsl %>% 
    dplyr::slice_head(n = 10) %>% 
    dplyr::mutate(TRT01P = "Total", TRT01AN = 99) %>%
    dplyr::select(USUBJID, TRT01P, TRT01AN)
)

8.8 4.8 Recoding Variables - Data Standardization

# Creating lookup table for recoding
sex_mapping <- c("M" = "Male", "F" = "Female")

# Using recode() function
adsl_recoded <- adsl %>% 
  dplyr::mutate(SEX_LABEL = dplyr::recode(SEX, !!!sex_mapping)) %>% 
  dplyr::select(USUBJID, SEX, SEX_LABEL)

# Direct recoding without lookup table
adsl_direct_recode <- adsl %>% 
  dplyr::mutate(SEX_FULL = dplyr::recode(SEX, 
                                        "M" = "Male",
                                        "F" = "Female")) %>% 
  dplyr::select(USUBJID, SEX, SEX_FULL)

# Alternative using if_else (often more readable)
adsl_alternative <- adsl %>%
  dplyr::mutate(SEX_DESC = dplyr::if_else(SEX == "M", "Male", "Female")) %>% 
  dplyr::select(USUBJID, SEX, SEX_DESC)

8.9 4.9 Factor Creation - Categorical Data Management

# Creating factors with specified levels (important for analysis and reporting)
adsl_factors <- adsl %>%
  dplyr::mutate(
    AGEGR1_FACTOR = factor(
      dplyr::if_else(AGE < 65, "<65", ">=65"), 
      levels = c("<65", ">=65")
    ),
    SEX_FACTOR = factor(SEX, levels = c("M", "F")),
    TRT01A_FACTOR = factor(TRT01A, levels = c("Placebo", "Treatment A", "Treatment B"))
  ) %>% 
  dplyr:: select(USUBJID, AGE, AGEGR1_FACTOR, SEX_FACTOR, TRT01A_FACTOR)

# Verify factor levels
levels(adsl_factors$AGEGR1_FACTOR)
[1] "<65"  ">=65"
levels(adsl_factors$SEX_FACTOR)
[1] "M" "F"
levels(adsl_factors$TRT01A_FACTOR)
[1] "Placebo"     "Treatment A" "Treatment B"

9 Chapter 5: Advanced Data Operations

9.1 5.1 Table Joins - Merging Clinical Datasets

Joins are fundamental for combining SDTM domains or linking analysis datasets.

# Create realistic clinical datasets
patients_demo <- data.frame(
  USUBJID = paste0("STUDY001-", sprintf("%03d", 1:6)),
  AGE = c(45, 67, 23, 55, 34, 42),
  SEX = c("M", "F", "M", "F", "M", "F"),
  ARM = c("Placebo", "Treatment", "Placebo", "Treatment", "Treatment", "Placebo")
)

lab_results <- data.frame(
  USUBJID = paste0("STUDY001-", sprintf("%03d", c(2, 3, 4, 5, 7, 8))),
  LBTEST = rep(c("Hemoglobin", "Creatinine"), each = 3),
  LBSTRESN = c(12.5, 13.2, 11.8, 1.1, 0.9, 1.3),
  VISITNUM = c(1, 1, 1, 1, 1, 1)
)

print("Demographics Data:")
[1] "Demographics Data:"
print(patients_demo)
       USUBJID AGE SEX       ARM
1 STUDY001-001  45   M   Placebo
2 STUDY001-002  67   F Treatment
3 STUDY001-003  23   M   Placebo
4 STUDY001-004  55   F Treatment
5 STUDY001-005  34   M Treatment
6 STUDY001-006  42   F   Placebo
print("Laboratory Results:")
[1] "Laboratory Results:"
print(lab_results)
       USUBJID     LBTEST LBSTRESN VISITNUM
1 STUDY001-002 Hemoglobin     12.5        1
2 STUDY001-003 Hemoglobin     13.2        1
3 STUDY001-004 Hemoglobin     11.8        1
4 STUDY001-005 Creatinine      1.1        1
5 STUDY001-007 Creatinine      0.9        1
6 STUDY001-008 Creatinine      1.3        1

9.1.1 Inner Join - Only Patients with Lab Results

# Inner join: Only patients with both demographics and lab data
inner_result <- dplyr::inner_join(patients_demo, lab_results, by = "USUBJID")
print("Inner Join Result (patients with lab data):")
[1] "Inner Join Result (patients with lab data):"
print(inner_result)
       USUBJID AGE SEX       ARM     LBTEST LBSTRESN VISITNUM
1 STUDY001-002  67   F Treatment Hemoglobin     12.5        1
2 STUDY001-003  23   M   Placebo Hemoglobin     13.2        1
3 STUDY001-004  55   F Treatment Hemoglobin     11.8        1
4 STUDY001-005  34   M Treatment Creatinine      1.1        1

9.1.2 Left Join - All Patients, Lab Data Where Available

# Left join: All patients from demographics, lab data where available
left_result <- dplyr::left_join(patients_demo, lab_results, by = "USUBJID")
print("Left Join Result (all patients):")
[1] "Left Join Result (all patients):"
print(left_result)
       USUBJID AGE SEX       ARM     LBTEST LBSTRESN VISITNUM
1 STUDY001-001  45   M   Placebo       <NA>       NA       NA
2 STUDY001-002  67   F Treatment Hemoglobin     12.5        1
3 STUDY001-003  23   M   Placebo Hemoglobin     13.2        1
4 STUDY001-004  55   F Treatment Hemoglobin     11.8        1
5 STUDY001-005  34   M Treatment Creatinine      1.1        1
6 STUDY001-006  42   F   Placebo       <NA>       NA       NA

9.1.3 Right Join - All Lab Results, Demographics Where Available

# Right join: All lab results, demographics where available
right_result <- dplyr::right_join(patients_demo, lab_results, by = "USUBJID")
print("Right Join Result (all lab results):")
[1] "Right Join Result (all lab results):"
print(right_result)
       USUBJID AGE  SEX       ARM     LBTEST LBSTRESN VISITNUM
1 STUDY001-002  67    F Treatment Hemoglobin     12.5        1
2 STUDY001-003  23    M   Placebo Hemoglobin     13.2        1
3 STUDY001-004  55    F Treatment Hemoglobin     11.8        1
4 STUDY001-005  34    M Treatment Creatinine      1.1        1
5 STUDY001-007  NA <NA>      <NA> Creatinine      0.9        1
6 STUDY001-008  NA <NA>      <NA> Creatinine      1.3        1

9.1.4 Full Join - All Records from Both Datasets

# Full join:  All records from both datasets
full_result <- dplyr::full_join(patients_demo, lab_results, by = "USUBJID")
print("Full Join Result (all records):")
[1] "Full Join Result (all records):"
print(full_result)
       USUBJID AGE  SEX       ARM     LBTEST LBSTRESN VISITNUM
1 STUDY001-001  45    M   Placebo       <NA>       NA       NA
2 STUDY001-002  67    F Treatment Hemoglobin     12.5        1
3 STUDY001-003  23    M   Placebo Hemoglobin     13.2        1
4 STUDY001-004  55    F Treatment Hemoglobin     11.8        1
5 STUDY001-005  34    M Treatment Creatinine      1.1        1
6 STUDY001-006  42    F   Placebo       <NA>       NA       NA
7 STUDY001-007  NA <NA>      <NA> Creatinine      0.9        1
8 STUDY001-008  NA <NA>      <NA> Creatinine      1.3        1

9.1.5 Semi Join - Patients Who Have Lab Results (No Lab Columns)

# Semi join: Patients who have lab results (demographics only)
semi_result <- dplyr::semi_join(patients_demo, lab_results, by = "USUBJID")
print("Semi Join Result (patients with labs, demographics only):")
[1] "Semi Join Result (patients with labs, demographics only):"
print(semi_result)
       USUBJID AGE SEX       ARM
1 STUDY001-002  67   F Treatment
2 STUDY001-003  23   M   Placebo
3 STUDY001-004  55   F Treatment
4 STUDY001-005  34   M Treatment

9.1.6 Anti Join - Patients Without Lab Results

# Anti join: Patients without lab results
anti_result <- dplyr::anti_join(patients_demo, lab_results, by = "USUBJID")
print("Anti Join Result (patients without labs):")
[1] "Anti Join Result (patients without labs):"
print(anti_result)
       USUBJID AGE SEX     ARM
1 STUDY001-001  45   M Placebo
2 STUDY001-006  42   F Placebo
# Reverse anti join:  Lab results without demographics
anti_reverse <- dplyr::anti_join(lab_results, patients_demo, by = "USUBJID")
print("Reverse Anti Join (labs without demographics):")
[1] "Reverse Anti Join (labs without demographics):"
print(anti_reverse)
       USUBJID     LBTEST LBSTRESN VISITNUM
1 STUDY001-007 Creatinine      0.9        1
2 STUDY001-008 Creatinine      1.3        1

9.2 5.2 Data Reshaping with tidyr - Wide vs. Long Format

Clinical data often needs reshaping between wide and long formats for analysis and reporting.

# Create wide-format visit data (common in clinical databases)
visit_data_wide <- data.frame(
  USUBJID = paste0("STUDY001-", sprintf("%03d", 1:5)),
  BASELINE = c(95, 87, 110, 92, 88),
  WEEK_4 = c(92, 85, 108, 89, 85),
  WEEK_8 = c(89, 83, 105, 87, 83),
  WEEK_12 = c(87, 81, 103, 85, 81)
)

print("Wide Format Data (one row per patient):")
[1] "Wide Format Data (one row per patient):"
print(visit_data_wide)
       USUBJID BASELINE WEEK_4 WEEK_8 WEEK_12
1 STUDY001-001       95     92     89      87
2 STUDY001-002       87     85     83      81
3 STUDY001-003      110    108    105     103
4 STUDY001-004       92     89     87      85
5 STUDY001-005       88     85     83      81
# Convert to long format using pivot_longer()
visit_data_long <- visit_data_wide %>% 
  tidyr::pivot_longer(
    cols = c(BASELINE, WEEK_4, WEEK_8, WEEK_12),
    names_to = "VISIT",
    values_to = "LAB_VALUE"
  )

print("Long Format Data (one row per patient-visit):")
[1] "Long Format Data (one row per patient-visit):"
print(visit_data_long)
# A tibble: 20 × 3
   USUBJID      VISIT    LAB_VALUE
   <chr>        <chr>        <dbl>
 1 STUDY001-001 BASELINE        95
 2 STUDY001-001 WEEK_4          92
 3 STUDY001-001 WEEK_8          89
 4 STUDY001-001 WEEK_12         87
 5 STUDY001-002 BASELINE        87
 6 STUDY001-002 WEEK_4          85
 7 STUDY001-002 WEEK_8          83
 8 STUDY001-002 WEEK_12         81
 9 STUDY001-003 BASELINE       110
10 STUDY001-003 WEEK_4         108
11 STUDY001-003 WEEK_8         105
12 STUDY001-003 WEEK_12        103
13 STUDY001-004 BASELINE        92
14 STUDY001-004 WEEK_4          89
15 STUDY001-004 WEEK_8          87
16 STUDY001-004 WEEK_12         85
17 STUDY001-005 BASELINE        88
18 STUDY001-005 WEEK_4          85
19 STUDY001-005 WEEK_8          83
20 STUDY001-005 WEEK_12         81
# Convert back to wide format using pivot_wider()
visit_data_reconstructed <- visit_data_long %>% 
  tidyr::pivot_wider(
    id_cols = USUBJID,
    names_from = VISIT,
    values_from = LAB_VALUE,
    names_prefix = "VISIT_"  # Optional prefix
  )

print("Reconstructed Wide Format:")
[1] "Reconstructed Wide Format:"
print(visit_data_reconstructed)
# A tibble: 5 × 5
  USUBJID      VISIT_BASELINE VISIT_WEEK_4 VISIT_WEEK_8 VISIT_WEEK_12
  <chr>                 <dbl>        <dbl>        <dbl>         <dbl>
1 STUDY001-001             95           92           89            87
2 STUDY001-002             87           85           83            81
3 STUDY001-003            110          108          105           103
4 STUDY001-004             92           89           87            85
5 STUDY001-005             88           85           83            81
# Complex reshaping example with multiple measures
complex_data <- data.frame(
  USUBJID = rep(paste0("STUDY001-", sprintf("%03d", 1:3)), each = 2),
  VISIT = rep(c("BASELINE", "WEEK_4"), 3),
  HEMOGLOBIN = c(12.5, 12.8, 13.1, 13.3, 11.8, 12.1),
  CREATININE = c(1.1, 1.0, 0.9, 0.9, 1.2, 1.1)
)

# Reshape to have one column per visit-test combination
complex_wide <- complex_data %>% 
  tidyr::pivot_wider(
    id_cols = USUBJID,
    names_from = VISIT,
    values_from = c(HEMOGLOBIN, CREATININE),
    names_sep = "_"
  )

print("Complex Wide Format:")
[1] "Complex Wide Format:"
print(complex_wide)
# A tibble: 3 × 5
  USUBJID      HEMOGLOBIN_BASELINE HEMOGLOBIN_WEEK_4 CREATININE_BASELINE
  <chr>                      <dbl>             <dbl>               <dbl>
1 STUDY001-001                12.5              12.8                 1.1
2 STUDY001-002                13.1              13.3                 0.9
3 STUDY001-003                11.8              12.1                 1.2
# ℹ 1 more variable: CREATININE_WEEK_4 <dbl>

9.3 5.3 Counting and Summarizing - Clinical Summary Tables

# Simple counting for demographic tables
treatment_counts <- adsl %>% 
  dplyr::count(TRT01A) %>% 
  dplyr::rename(Treatment = TRT01A, N = n)

print("Treatment Allocation:")
[1] "Treatment Allocation:"
print(treatment_counts)
    Treatment  N
1     Placebo 34
2 Treatment A 33
3 Treatment B 33
# Cross-tabulation (treatment by sex)
treatment_sex_counts <- adsl %>% 
  dplyr::count(TRT01A, SEX) %>% 
  dplyr:: arrange(TRT01A, SEX)

print("Treatment by Sex:")
[1] "Treatment by Sex:"
print(treatment_sex_counts)
       TRT01A SEX  n
1     Placebo   F 15
2     Placebo   M 19
3 Treatment A   F 23
4 Treatment A   M 10
5 Treatment B   F 20
6 Treatment B   M 13
# Filtering before counting (safety population)
safety_counts <- adsl %>% 
  dplyr::filter(SAFFL == "Y") %>%
  dplyr::count(TRT01A) %>% 
  dplyr::rename(Treatment = TRT01A, Safety_N = n)

# Complex counting with adverse events
# Note: Using simulated AE data since file may not exist
adae_sim <- data.frame(
  USUBJID = sample(adsl$USUBJID, 200, replace = TRUE),
  AEBODSYS = sample(c("Nervous System", "Gastrointestinal", "Respiratory"), 200, replace = TRUE),
  AEDECOD = sample(c("Headache", "Nausea", "Cough", "Fatigue"), 200, replace = TRUE),
  TRTA = sample(c("Placebo", "Treatment A", "Treatment B"), 200, replace = TRUE),
  SAFFL = "Y"
)

# Remove duplicates (one AE per patient-term combination)
adae_unique <- adae_sim %>% 
  dplyr::distinct(USUBJID, AEBODSYS, AEDECOD, TRTA, SAFFL)

# Count AEs by treatment
ae_summary <- adae_unique %>%
  dplyr::filter(SAFFL == "Y") %>%
  dplyr::group_by(TRTA) %>%
  dplyr::count(AEBODSYS, AEDECOD) %>% 
  dplyr::ungroup()

# Add denominators for percentages
bign <- adsl %>%
  dplyr::filter(SAFFL == "Y") %>%
  dplyr:: count(TRT01A) %>%
  dplyr::rename(TRTA = TRT01A, total_n = n)

# Calculate percentages
ae_with_percent <- ae_summary %>%
  dplyr::left_join(bign, by = "TRTA") %>%
  dplyr::mutate(
    percent = round(n / total_n * 100, 1),
    display = paste0(n, " (", percent, "%)")
  )

# Reshape for reporting
ae_table <- ae_with_percent %>% 
  dplyr::select(AEBODSYS, AEDECOD, TRTA, display) %>% 
  tidyr::pivot_wider(
    id_cols = c(AEBODSYS, AEDECOD),
    names_from = TRTA,
    values_from = display,
    values_fill = "0 (0.0%)"
  )

print("Adverse Events by Treatment (first 10 rows):")
[1] "Adverse Events by Treatment (first 10 rows):"
print(head(ae_table, 10))
# A tibble: 10 × 5
   AEBODSYS         AEDECOD  Placebo   `Treatment A` `Treatment B`
   <chr>            <chr>    <chr>     <chr>         <chr>        
 1 Gastrointestinal Cough    4 (11.8%) 10 (30.3%)    3 (9.1%)     
 2 Gastrointestinal Fatigue  2 (5.9%)  6 (18.2%)     7 (21.2%)    
 3 Gastrointestinal Headache 7 (20.6%) 8 (24.2%)     5 (15.2%)    
 4 Gastrointestinal Nausea   3 (8.8%)  8 (24.2%)     7 (21.2%)    
 5 Nervous System   Cough    2 (5.9%)  6 (18.2%)     5 (15.2%)    
 6 Nervous System   Fatigue  7 (20.6%) 3 (9.1%)      6 (18.2%)    
 7 Nervous System   Headache 7 (20.6%) 3 (9.1%)      3 (9.1%)     
 8 Nervous System   Nausea   5 (14.7%) 3 (9.1%)      5 (15.2%)    
 9 Respiratory      Cough    6 (17.6%) 5 (15.2%)     7 (21.2%)    
10 Respiratory      Fatigue  6 (17.6%) 7 (21.2%)     6 (18.2%)    

9.4 5.4 Distinct Values and Unique Records

# Get unique treatment combinations
unique_treatments <- adsl %>% 
  dplyr::distinct(TRT01A, TRT01AN) %>% 
  dplyr::arrange(TRT01AN)

print("Unique Treatment Codes:")
[1] "Unique Treatment Codes:"
print(unique_treatments)
       TRT01A TRT01AN
1     Placebo       0
2 Treatment A       1
3 Treatment B       2
# Get unique patient-visit combinations (useful for verification)
unique_patients <- adsl %>% 
  dplyr::distinct(USUBJID) %>% 
  dplyr::summarise(unique_patients = n())

print("Number of Unique Patients:")
[1] "Number of Unique Patients:"
print(unique_patients)
  unique_patients
1             100
# Remove duplicate records while keeping specific columns
adae_distinct <- adae_sim %>% 
  dplyr::distinct(USUBJID, AEDECOD, .keep_all = TRUE)  # One AE term per patient

print(paste("Original AE records:", nrow(adae_sim)))
[1] "Original AE records: 200"
print(paste("After removing duplicates:", nrow(adae_distinct)))
[1] "After removing duplicates: 153"

10 Chapter 6: Best Practices and Advanced Techniques

10.1 6.1 Efficient Pipeline Construction

# Example of an efficient clinical programming pipeline
analysis_ready_data <- adsl %>% 
  # Step 1: Filter to analysis population
  dplyr::filter(SAFFL == "Y", ! is.na(TRT01A)) %>% 
  
  # Step 2: Create derived variables
  dplyr::mutate(
    AGEGR1 = dplyr::case_when(
      AGE < 45 ~ "<45",
      AGE >= 45 & AGE < 65 ~ "45-64", 
      AGE >= 65 ~ ">=65",
      TRUE ~ "Missing"
    ),
    AGEGR1 = factor(AGEGR1, levels = c("<45", "45-64", ">=65")),
    SEX_LABEL = dplyr::if_else(SEX == "M", "Male", "Female")
  ) %>% 
  
  # Step 3: Select final variables
  dplyr::select(USUBJID, TRT01A, TRT01AN, AGE, AGEGR1, SEX, SEX_LABEL) %>% 
  
  # Step 4: Sort for consistent output
  dplyr::arrange(TRT01AN, USUBJID)

print("Analysis Ready Dataset Structure:")
[1] "Analysis Ready Dataset Structure:"
str(analysis_ready_data)
'data.frame':   100 obs. of  7 variables:
 $ USUBJID  : chr  "STUDY001-001" "STUDY001-004" "STUDY001-007" "STUDY001-010" ...
 $ TRT01A   : chr  "Placebo" "Placebo" "Placebo" "Placebo" ...
 $ TRT01AN  : num  0 0 0 0 0 0 0 0 0 0 ...
 $ AGE      : int  32 38 71 36 28 25 57 19 47 66 ...
 $ AGEGR1   : Factor w/ 3 levels "<45","45-64",..: 1 1 3 1 1 1 2 1 2 3 ...
 $ SEX      : chr  "M" "M" "F" "F" ...
 $ SEX_LABEL: chr  "Male" "Male" "Female" "Female" ...
print(head(analysis_ready_data, 10))
        USUBJID  TRT01A TRT01AN AGE AGEGR1 SEX SEX_LABEL
1  STUDY001-001 Placebo       0  32    <45   M      Male
2  STUDY001-004 Placebo       0  38    <45   M      Male
3  STUDY001-007 Placebo       0  71   >=65   F    Female
4  STUDY001-010 Placebo       0  36    <45   F    Female
5  STUDY001-013 Placebo       0  28    <45   F    Female
6  STUDY001-016 Placebo       0  25    <45   F    Female
7  STUDY001-019 Placebo       0  57  45-64   F    Female
8  STUDY001-022 Placebo       0  19    <45   M      Male
9  STUDY001-025 Placebo       0  47  45-64   M      Male
10 STUDY001-028 Placebo       0  66   >=65   F    Female

10.2 6.2 Error Handling and Data Validation

# Data validation pipeline
validation_summary <- adsl %>% 
  dplyr::summarise(
    total_patients = n(),
    missing_age = sum(is.na(AGE)),
    missing_sex = sum(is.na(SEX)),
    missing_treatment = sum(is.na(TRT01A)),
    age_range = paste(min(AGE, na.rm = TRUE), "to", max(AGE, na.rm = TRUE)),
    unique_treatments = n_distinct(TRT01A, na.rm = TRUE),
    .groups = "drop"
  )

print("Data Validation Summary:")
[1] "Data Validation Summary:"
print(validation_summary)
  total_patients missing_age missing_sex missing_treatment age_range
1            100           0           0                 0  18 to 80
  unique_treatments
1                 3
# Check for data integrity issues
integrity_checks <- adsl %>% 
  dplyr::mutate(
    age_flag = dplyr::case_when(
      AGE < 0 | AGE > 120 ~ "Age out of range",
      is.na(AGE) ~ "Missing age",
      TRUE ~ "OK"
    ),
    sex_flag = dplyr::case_when(
      ! SEX %in% c("M", "F") ~ "Invalid sex code",
      is.na(SEX) ~ "Missing sex",
      TRUE ~ "OK"
    )
  ) %>% 
  dplyr::filter(age_flag != "OK" | sex_flag != "OK") %>% 
  dplyr::select(USUBJID, AGE, age_flag, SEX, sex_flag)

print("Data Integrity Issues:")
[1] "Data Integrity Issues:"
print(integrity_checks)
[1] USUBJID  AGE      age_flag SEX      sex_flag
<0 rows> (or 0-length row.names)

This comprehensive guide provides pharmaceutical programmers with the essential R skills for clinical data manipulation, from basic vector operations to advanced dplyr techniques used in SDTM and ADaM dataset creation and analysis.