# Load essential packages for pharmaceutical data analysis
library(haven) # For reading SAS datasets (SDTM, ADaM)
library(dplyr) # For data manipulation
library(tidyr) # For data reshaping
library(stringr) # For string manipulation4 Manipulating Vectors, Data Frames, and Lists
5 Chapter 1: Vector Operations in Pharmaceutical Context
Vectors are the fundamental building blocks of R. In pharmaceutical programming, vectors are used to store patient IDs, treatment codes, lab values, and categorical variables like adverse event terms.
5.1 1.1 Creating Vectors - The Foundation
# Creating vectors - essential for pharmaceutical data
# Patient IDs (sequential numbering)
patient_ids <- 1:10
patient_ids_desc <- 10:1
patient_ids_range <- -5:10
# Using c() function for explicit vector creation
dose_levels <- c(1, 5, 10, 25, 50, 100) # Dose escalation study
safety_population <- c(1:254) # Safety population patient IDs
# Pharmaceutical example: Creating treatment codes
treatment_codes <- c("PLA", "LOW", "MED", "HIGH")5.1.1 Understanding Vector Naming in Clinical Data
# Naming vectors is crucial for clinical data interpretation
dose_groups <- c(0, 5, 10, 25)
names(dose_groups) <- c("Placebo", "Low_Dose", "Medium_Dose", "High_Dose")
print(dose_groups) Placebo Low_Dose Medium_Dose High_Dose
0 5 10 25
# Clinical example: Lab normal ranges
lab_ranges <- c(70, 100, 140)
names(lab_ranges) <- c("Glucose_Low", "Glucose_Normal", "Glucose_High")
print(lab_ranges) Glucose_Low Glucose_Normal Glucose_High
70 100 140
5.2 1.2 Vector Indexing - Accessing Patient Data
Vector indexing is critical when working with patient subsets, safety populations, or specific treatment arms.
# Sample adverse event severity scores
ae_severity <- c(1, 3, 5, 2, 4, 1, 3)
# Accessing specific patient data
ae_severity[2] # Second patient's AE severity[1] 3
ae_severity[c(1, 3, 5)] # Patients 1, 3, and 5[1] 1 5 4
ae_severity[-2] # Exclude patient 2[1] 1 5 2 4 1 3
ae_severity[-c(1, 3)] # Exclude patients 1 and 3[1] 3 2 4 1 3
# Important: Accessing non-existent indices
missing_patient <- ae_severity[10] # Returns NA
class(missing_patient) # Still numeric class[1] "numeric"
# Pharmaceutical application: Safety population subset
safety_ids <- 1:100
efficacy_subset <- safety_ids[1:85] # Efficacy population5.3 1.3 Vector Modification - Updating Clinical Data
# Laboratory values that need updating
lab_values <- c(95, 87, 110, 92)
# Single value modification (data correction)
lab_values[2] <- 89 # Correcting second patient's value
# Multiple value updates (batch correction)
lab_values[c(1, 4)] <- c(96, 94)
# Extending vectors (new patient enrollment)
lab_values[8] <- lab_values[2] # Copying value
lab_values[9] <- 105 # New patient value
# Clinical scenario: Updating multiple positions
visit_scores <- c(10, 15, 20, 25)
visit_scores[c(2, 5)] <- c(16, 30) # Update visits 2 and 55.4 1.4 Arithmetic Operations - Statistical Calculations
# Biomarker values for statistical analysis
biomarker_values <- c(12.5, 15.3, 18.7, 14.2)
# Basic transformations common in clinical analysis
biomarker_values + 2.5 # Adding baseline adjustment[1] 15.0 17.8 21.2 16.7
biomarker_values - 10 # Subtracting control value[1] 2.5 5.3 8.7 4.2
biomarker_values * 1.2 # Scaling factor[1] 15.00 18.36 22.44 17.04
biomarker_values / 2 # Half-life calculation[1] 6.25 7.65 9.35 7.10
# Advanced arithmetic operations
biomarker_values %/% 2 # Integer division[1] 6 7 9 7
biomarker_values %% 3 # Modulo operation[1] 0.5 0.3 0.7 2.2
# Essential statistical functions for clinical data
min(biomarker_values) # Minimum value[1] 12.5
max(biomarker_values) # Maximum value[1] 18.7
median(biomarker_values) # Median (robust to outliers)[1] 14.75
mean(biomarker_values) # Mean value[1] 15.175
range(biomarker_values) # Range (min, max)[1] 12.5 18.7
var(biomarker_values) # Variance[1] 6.849167
sd(biomarker_values) # Standard deviation[1] 2.617091
quantile(biomarker_values) # Quartiles 0% 25% 50% 75% 100%
12.500 13.775 14.750 16.150 18.700
IQR(biomarker_values) # Interquartile range[1] 2.375
# Custom quantiles for clinical cutoffs
quantile(biomarker_values, probs = c(0.25, 0.5, 0.75, 0.95)) 25% 50% 75% 95%
13.775 14.750 16.150 18.190
5.5 1.5 The WHICH Function - Clinical Data Subsetting
The which() function is essential for identifying patients meeting specific clinical criteria.
# Patient vital signs data
systolic_bp <- c(120, 145, 135, 160, 110, 180, 125, 155)
# Identifying hypertensive patients (>140 mmHg)
hypertensive_logical <- systolic_bp > 140
hypertensive_positions <- which(systolic_bp > 140) # Returns positions
hypertensive_values <- systolic_bp[which(systolic_bp > 140)] # Returns values
hypertensive_values_alt <- systolic_bp[systolic_bp > 140] # Alternative method
# Finding extreme values (safety signals)
min_bp <- min(systolic_bp)
min_position <- which(systolic_bp == min_bp) # Position of minimum
min_value <- systolic_bp[which(systolic_bp == min_bp)] # Actual minimum value
# Using which. min() and which.max() for efficiency
which.min(systolic_bp) # Position only[1] 5
which.max(systolic_bp) # Position only[1] 6
# Complex clinical criteria (combination conditions)
# Patients with moderate hypertension (140-160 mmHg)
moderate_hypertension <- which(systolic_bp > 140 & systolic_bp <= 160)
moderate_bp_values <- systolic_bp[moderate_hypertension]
# Patients with either low (<110) or high (>170) BP
extreme_bp_positions <- which(systolic_bp < 110 | systolic_bp > 170)
extreme_bp_values <- systolic_bp[extreme_bp_positions]5.6 1.6 The REP Function - Creating Repetitive Clinical Data
The rep() function is valuable for creating balanced study designs and simulating clinical data.
# Creating balanced treatment allocation
treatment_sequence <- rep(1:4, times = 25) # 25 patients per treatment (4 treatments)
placebo_only <- rep("Placebo", times = 50) # 50 placebo patients
# Alternating treatment pattern
alternating_trt <- rep(c("Active", "Placebo"), times = 10) # 10 cycles
# Creating patient identifiers with repetition
patient_labels <- rep("Patient", times = 100)
center_codes <- rep(c("Site_001", "Site_002", "Site_003"), times = c(30, 40, 50))
# Complex repetition patterns for study design
dose_levels <- rep(1:4, c(10, 15, 20, 25)) # Unequal allocation
treatment_blocks <- rep(1:3, each = 20) # 20 patients per block
# Multi-level repetition (nested study design)
nested_design <- rep(1:4, each = 5, times = 6) # 4 treatments, 5 reps, 6 cycles5.7 1.7 The SEQ Function - Creating Clinical Sequences
The seq() function generates sequences critical for visit schedules, dose escalations, and time series analysis.
# Visit schedule creation (days from baseline)
visit_days <- seq(from = 0, to = 365, by = 28) # Monthly visits for 1 year
dose_escalation <- seq(from = 1, to = 100, by = 10) # Dose escalation steps
# Time-based sequences
hourly_timepoints <- seq(from = 0, to = 24, by = 2) # PK sampling every 2 hours
weekly_visits <- seq(from = 7, to = 84, by = 7) # Weekly visits for 12 weeks
# Creating equal intervals
pk_timepoints <- seq(from = 0, to = 24, length = 13) # 13 timepoints over 24 hours
biomarker_schedule <- seq(from = 1, by = 14, length = 8) # Bi-weekly for 16 weeks
# Combining sequences for complex study designs
baseline_visits <- seq(from = -14, to = 0, by = 7) # Screening period
treatment_visits <- seq(from = 1, to = 168, by = 14) # Treatment period
followup_visits <- seq(from = 182, to = 365, by = 28) # Follow-up period
complete_schedule <- c(baseline_visits, treatment_visits, followup_visits)5.8 1.8 Sequence Helper Functions
# Working with patient vectors
patient_vitals <- c(120, 135, 142, 128, 156, 118, 145, 139)
# seq_len() - creates sequence from 1 to n
patient_count <- length(patient_vitals)
patient_indices <- seq_len(patient_count) # 1, 2, 3, ..., 8
# seq_along() - creates sequence matching vector length
visit_numbers <- seq_along(patient_vitals) # Same as seq_len(length(x))5.9 1.9 Missing Data Handling - Critical in Clinical Studies
# Clinical lab values with missing data (common in real studies)
hemoglobin_values <- c(12.5, NA, 13.2, 11.8, NA, 14.1)
# Detecting missing values
missing_pattern <- is.na(hemoglobin_values) # Logical vector
complete_values <- hemoglobin_values[!is.na(hemoglobin_values)] # Complete cases only
complete_data <- na.omit(hemoglobin_values) # Alternative method
# Important: Never use == with NA
# hemoglobin_values == NA # WRONG - always returns NA
# hemoglobin_values[hemoglobin_values == NA] # WRONG
# Correct approach for missing data analysis
has_missing <- any(is.na(hemoglobin_values))
missing_count <- sum(is.na(hemoglobin_values))
complete_count <- sum(!is.na(hemoglobin_values))5.10 1.10 Vector Element Membership - Clinical Criteria Checking
# Safety population and efficacy population patient IDs
safety_population <- 1:200
efficacy_population <- 1:150
per_protocol_population <- c(1:50, 55:120, 125:148)
# Checking patient membership in populations
patient_in_safety <- 175 %in% safety_population # TRUE
patient_in_efficacy <- 175 %in% efficacy_population # FALSE
# Population overlap analysis
safety_in_efficacy <- safety_population %in% efficacy_population
efficacy_in_pp <- efficacy_population %in% per_protocol_population
# Alternative using is.element()
element_check1 <- is.element(safety_population, efficacy_population)
element_check2 <- is.element(efficacy_population, per_protocol_population)
# Clinical application: Treatment compliance checking
compliant_patients <- c(1, 5, 8, 12, 15, 18, 22, 25, 30)
all_patients <- 1:50
compliance_status <- all_patients %in% compliant_patients5.11 1.11 String Operations - Essential for Clinical Data
# Basic string output
print("Clinical Study Report")[1] "Clinical Study Report"
# String concatenation for patient identifiers
patient_prefix <- "STUDY001-"
patient_numbers <- sprintf("%03d", 1:10) # Zero-padded numbers
patient_ids <- paste(patient_prefix, patient_numbers, sep = "")
# Creating visit labels
visit_labels <- paste("Visit", 1:10, sep = "_")
timepoint_labels <- paste("Day", c(1, 7, 14, 28, 56), sep = "_")
# Handling vectors of different lengths
site_codes <- c("001", "002", "003")
patient_ranges <- 1:30
full_patient_ids <- paste(site_codes, patient_ranges, sep = "-")
# Collapsing for summary output
treatment_arms <- c("Placebo", "Low_Dose", "High_Dose")
treatment_summary <- paste(treatment_arms, collapse = ", ")
# paste0() - efficient concatenation
adverse_event_ids <- paste0("AE", sprintf("%04d", 1:100))
lab_test_codes <- paste0("LAB_", c("HEM", "CHEM", "URN"), "_", 1:5)
# Using cat() for formatted output
cat("Study Status:", "Ongoing", "\n")Study Status: Ongoing
cat("Enrolled Patients:", 156, "/", 200, "\n")Enrolled Patients: 156 / 200
5.12 1.12 Advanced String Manipulation for Clinical Data
# Character counting for data validation
max_adverse_event_length <- max(nchar(c("Headache", "Nausea", "Fatigue", "Dizziness")))
# Case conversion for standardization
raw_adverse_events <- c("headache", "NAUSEA", "Fatigue", "dizziness")
standardized_upper <- toupper(raw_adverse_events)
standardized_lower <- tolower(raw_adverse_events)
# Using casefold() for flexible case conversion
ae_upper <- casefold(raw_adverse_events, upper = TRUE)
ae_lower <- casefold(raw_adverse_events, upper = FALSE)
# Character translation for data cleaning
lab_values_dirty <- "O. 5mg/dL" # O instead of 0
lab_values_clean <- chartr(old = "O", new = "0", x = lab_values_dirty)
# Sorting clinical terms
medical_terms <- c("Zyrtec", "Aspirin", "Benadryl", "Claritin")
terms_ascending <- sort(medical_terms, decreasing = FALSE)
terms_descending <- sort(medical_terms, decreasing = TRUE)
# Substring extraction for coding
study_id <- "PROTO-2024-ONCOLOGY-001"
protocol_year <- substr(study_id, start = 7, stop = 10)
therapeutic_area <- substr(study_id, start = 12, stop = 19)5.13 1.13 Regular Expressions for Clinical Data Cleaning
# Load country data for site analysis
library(countrycode)
countries <- as.vector(countrycode::codelist$country.name.en)
# Countries starting with "A" (common site locations)
countries_a <- grep(pattern = "^A", x = countries, value = TRUE)
countries_a_positions <- grep(pattern = "^A", x = countries)
# Countries ending with "y"
countries_ending_y <- countries[grep(pattern = "y$", x = countries)]
# Multi-word country names (important for site coding)
multiword_countries <- countries[grep(pattern = "\\w\\s\\w", x = countries)]
# Countries ending with "e" OR "i"
countries_e_or_i <- countries[grep(pattern = "e$|i$", x = countries)]
# Countries containing "gin" (pattern matching)
countries_with_gin <- countries[grep(pattern = "gin", x = countries)]
# Metacharacter handling in clinical data
financial_data <- c("cost $1000", "budget", "expense $500")
# Escaping special characters
dollar_entries <- financial_data[grep(pattern = "\\$", x = financial_data)]
# Finding decimal numbers in lab results
lab_results <- c("12.5 mg/dL", "normal", "15.7 mg/dL")
decimal_results <- lab_results[grep(pattern = "\\.", x = lab_results)]
# Anchoring patterns
medication_codes <- c("MED123", "ADMIN456", "MED789")
med_codes_only <- medication_codes[grep("^MED", medication_codes)]
admin_codes_only <- medication_codes[grep("456$", medication_codes)]
# Character classes for validation
study_codes <- c("ST123", "STUDY", "ST456")
numeric_codes <- study_codes[grep("\\d", study_codes)] # Contains digits
alpha_codes <- study_codes[grep("\\D", study_codes)] # Contains non-digits
# Vowel detection in adverse event terms
ae_terms <- c("headache", "nausea", "rash")
vowel_containing <- ae_terms[grep("[aeiou]", ae_terms)]
numeric_containing <- ae_terms[grep("[0-9]", ae_terms)]5.14 1.14 String Replacement - Data Standardization
# Clinical data standardization
report_text <- "Patient received Drug A, and Patient showed improvement with Drug A."
# Single replacement vs. global replacement
single_replace <- sub(pattern = "Drug A", replacement = "Investigational Product", x = report_text)
global_replace <- gsub(pattern = "Drug A", replacement = "Investigational Product", x = report_text)
# Removing whitespace for data cleaning
messy_data <- "Patient ID: 12345 Status: Active"
clean_data <- gsub(pattern = "\\s+", replacement = " ", x = messy_data) # Multiple spaces to single
no_spaces <- gsub(pattern = "\\s", replacement = "", x = messy_data) # Remove all spaces
# String splitting for data parsing
patient_info <- "LastName,FirstName,DOB,Gender"
info_components <- strsplit(x = patient_info, split = ",")
# Parsing structured clinical data
phone_numbers <- c("555-123-4567", "555-987-6543")
phone_parts <- strsplit(x = phone_numbers, split = "-")6 Chapter 2: Data Frame Manipulation with dplyr - Clinical Data Analysis
Data frames are the primary data structure for clinical datasets (SDTM, ADaM). The dplyr package provides a grammar of data manipulation essential for clinical programming.
6.1 2.1 Creating Data Frames - Study Data Structure
# Basic clinical data frame
patient_demographics <- data.frame(
USUBJID = paste0("STUDY001-", sprintf("%03d", 1:5)),
AGE = c(45, 67, 23, 55, 34),
SEX = c("M", "F", "M", "F", "M"),
RACE = c("WHITE", "BLACK", "ASIAN", "WHITE", "OTHER"),
ARM = c("Placebo", "Treatment", "Placebo", "Treatment", "Treatment"),
RANDDT = as.Date(c("2024-01-15", "2024-01-16", "2024-01-17", "2024-01-18", "2024-01-19"))
)
# More complex data frame with vectors
visit_dates <- seq(as.Date("2024-01-01"), length = 10, by = "weeks")
visit_numbers <- 1:10
study_status <- rep(c("Ongoing", "Completed"), each = 5)
visit_schedule <- data.frame(
VISITNUM = visit_numbers,
VISITDT = visit_dates,
STATUS = study_status,
stringsAsFactors = FALSE # Important for character data
)
# Converting matrix to data frame (rare but useful)
lab_matrix <- matrix(data = rnorm(50, mean = 100, sd = 15), nrow = 10, ncol = 5)
rownames(lab_matrix) <- paste("Patient", 1:10, sep = "_")
colnames(lab_matrix) <- paste("Lab", LETTERS[1:5], sep = "_")
lab_dataframe <- as.data.frame(lab_matrix)
# Data frame inspection
dim(patient_demographics) # Dimensions: rows x columns[1] 5 6
nrow(patient_demographics) # Number of patients[1] 5
ncol(patient_demographics) # Number of variables[1] 6
class(patient_demographics) # Object class[1] "data.frame"
str(patient_demographics) # Structure summary'data.frame': 5 obs. of 6 variables:
$ USUBJID: chr "STUDY001-001" "STUDY001-002" "STUDY001-003" "STUDY001-004" ...
$ AGE : num 45 67 23 55 34
$ SEX : chr "M" "F" "M" "F" ...
$ RACE : chr "WHITE" "BLACK" "ASIAN" "WHITE" ...
$ ARM : chr "Placebo" "Treatment" "Placebo" "Treatment" ...
$ RANDDT : Date, format: "2024-01-15" "2024-01-16" ...
6.2 2.2 Accessing Data Frame Elements - Patient Data Extraction
# Single column extraction (returns data frame)
age_df <- patient_demographics["AGE"]
class(age_df) # Still a data frame[1] "data.frame"
# Single column extraction (returns vector)
age_vector <- patient_demographics[["AGE"]]
age_vector_alt <- patient_demographics$AGE
class(age_vector) # Vector[1] "numeric"
# Multiple column extraction
demographics_subset <- patient_demographics[c("USUBJID", "AGE", "SEX")]
# Row and column slicing (critical for patient subsets)
# First patient's age
first_patient_age <- patient_demographics[1, "AGE"]
# First 3 patients, last 3 variables
subset_data <- patient_demographics[1:3, 4:6]
# All male patients (logical indexing)
male_patients <- patient_demographics[patient_demographics$SEX == "M", ]
# Treatment arm patients only
treatment_patients <- patient_demographics[patient_demographics$ARM == "Treatment", ]6.3 2.3 Modifying Data Frames - Clinical Data Updates
# Create working copy
clinical_data <- patient_demographics
# Adding new columns (common in clinical programming)
clinical_data$AGEGR1 <- ifelse(clinical_data$AGE < 65, "<65", ">=65")
clinical_data$STUDY_DAY <- as.numeric(Sys.Date() - clinical_data$RANDDT)
# Adding multiple patients (enrollment updates)
new_patient <- data.frame(
USUBJID = "STUDY001-006",
AGE = 42,
SEX = "F",
RACE = "HISPANIC",
ARM = "Placebo",
RANDDT = as.Date("2024-01-20"),
AGEGR1 = "<65",
STUDY_DAY = 1
)
# Proper way to add rows (rbind)
clinical_data <- rbind(clinical_data, new_patient)
# Adding site information (column-wise merge)
site_info <- data.frame(
SITEID = c("001", "002", "001", "002", "001", "003"),
COUNTRY = c("USA", "USA", "USA", "USA", "USA", "CAN")
)
clinical_data <- cbind(clinical_data, site_info)
# Removing columns (data cleanup)
clinical_data$STUDY_DAY <- NULL
# Removing rows (patient withdrawal)
# Remove last patient (index 6)
clinical_data <- clinical_data[-6, ]6.4 2.4 Data Frame Summary and Subsetting
# Comprehensive data summary
summary(clinical_data) USUBJID AGE SEX RACE
Length:5 Min. :23.0 Length:5 Length:5
Class :character 1st Qu.:34.0 Class :character Class :character
Mode :character Median :45.0 Mode :character Mode :character
Mean :44.8
3rd Qu.:55.0
Max. :67.0
ARM RANDDT AGEGR1 SITEID
Length:5 Min. :2024-01-15 Length:5 Length:5
Class :character 1st Qu.:2024-01-16 Class :character Class :character
Mode :character Median :2024-01-17 Mode :character Mode :character
Mean :2024-01-17
3rd Qu.:2024-01-18
Max. :2024-01-19
COUNTRY
Length:5
Class :character
Mode :character
# Subsetting with conditions (safety population)
male_patients <- subset(clinical_data, SEX == "M")
elderly_female <- subset(clinical_data, SEX == "F" & AGE >= 65)
treatment_elderly <- subset(clinical_data, ARM == "Treatment" & AGE >= 65)
# Using which() for row indices
male_indices <- which(clinical_data$SEX == "M")
male_patients_alt <- clinical_data[male_indices, ]
# Statistical calculations
n_managers <- sum(clinical_data$ARM == "Treatment") # Treatment group size
mean_age <- mean(clinical_data$AGE)
age_by_treatment <- aggregate(AGE ~ ARM, data = clinical_data, FUN = mean)
# Creating composite variables
clinical_data$PATIENT_LABEL <- paste(clinical_data$USUBJID, clinical_data$ARM, sep = "_")
# Apply functions for calculations
numeric_columns <- c("AGE")
apply(clinical_data[numeric_columns], 2, function(x) c(mean = mean(x), sd = sd(x))) AGE
mean 44.80000
sd 17.23949
7 Chapter 3: Reading SAS Datasets - SDTM and ADaM
# Reading SDTM datasets (Study Data Tabulation Model)
# Commented out as files may not exist in all environments
# dm <- haven::read_sas("data/sdtm/dm.sas7bdat") # Demographics
# ae <- haven::read_sas("data/sdtm/ae. sas7bdat") # Adverse Events
# vs <- haven::read_sas("data/sdtm/vs. sas7bdat") # Vital Signs
# lb <- haven::read_sas("data/sdtm/lb. sas7bdat") # Laboratory
# Reading ADaM datasets (Analysis Data Model)
# adsl <- haven::read_sas("data/adam/adsl.sas7bdat") # Subject Level Analysis Dataset
# adae <- haven::read_sas("data/adam/adae.sas7bdat") # Adverse Events Analysis Dataset
# adlb <- haven::read_sas("data/adam/adlb.sas7bdat") # Laboratory Analysis Dataset
# For demonstration, create sample datasets
dm <- data.frame(
USUBJID = paste0("STUDY001-", sprintf("%03d", 1:100)),
ARM = rep(c("Placebo", "Treatment A", "Treatment B"), length.out = 100),
AGE = sample(18:80, 100, replace = TRUE),
SEX = sample(c("M", "F"), 100, replace = TRUE),
RACE = sample(c("WHITE", "BLACK", "ASIAN", "OTHER"), 100, replace = TRUE),
RFSTDTC = sample(seq(as.Date("2024-01-01"), as.Date("2024-06-01"), by = "day"), 100),
stringsAsFactors = FALSE
)
ae <- data.frame(
USUBJID = sample(dm$USUBJID, 200, replace = TRUE),
AETERM = sample(c("Headache", "Nausea", "Fatigue", "Dizziness", "Rash"), 200, replace = TRUE),
AESEV = sample(c("MILD", "MODERATE", "SEVERE"), 200, replace = TRUE),
AEOUT = sample(c("RECOVERED", "ONGOING", "NOT RECOVERED"), 200, replace = TRUE),
stringsAsFactors = FALSE
)8 Chapter 4: Advanced dplyr Operations - Clinical Data Manipulation
8.1 4.1 Column Selection - Choosing Clinical Variables
The select() function is fundamental for creating analysis datasets from raw SDTM data.
# Basic column selection for demographics analysis
dm_analysis <- dm %>%
dplyr::select(USUBJID, ARM, AGE, SEX)
# Range selection (consecutive columns)
dm_basic <- dm %>%
dplyr::select(USUBJID: ARM) # From USUBJID to ARM
# Numeric position selection
dm_first_three <- dm %>%
dplyr::select(1:3)
# Helper functions for clinical programming
treatment_vars <- dm %>%
dplyr::select(starts_with("ARM")) # Treatment-related variables
date_vars <- dm %>%
dplyr::select(contains("DT")) # Date variables
baseline_vars <- dm %>%
dplyr::select(ends_with("BL")) # Baseline variables
# Combining selections
key_demographics <- dm %>%
dplyr:: select(USUBJID, AGE, SEX, starts_with("ARM"))
# Alternative syntax (less preferred in pipelines)
dm_subset <- dplyr::select(dm, USUBJID, ARM)8.2 4.2 Dropping Columns - Data Cleanup
# Dropping unnecessary columns for analysis
# Create sample ADSL with many columns
adsl <- dm %>%
dplyr::mutate(
STUDYID = "STUDY001",
TRT01A = ARM,
TRT01P = ARM,
TRT01AN = case_when(
ARM == "Placebo" ~ 0,
ARM == "Treatment A" ~ 1,
ARM == "Treatment B" ~ 2
),
SAFFL = "Y",
ITTFL = "Y",
RANDDT = RFSTDTC
)
# Drop individual columns
adsl_clean <- adsl %>%
dplyr::select(-c(STUDYID, RFSTDTC))
# Drop column ranges
adsl_minimal <- adsl %>%
dplyr::select(-c(TRT01A: TRT01AN))
# Drop by position
adsl_no_first_col <- adsl %>%
dplyr::select(-1)
# Drop multiple non-consecutive columns
adsl_focused <- adsl %>%
dplyr::select(-c(1, 3, 5:7))
# Drop using helper functions
adsl_no_treatment <- adsl %>%
dplyr::select(-starts_with("TRT"))
adsl_no_dates <- adsl %>%
dplyr::select(-ends_with("DT"))
adsl_no_flags <- adsl %>%
dplyr::select(-contains("FL"))8.3 4.3 Filtering Rows - Patient Population Selection
Row filtering is critical for defining analysis populations (Safety, ITT, Per-Protocol).
# Safety population (patients who received at least one dose)
safety_population <- adsl %>%
dplyr::filter(SAFFL == "Y")
# Treatment group analysis
placebo_patients <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(TRT01A == "Placebo")
# Complex filtering (multiple conditions with AND)
elderly_male_treatment <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(TRT01A == "Treatment A" & SEX == "M" & AGE >= 65)
# OR conditions (either condition can be true)
special_population <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(TRT01A == "Placebo" & (SEX == "M" | AGE >= 70))
# Age range filtering
adult_population <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(AGE >= 18 & AGE <= 65)
# Multiple value selection using %in%
active_treatments <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(TRT01A %in% c("Treatment A", "Treatment B"))
# Exclusion filtering using ! (NOT operator)
non_placebo <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(! TRT01A == "Placebo")
# Excluding multiple values
excluded_ages <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(! AGE %in% c(70, 75, 80))
# Combined inclusion and exclusion
analysis_population <- adsl %>%
dplyr::select(USUBJID, TRT01A, SEX, AGE) %>%
dplyr::filter(TRT01A %in% c("Treatment A", "Treatment B") &
! SEX == "F" &
AGE >= 18)8.4 4.4 Variable Renaming - Standardization
# Single variable rename
adsl_renamed <- adsl %>%
dplyr::rename(SUBJECT_ID = USUBJID)
# Multiple variable renames
adsl_standard <- adsl %>%
dplyr::rename(
SUBJECT_ID = USUBJID,
TREATMENT = TRT01P,
PLANNED_TREATMENT = TRT01A
)
# Converting to lowercase (common requirement)
adsl_lower <- adsl
names(adsl_lower) <- tolower(names(adsl_lower))8.5 4.5 Sorting Data - Clinical Data Ordering
# Single variable sorting
adsl_by_age <- adsl %>%
dplyr::select(USUBJID, SEX, AGE, TRT01A) %>%
dplyr::arrange(AGE)
# Multiple variable sorting (treatment, then age)
adsl_trt_age <- adsl %>%
dplyr::select(USUBJID, SEX, AGE, TRT01A) %>%
dplyr:: arrange(TRT01A, AGE)
# Descending order
adsl_age_desc <- adsl %>%
dplyr::select(USUBJID, SEX, AGE, TRT01A) %>%
dplyr::arrange(desc(AGE))
# Mixed ascending and descending
adsl_mixed_sort <- adsl %>%
dplyr::select(USUBJID, SEX, AGE, TRT01A) %>%
dplyr::arrange(AGE, desc(TRT01A))8.6 4.6 Creating New Variables - Derived Variables
Variable creation is essential for analysis datasets, creating flags, categorizations, and derived endpoints.
# Simple variable creation
adsl_enhanced <- adsl %>%
dplyr::mutate(STUDY_NAME = "Phase III Efficacy Study")
# Conditional variable creation using if_else()
adsl_derived <- adsl %>%
dplyr::mutate(
AGEGR1 = dplyr::if_else(AGE < 65, "<65", ">=65"), # Age group
SAFFL_DERIVED = dplyr::if_else(! is.na(RANDDT), "Y", "N"), # Safety flag
SEX_FULL = dplyr::if_else(SEX == "M", "Male", "Female") # Full gender
) %>%
dplyr::select(USUBJID, AGE, AGEGR1, SEX, SEX_FULL, RANDDT, SAFFL_DERIVED)
# Complex conditional logic with case_when()
adsl_complex <- adsl %>%
dplyr::mutate(
AGE_GROUP = dplyr::case_when(
AGE < 18 ~ "Pediatric",
AGE >= 18 & AGE < 65 ~ "Adult",
AGE >= 65 & AGE < 75 ~ "Elderly",
AGE >= 75 ~ "Very Elderly",
TRUE ~ "Unknown" # Default case for missing values
),
RISK_CATEGORY = dplyr::case_when(
AGE >= 65 & SEX == "F" ~ "High Risk",
AGE >= 70 ~ "High Risk",
AGE < 30 ~ "Low Risk",
TRUE ~ "Standard Risk"
)
) %>%
dplyr::select(USUBJID, AGE, SEX, AGE_GROUP, RISK_CATEGORY)8.7 4.7 Data Binding - Combining Datasets
# Create sample datasets for binding examples
dataset_a <- data.frame(
USUBJID = c("001", "002", "003"),
VISIT = c("V1", "V1", "V1"),
LBTEST = c("Hemoglobin", "Hemoglobin", "Hemoglobin"),
LBSTRESN = c(12.5, 13.2, 11.8)
)
dataset_b <- data.frame(
USUBJID = c("004", "005", "006"),
VISIT = c("V1", "V1", "V1"),
LBTEST = c("Hemoglobin", "Hemoglobin", "Hemoglobin"),
LBSTRESN = c(14.1, 12.9, 13.5)
)
# Row binding (adding more patients/observations)
combined_lab <- dplyr::bind_rows(dataset_a, dataset_b)
# Row binding with source identification
combined_with_source <- dplyr::bind_rows(
"Batch_1" = dataset_a,
"Batch_2" = dataset_b,
.id = "DATA_SOURCE"
)
# Binding with repeated datasets (useful for simulation)
repeated_data <- dplyr::bind_rows(dataset_a, dataset_b, dataset_a, .id = "ITERATION")
# Creating total rows for summary tables
adsl_with_total <- dplyr::bind_rows(
adsl %>%
dplyr::select(USUBJID, TRT01P, TRT01AN) %>%
dplyr::slice_head(n = 10), # First 10 for example
adsl %>%
dplyr::slice_head(n = 10) %>%
dplyr::mutate(TRT01P = "Total", TRT01AN = 99) %>%
dplyr::select(USUBJID, TRT01P, TRT01AN)
)8.8 4.8 Recoding Variables - Data Standardization
# Creating lookup table for recoding
sex_mapping <- c("M" = "Male", "F" = "Female")
# Using recode() function
adsl_recoded <- adsl %>%
dplyr::mutate(SEX_LABEL = dplyr::recode(SEX, !!!sex_mapping)) %>%
dplyr::select(USUBJID, SEX, SEX_LABEL)
# Direct recoding without lookup table
adsl_direct_recode <- adsl %>%
dplyr::mutate(SEX_FULL = dplyr::recode(SEX,
"M" = "Male",
"F" = "Female")) %>%
dplyr::select(USUBJID, SEX, SEX_FULL)
# Alternative using if_else (often more readable)
adsl_alternative <- adsl %>%
dplyr::mutate(SEX_DESC = dplyr::if_else(SEX == "M", "Male", "Female")) %>%
dplyr::select(USUBJID, SEX, SEX_DESC)8.9 4.9 Factor Creation - Categorical Data Management
# Creating factors with specified levels (important for analysis and reporting)
adsl_factors <- adsl %>%
dplyr::mutate(
AGEGR1_FACTOR = factor(
dplyr::if_else(AGE < 65, "<65", ">=65"),
levels = c("<65", ">=65")
),
SEX_FACTOR = factor(SEX, levels = c("M", "F")),
TRT01A_FACTOR = factor(TRT01A, levels = c("Placebo", "Treatment A", "Treatment B"))
) %>%
dplyr:: select(USUBJID, AGE, AGEGR1_FACTOR, SEX_FACTOR, TRT01A_FACTOR)
# Verify factor levels
levels(adsl_factors$AGEGR1_FACTOR)[1] "<65" ">=65"
levels(adsl_factors$SEX_FACTOR)[1] "M" "F"
levels(adsl_factors$TRT01A_FACTOR)[1] "Placebo" "Treatment A" "Treatment B"
9 Chapter 5: Advanced Data Operations
9.1 5.1 Table Joins - Merging Clinical Datasets
Joins are fundamental for combining SDTM domains or linking analysis datasets.
# Create realistic clinical datasets
patients_demo <- data.frame(
USUBJID = paste0("STUDY001-", sprintf("%03d", 1:6)),
AGE = c(45, 67, 23, 55, 34, 42),
SEX = c("M", "F", "M", "F", "M", "F"),
ARM = c("Placebo", "Treatment", "Placebo", "Treatment", "Treatment", "Placebo")
)
lab_results <- data.frame(
USUBJID = paste0("STUDY001-", sprintf("%03d", c(2, 3, 4, 5, 7, 8))),
LBTEST = rep(c("Hemoglobin", "Creatinine"), each = 3),
LBSTRESN = c(12.5, 13.2, 11.8, 1.1, 0.9, 1.3),
VISITNUM = c(1, 1, 1, 1, 1, 1)
)
print("Demographics Data:")[1] "Demographics Data:"
print(patients_demo) USUBJID AGE SEX ARM
1 STUDY001-001 45 M Placebo
2 STUDY001-002 67 F Treatment
3 STUDY001-003 23 M Placebo
4 STUDY001-004 55 F Treatment
5 STUDY001-005 34 M Treatment
6 STUDY001-006 42 F Placebo
print("Laboratory Results:")[1] "Laboratory Results:"
print(lab_results) USUBJID LBTEST LBSTRESN VISITNUM
1 STUDY001-002 Hemoglobin 12.5 1
2 STUDY001-003 Hemoglobin 13.2 1
3 STUDY001-004 Hemoglobin 11.8 1
4 STUDY001-005 Creatinine 1.1 1
5 STUDY001-007 Creatinine 0.9 1
6 STUDY001-008 Creatinine 1.3 1
9.1.1 Inner Join - Only Patients with Lab Results
# Inner join: Only patients with both demographics and lab data
inner_result <- dplyr::inner_join(patients_demo, lab_results, by = "USUBJID")
print("Inner Join Result (patients with lab data):")[1] "Inner Join Result (patients with lab data):"
print(inner_result) USUBJID AGE SEX ARM LBTEST LBSTRESN VISITNUM
1 STUDY001-002 67 F Treatment Hemoglobin 12.5 1
2 STUDY001-003 23 M Placebo Hemoglobin 13.2 1
3 STUDY001-004 55 F Treatment Hemoglobin 11.8 1
4 STUDY001-005 34 M Treatment Creatinine 1.1 1
9.1.2 Left Join - All Patients, Lab Data Where Available
# Left join: All patients from demographics, lab data where available
left_result <- dplyr::left_join(patients_demo, lab_results, by = "USUBJID")
print("Left Join Result (all patients):")[1] "Left Join Result (all patients):"
print(left_result) USUBJID AGE SEX ARM LBTEST LBSTRESN VISITNUM
1 STUDY001-001 45 M Placebo <NA> NA NA
2 STUDY001-002 67 F Treatment Hemoglobin 12.5 1
3 STUDY001-003 23 M Placebo Hemoglobin 13.2 1
4 STUDY001-004 55 F Treatment Hemoglobin 11.8 1
5 STUDY001-005 34 M Treatment Creatinine 1.1 1
6 STUDY001-006 42 F Placebo <NA> NA NA
9.1.3 Right Join - All Lab Results, Demographics Where Available
# Right join: All lab results, demographics where available
right_result <- dplyr::right_join(patients_demo, lab_results, by = "USUBJID")
print("Right Join Result (all lab results):")[1] "Right Join Result (all lab results):"
print(right_result) USUBJID AGE SEX ARM LBTEST LBSTRESN VISITNUM
1 STUDY001-002 67 F Treatment Hemoglobin 12.5 1
2 STUDY001-003 23 M Placebo Hemoglobin 13.2 1
3 STUDY001-004 55 F Treatment Hemoglobin 11.8 1
4 STUDY001-005 34 M Treatment Creatinine 1.1 1
5 STUDY001-007 NA <NA> <NA> Creatinine 0.9 1
6 STUDY001-008 NA <NA> <NA> Creatinine 1.3 1
9.1.4 Full Join - All Records from Both Datasets
# Full join: All records from both datasets
full_result <- dplyr::full_join(patients_demo, lab_results, by = "USUBJID")
print("Full Join Result (all records):")[1] "Full Join Result (all records):"
print(full_result) USUBJID AGE SEX ARM LBTEST LBSTRESN VISITNUM
1 STUDY001-001 45 M Placebo <NA> NA NA
2 STUDY001-002 67 F Treatment Hemoglobin 12.5 1
3 STUDY001-003 23 M Placebo Hemoglobin 13.2 1
4 STUDY001-004 55 F Treatment Hemoglobin 11.8 1
5 STUDY001-005 34 M Treatment Creatinine 1.1 1
6 STUDY001-006 42 F Placebo <NA> NA NA
7 STUDY001-007 NA <NA> <NA> Creatinine 0.9 1
8 STUDY001-008 NA <NA> <NA> Creatinine 1.3 1
9.1.5 Semi Join - Patients Who Have Lab Results (No Lab Columns)
# Semi join: Patients who have lab results (demographics only)
semi_result <- dplyr::semi_join(patients_demo, lab_results, by = "USUBJID")
print("Semi Join Result (patients with labs, demographics only):")[1] "Semi Join Result (patients with labs, demographics only):"
print(semi_result) USUBJID AGE SEX ARM
1 STUDY001-002 67 F Treatment
2 STUDY001-003 23 M Placebo
3 STUDY001-004 55 F Treatment
4 STUDY001-005 34 M Treatment
9.1.6 Anti Join - Patients Without Lab Results
# Anti join: Patients without lab results
anti_result <- dplyr::anti_join(patients_demo, lab_results, by = "USUBJID")
print("Anti Join Result (patients without labs):")[1] "Anti Join Result (patients without labs):"
print(anti_result) USUBJID AGE SEX ARM
1 STUDY001-001 45 M Placebo
2 STUDY001-006 42 F Placebo
# Reverse anti join: Lab results without demographics
anti_reverse <- dplyr::anti_join(lab_results, patients_demo, by = "USUBJID")
print("Reverse Anti Join (labs without demographics):")[1] "Reverse Anti Join (labs without demographics):"
print(anti_reverse) USUBJID LBTEST LBSTRESN VISITNUM
1 STUDY001-007 Creatinine 0.9 1
2 STUDY001-008 Creatinine 1.3 1
9.2 5.2 Data Reshaping with tidyr - Wide vs. Long Format
Clinical data often needs reshaping between wide and long formats for analysis and reporting.
# Create wide-format visit data (common in clinical databases)
visit_data_wide <- data.frame(
USUBJID = paste0("STUDY001-", sprintf("%03d", 1:5)),
BASELINE = c(95, 87, 110, 92, 88),
WEEK_4 = c(92, 85, 108, 89, 85),
WEEK_8 = c(89, 83, 105, 87, 83),
WEEK_12 = c(87, 81, 103, 85, 81)
)
print("Wide Format Data (one row per patient):")[1] "Wide Format Data (one row per patient):"
print(visit_data_wide) USUBJID BASELINE WEEK_4 WEEK_8 WEEK_12
1 STUDY001-001 95 92 89 87
2 STUDY001-002 87 85 83 81
3 STUDY001-003 110 108 105 103
4 STUDY001-004 92 89 87 85
5 STUDY001-005 88 85 83 81
# Convert to long format using pivot_longer()
visit_data_long <- visit_data_wide %>%
tidyr::pivot_longer(
cols = c(BASELINE, WEEK_4, WEEK_8, WEEK_12),
names_to = "VISIT",
values_to = "LAB_VALUE"
)
print("Long Format Data (one row per patient-visit):")[1] "Long Format Data (one row per patient-visit):"
print(visit_data_long)# A tibble: 20 × 3
USUBJID VISIT LAB_VALUE
<chr> <chr> <dbl>
1 STUDY001-001 BASELINE 95
2 STUDY001-001 WEEK_4 92
3 STUDY001-001 WEEK_8 89
4 STUDY001-001 WEEK_12 87
5 STUDY001-002 BASELINE 87
6 STUDY001-002 WEEK_4 85
7 STUDY001-002 WEEK_8 83
8 STUDY001-002 WEEK_12 81
9 STUDY001-003 BASELINE 110
10 STUDY001-003 WEEK_4 108
11 STUDY001-003 WEEK_8 105
12 STUDY001-003 WEEK_12 103
13 STUDY001-004 BASELINE 92
14 STUDY001-004 WEEK_4 89
15 STUDY001-004 WEEK_8 87
16 STUDY001-004 WEEK_12 85
17 STUDY001-005 BASELINE 88
18 STUDY001-005 WEEK_4 85
19 STUDY001-005 WEEK_8 83
20 STUDY001-005 WEEK_12 81
# Convert back to wide format using pivot_wider()
visit_data_reconstructed <- visit_data_long %>%
tidyr::pivot_wider(
id_cols = USUBJID,
names_from = VISIT,
values_from = LAB_VALUE,
names_prefix = "VISIT_" # Optional prefix
)
print("Reconstructed Wide Format:")[1] "Reconstructed Wide Format:"
print(visit_data_reconstructed)# A tibble: 5 × 5
USUBJID VISIT_BASELINE VISIT_WEEK_4 VISIT_WEEK_8 VISIT_WEEK_12
<chr> <dbl> <dbl> <dbl> <dbl>
1 STUDY001-001 95 92 89 87
2 STUDY001-002 87 85 83 81
3 STUDY001-003 110 108 105 103
4 STUDY001-004 92 89 87 85
5 STUDY001-005 88 85 83 81
# Complex reshaping example with multiple measures
complex_data <- data.frame(
USUBJID = rep(paste0("STUDY001-", sprintf("%03d", 1:3)), each = 2),
VISIT = rep(c("BASELINE", "WEEK_4"), 3),
HEMOGLOBIN = c(12.5, 12.8, 13.1, 13.3, 11.8, 12.1),
CREATININE = c(1.1, 1.0, 0.9, 0.9, 1.2, 1.1)
)
# Reshape to have one column per visit-test combination
complex_wide <- complex_data %>%
tidyr::pivot_wider(
id_cols = USUBJID,
names_from = VISIT,
values_from = c(HEMOGLOBIN, CREATININE),
names_sep = "_"
)
print("Complex Wide Format:")[1] "Complex Wide Format:"
print(complex_wide)# A tibble: 3 × 5
USUBJID HEMOGLOBIN_BASELINE HEMOGLOBIN_WEEK_4 CREATININE_BASELINE
<chr> <dbl> <dbl> <dbl>
1 STUDY001-001 12.5 12.8 1.1
2 STUDY001-002 13.1 13.3 0.9
3 STUDY001-003 11.8 12.1 1.2
# ℹ 1 more variable: CREATININE_WEEK_4 <dbl>
9.3 5.3 Counting and Summarizing - Clinical Summary Tables
# Simple counting for demographic tables
treatment_counts <- adsl %>%
dplyr::count(TRT01A) %>%
dplyr::rename(Treatment = TRT01A, N = n)
print("Treatment Allocation:")[1] "Treatment Allocation:"
print(treatment_counts) Treatment N
1 Placebo 34
2 Treatment A 33
3 Treatment B 33
# Cross-tabulation (treatment by sex)
treatment_sex_counts <- adsl %>%
dplyr::count(TRT01A, SEX) %>%
dplyr:: arrange(TRT01A, SEX)
print("Treatment by Sex:")[1] "Treatment by Sex:"
print(treatment_sex_counts) TRT01A SEX n
1 Placebo F 15
2 Placebo M 19
3 Treatment A F 23
4 Treatment A M 10
5 Treatment B F 20
6 Treatment B M 13
# Filtering before counting (safety population)
safety_counts <- adsl %>%
dplyr::filter(SAFFL == "Y") %>%
dplyr::count(TRT01A) %>%
dplyr::rename(Treatment = TRT01A, Safety_N = n)
# Complex counting with adverse events
# Note: Using simulated AE data since file may not exist
adae_sim <- data.frame(
USUBJID = sample(adsl$USUBJID, 200, replace = TRUE),
AEBODSYS = sample(c("Nervous System", "Gastrointestinal", "Respiratory"), 200, replace = TRUE),
AEDECOD = sample(c("Headache", "Nausea", "Cough", "Fatigue"), 200, replace = TRUE),
TRTA = sample(c("Placebo", "Treatment A", "Treatment B"), 200, replace = TRUE),
SAFFL = "Y"
)
# Remove duplicates (one AE per patient-term combination)
adae_unique <- adae_sim %>%
dplyr::distinct(USUBJID, AEBODSYS, AEDECOD, TRTA, SAFFL)
# Count AEs by treatment
ae_summary <- adae_unique %>%
dplyr::filter(SAFFL == "Y") %>%
dplyr::group_by(TRTA) %>%
dplyr::count(AEBODSYS, AEDECOD) %>%
dplyr::ungroup()
# Add denominators for percentages
bign <- adsl %>%
dplyr::filter(SAFFL == "Y") %>%
dplyr:: count(TRT01A) %>%
dplyr::rename(TRTA = TRT01A, total_n = n)
# Calculate percentages
ae_with_percent <- ae_summary %>%
dplyr::left_join(bign, by = "TRTA") %>%
dplyr::mutate(
percent = round(n / total_n * 100, 1),
display = paste0(n, " (", percent, "%)")
)
# Reshape for reporting
ae_table <- ae_with_percent %>%
dplyr::select(AEBODSYS, AEDECOD, TRTA, display) %>%
tidyr::pivot_wider(
id_cols = c(AEBODSYS, AEDECOD),
names_from = TRTA,
values_from = display,
values_fill = "0 (0.0%)"
)
print("Adverse Events by Treatment (first 10 rows):")[1] "Adverse Events by Treatment (first 10 rows):"
print(head(ae_table, 10))# A tibble: 10 × 5
AEBODSYS AEDECOD Placebo `Treatment A` `Treatment B`
<chr> <chr> <chr> <chr> <chr>
1 Gastrointestinal Cough 4 (11.8%) 10 (30.3%) 3 (9.1%)
2 Gastrointestinal Fatigue 2 (5.9%) 6 (18.2%) 7 (21.2%)
3 Gastrointestinal Headache 7 (20.6%) 8 (24.2%) 5 (15.2%)
4 Gastrointestinal Nausea 3 (8.8%) 8 (24.2%) 7 (21.2%)
5 Nervous System Cough 2 (5.9%) 6 (18.2%) 5 (15.2%)
6 Nervous System Fatigue 7 (20.6%) 3 (9.1%) 6 (18.2%)
7 Nervous System Headache 7 (20.6%) 3 (9.1%) 3 (9.1%)
8 Nervous System Nausea 5 (14.7%) 3 (9.1%) 5 (15.2%)
9 Respiratory Cough 6 (17.6%) 5 (15.2%) 7 (21.2%)
10 Respiratory Fatigue 6 (17.6%) 7 (21.2%) 6 (18.2%)
9.4 5.4 Distinct Values and Unique Records
# Get unique treatment combinations
unique_treatments <- adsl %>%
dplyr::distinct(TRT01A, TRT01AN) %>%
dplyr::arrange(TRT01AN)
print("Unique Treatment Codes:")[1] "Unique Treatment Codes:"
print(unique_treatments) TRT01A TRT01AN
1 Placebo 0
2 Treatment A 1
3 Treatment B 2
# Get unique patient-visit combinations (useful for verification)
unique_patients <- adsl %>%
dplyr::distinct(USUBJID) %>%
dplyr::summarise(unique_patients = n())
print("Number of Unique Patients:")[1] "Number of Unique Patients:"
print(unique_patients) unique_patients
1 100
# Remove duplicate records while keeping specific columns
adae_distinct <- adae_sim %>%
dplyr::distinct(USUBJID, AEDECOD, .keep_all = TRUE) # One AE term per patient
print(paste("Original AE records:", nrow(adae_sim)))[1] "Original AE records: 200"
print(paste("After removing duplicates:", nrow(adae_distinct)))[1] "After removing duplicates: 153"
10 Chapter 6: Best Practices and Advanced Techniques
10.1 6.1 Efficient Pipeline Construction
# Example of an efficient clinical programming pipeline
analysis_ready_data <- adsl %>%
# Step 1: Filter to analysis population
dplyr::filter(SAFFL == "Y", ! is.na(TRT01A)) %>%
# Step 2: Create derived variables
dplyr::mutate(
AGEGR1 = dplyr::case_when(
AGE < 45 ~ "<45",
AGE >= 45 & AGE < 65 ~ "45-64",
AGE >= 65 ~ ">=65",
TRUE ~ "Missing"
),
AGEGR1 = factor(AGEGR1, levels = c("<45", "45-64", ">=65")),
SEX_LABEL = dplyr::if_else(SEX == "M", "Male", "Female")
) %>%
# Step 3: Select final variables
dplyr::select(USUBJID, TRT01A, TRT01AN, AGE, AGEGR1, SEX, SEX_LABEL) %>%
# Step 4: Sort for consistent output
dplyr::arrange(TRT01AN, USUBJID)
print("Analysis Ready Dataset Structure:")[1] "Analysis Ready Dataset Structure:"
str(analysis_ready_data)'data.frame': 100 obs. of 7 variables:
$ USUBJID : chr "STUDY001-001" "STUDY001-004" "STUDY001-007" "STUDY001-010" ...
$ TRT01A : chr "Placebo" "Placebo" "Placebo" "Placebo" ...
$ TRT01AN : num 0 0 0 0 0 0 0 0 0 0 ...
$ AGE : int 32 38 71 36 28 25 57 19 47 66 ...
$ AGEGR1 : Factor w/ 3 levels "<45","45-64",..: 1 1 3 1 1 1 2 1 2 3 ...
$ SEX : chr "M" "M" "F" "F" ...
$ SEX_LABEL: chr "Male" "Male" "Female" "Female" ...
print(head(analysis_ready_data, 10)) USUBJID TRT01A TRT01AN AGE AGEGR1 SEX SEX_LABEL
1 STUDY001-001 Placebo 0 32 <45 M Male
2 STUDY001-004 Placebo 0 38 <45 M Male
3 STUDY001-007 Placebo 0 71 >=65 F Female
4 STUDY001-010 Placebo 0 36 <45 F Female
5 STUDY001-013 Placebo 0 28 <45 F Female
6 STUDY001-016 Placebo 0 25 <45 F Female
7 STUDY001-019 Placebo 0 57 45-64 F Female
8 STUDY001-022 Placebo 0 19 <45 M Male
9 STUDY001-025 Placebo 0 47 45-64 M Male
10 STUDY001-028 Placebo 0 66 >=65 F Female
10.2 6.2 Error Handling and Data Validation
# Data validation pipeline
validation_summary <- adsl %>%
dplyr::summarise(
total_patients = n(),
missing_age = sum(is.na(AGE)),
missing_sex = sum(is.na(SEX)),
missing_treatment = sum(is.na(TRT01A)),
age_range = paste(min(AGE, na.rm = TRUE), "to", max(AGE, na.rm = TRUE)),
unique_treatments = n_distinct(TRT01A, na.rm = TRUE),
.groups = "drop"
)
print("Data Validation Summary:")[1] "Data Validation Summary:"
print(validation_summary) total_patients missing_age missing_sex missing_treatment age_range
1 100 0 0 0 18 to 80
unique_treatments
1 3
# Check for data integrity issues
integrity_checks <- adsl %>%
dplyr::mutate(
age_flag = dplyr::case_when(
AGE < 0 | AGE > 120 ~ "Age out of range",
is.na(AGE) ~ "Missing age",
TRUE ~ "OK"
),
sex_flag = dplyr::case_when(
! SEX %in% c("M", "F") ~ "Invalid sex code",
is.na(SEX) ~ "Missing sex",
TRUE ~ "OK"
)
) %>%
dplyr::filter(age_flag != "OK" | sex_flag != "OK") %>%
dplyr::select(USUBJID, AGE, age_flag, SEX, sex_flag)
print("Data Integrity Issues:")[1] "Data Integrity Issues:"
print(integrity_checks)[1] USUBJID AGE age_flag SEX sex_flag
<0 rows> (or 0-length row.names)
This comprehensive guide provides pharmaceutical programmers with the essential R skills for clinical data manipulation, from basic vector operations to advanced dplyr techniques used in SDTM and ADaM dataset creation and analysis.