13  Creating ADaM: ADSL from SDTM-like Inputs

####----
# Program Name : adsl
# Protocol Name: 
# Author       : 
# Creation Date: `<DDMMMYYYY>` by <initials>
# Purpose      : Create ADSL dataset from SDTM datasets
# Modification : <DDMMMYYYY> by <initials>: <summary of changes>
####----

library(tidyverse)
library(lubridate)
library(xportr)
library(haven)
library(stringr)

##----------------------------------------------------------------------
## 0. Helper functions
##----------------------------------------------------------------------

# Function to group together missing strings and NAs
str_missing <- function(x) {
  is.na(x) | str_trim(as.character(x)) == ""
}

# Function to check dataframe metadata
contents <- function(dat) {
  
  row_contents <- function(nm, dat) {
    var <- dat[[nm]]
    data.frame(
      Variable = nm,
      Class    = class(var)[1],
      Label    = ifelse(is.null(attr(var, "label")),    "", attr(var, "label")),
      Format   = ifelse(is.null(attr(var, "SASformat")), "", attr(var, "SASformat")),
      stringsAsFactors = FALSE
    )
  }
  
  purrr::map_dfr(names(dat), row_contents, dat = dat)
}

##----------------------------------------------------------------------
## 1. Load SDTM from PHUSE Test Data Factory
##----------------------------------------------------------------------

sdtm <- "D:/R2SAS/r4sas/data/sdtm"

dm <- read_sas(paste0(sdtm, "/dm.sas7bdat"))
ex <- read_sas(paste0(sdtm, "/ex.sas7bdat"))
vs <- read_sas(paste0(sdtm, "/vs.sas7bdat"))
ds <- read_sas(paste0(sdtm, "/ds.sas7bdat"))

##----------------------------------------------------------------------
## 2. Derive DM-based variables (demo)
##----------------------------------------------------------------------

demo <- dm |> 
  # Keep only necessary columns
  select(
    STUDYID, USUBJID, SUBJID, SITEID,
    AGE, AGEU, SEX, RACE, ETHNIC,
    DTHDTC, ARM, ACTARM
  ) |> 
  mutate(
    # Age groups
    AGEGR1 = case_when(
      AGE < 65                ~ "<65",
      AGE >= 65 & AGE <= 70   ~ "65-70",
      AGE > 70                ~ ">70",
      TRUE                    ~ NA_character_
    ),
    AGEGR1N = case_when(
      AGEGR1 == "<65"   ~ 1,
      AGEGR1 == "65-70" ~ 2,
      AGEGR1 == ">70"   ~ 3,
      TRUE              ~ NA_real_
    ),
    
    # Race numeric
    RACEN = case_when(
      RACE == "WHITE"                                ~ 1,
      RACE == "BLACK OR AFRICAN AMERICAN"            ~ 2,
      RACE == "ASIAN"                                ~ 3,
      RACE == "AMERICAN INDIAN OR ALASKA NATIVE"     ~ 4,
      RACE == "NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER" ~ 5,
      RACE == "UNKNOWN"                              ~ 6,
      TRUE                                           ~ NA_real_
    ),
    
    # Planned treatment
    TRT01P = ARM,
    TRT01PN = case_when(
      TRT01P == "Placebo"               ~ 0,
      TRT01P == "Xanomeline Low Dose"   ~ 6,
      TRT01P == "Xanomeline High Dose"  ~ 9,
      TRUE                              ~ NA_real_
    ),
    
    # Actual treatment
    TRT01A = ACTARM,
    TRT01AN = case_when(
      TRT01A == "Placebo"               ~ 0,
      TRT01A == "Xanomeline Low Dose"   ~ 6,
      TRT01A == "Xanomeline High Dose"  ~ 9,
      TRUE                              ~ NA_real_
    ),
    
    # ITT flag: Y if ARM not missing
    ITTFL = if_else(!str_missing(ARM), "Y", "N"),
    
    # Death date
    DTHDT = as_date(DTHDTC)
  ) |> 
  select(-DTHDTC)

##----------------------------------------------------------------------
## 3. EX-based variables: treatment dates, duration, dose
##----------------------------------------------------------------------

# Treatment dates
treat <- ex |> 
  select(USUBJID, EXSTDTC, EXENDTC) |> 
  mutate(
    across(ends_with("DTC"), as_date)
  ) |> 
  group_by(USUBJID) |> 
  summarise(
    TRTSDT = suppressWarnings(min(EXSTDTC, na.rm = TRUE)),
    TRTEDT = suppressWarnings(max(EXENDTC, na.rm = TRUE)),
    .groups = "drop"
  ) |> 
  mutate(
    # Handle Inf from min/max when all values are NA
    TRTSDT = replace(TRTSDT, is.infinite(TRTSDT), NA_Date_),
    TRTEDT = replace(TRTEDT, is.infinite(TRTEDT), NA_Date_),
    # Treatment duration (days)
    TRTDURD = as.numeric(TRTEDT - TRTSDT + (TRTEDT >= TRTSDT))
  )

# Dosing information
dose <- ex |> 
  group_by(USUBJID) |> 
  summarise(
    DOSE01A = sum(EXDOSE, na.rm = TRUE),
    .groups = "drop"
  ) |> 
  mutate(
    DOSE01U = "mg"
  ) |> 
  select(USUBJID, DOSE01A, DOSE01U)

# Merge DM- and EX-based info
demo1 <- reduce(
  list(demo, treat, dose),
  .f = left_join,
  by = "USUBJID"
) |> 
  mutate(
    # Safety flag: Y if ITTFL = Y and TRTSDT not missing
    SAFFL = if_else(ITTFL == "Y" & !str_missing(TRTSDT), "Y", "N")
  )

##----------------------------------------------------------------------
## 4. DS-based variables: EOS, discontinuation, screen failure
##----------------------------------------------------------------------

demo2 <- demo1 |> 
  left_join(
    ds |> 
      filter(DSCAT == "DISPOSITION EVENT") |> 
      select(USUBJID, DSCAT, DSDECOD, DSTERM),
    by = "USUBJID"
  ) |> 
  mutate(
    # End of study status
    EOSSTT = case_when(
      str_missing(DSDECOD)          ~ "ONGOING",
      DSDECOD == "COMPLETED"        ~ "COMPLETED",
      TRUE                          ~ "DISCONTINUED"
    ),
    # Discontinuation reason and text
    DCSREAS  = if_else(
      DSDECOD != "COMPLETED" & !str_missing(DSDECOD),
      DSDECOD,
      "",
      missing = ""
    ),
    DCSREASP = if_else(
      DSDECOD != "COMPLETED" & !str_missing(DSTERM),
      DSTERM,
      "",
      missing = ""
    ),
    # Screen failure flag
    SCRNFL = if_else(
      DSDECOD == "SCREEN FAILURE",
      "Y",
      "N",
      missing = "N"
    )
  ) |> 
  select(-c(DSCAT, DSDECOD, DSTERM))

##----------------------------------------------------------------------
## 5. VS-based variables: weight, height, BMI
##----------------------------------------------------------------------

# Baseline weight (visit 3)
weight <- vs |> 
  filter(VSTESTCD == "WEIGHT", VISITNUM == 3) |> 
  select(USUBJID, WEIGHTBL = VSSTRESN)

# Baseline height (visit 1)
height <- vs |> 
  filter(VSTESTCD == "HEIGHT", VISITNUM == 1) |> 
  select(USUBJID, HEIGHTBL = VSSTRESN)

demo3 <- reduce(
  list(demo2, weight, height),
  .f = left_join,
  by = "USUBJID"
) |> 
  mutate(
    BMIBL = (WEIGHTBL / HEIGHTBL / HEIGHTBL) * 10000,
    BMIGR1 = case_when(
      BMIBL < 25        ~ "<25",
      BMIBL >= 25       ~ ">=25",
      TRUE              ~ NA_character_
    )
  )

##----------------------------------------------------------------------
## 6. Metadata (dataset + variable level) and final ADSL
##----------------------------------------------------------------------

# Dataset-level metadata
dlm <- tribble(
  ~dataset, ~label,
  "adsl",   "Subject-Level Analysis Dataset"
)

# Variable-level metadata
vlm <- tribble(
  ~dataset, ~variable, ~label, ~type, ~format,
  "adsl", "STUDYID", "Study Identifier", "character", NA_character_,
  "adsl", "USUBJID", "Unique Subject Identifier", "character", NA_character_,
  "adsl", "SUBJID", "Subject Identifier for the Study", "character", NA_character_,
  "adsl", "SITEID", "Study Site Identifier", "character", NA_character_,
  "adsl", "AGE", "Age", "numeric", NA_character_,
  "adsl", "AGEU", "Age Units", "character", NA_character_,
  "adsl", "AGEGR1", "Pooled Age Group 1", "character", NA_character_,
  "adsl", "AGEGR1N", "Pooled Age Group 1 (N)", "numeric", NA_character_,
  "adsl", "SEX", "Sex", "character", NA_character_,
  "adsl", "RACE", "Race", "character", NA_character_,
  "adsl", "RACEN", "Race (N)", "numeric", NA_character_,
  "adsl", "ETHNIC", "Ethnicity", "character", NA_character_,
  "adsl", "SAFFL", "Safety Population Flag", "character", NA_character_,
  "adsl", "ITTFL", "Intent-To-Treat Population Flag", "character", NA_character_,
  "adsl", "SCRNFL", "Screen Failure Population Flag", "character", NA_character_,
  "adsl", "ARM", "Description of Planned Arm", "character", NA_character_,
  "adsl", "ACTARM", "Description of Actual Arm", "character", NA_character_,
  "adsl", "TRT01P", "Planned Treatment for Period 01", "character", NA_character_,
  "adsl", "TRT01PN", "Planned Treatment for Period 01 (N)", "numeric", NA_character_,
  "adsl", "TRT01A", "Actual Treatment for Period 01", "character", NA_character_,
  "adsl", "TRT01AN", "Actual Treatment for Period 01 (N)", "numeric", NA_character_,
  "adsl", "DOSE01A", "Actual Treatment Dose for Period 01", "numeric", NA_character_,
  "adsl", "DOSE01U", "Units for Dose for Period 01", "character", NA_character_,
  "adsl", "TRTSDT", "Date of First Exposure to Treatment", "Date", "DATE9.",
  "adsl", "TRTEDT", "Date of Last Exposure to Treatment", "Date", "DATE9.",
  "adsl", "TRTDURD", "Total Treatment Duration (Days)", "numeric", NA_character_,
  "adsl", "EOSSTT", "End of Study Status", "character", NA_character_,
  "adsl", "DCSREAS", "Reason for Discontinuation from Study", "character", NA_character_,
  "adsl", "DCSREASP", "Reason Spec for Discont from Study", "character", NA_character_,
  "adsl", "DTHDT", "Date of Death", "Date", "DATE9.",
  "adsl", "WEIGHTBL", "Weight (kg) at Baseline", "numeric", NA_character_,
  "adsl", "HEIGHTBL", "Height (cm) at Baseline", "numeric", NA_character_,
  "adsl", "BMIBL", "Body Mass Index (kg/m2) at Baseline", "numeric", NA_character_,
  "adsl", "BMIGR1", "Pooled BMI Group 1", "character", NA_character_
)

adsl <- demo3 |> 
  # Order variables
  select(
    STUDYID, USUBJID, SUBJID, SITEID,
    AGE, AGEU, AGEGR1, AGEGR1N, SEX, RACE, RACEN, ETHNIC,
    SAFFL, ITTFL, SCRNFL,
    ARM, ACTARM,
    TRT01P, TRT01PN, TRT01A, TRT01AN,
    DOSE01A, DOSE01U,
    TRTSDT, TRTEDT, TRTDURD,
    EOSSTT, DCSREAS, DCSREASP,
    DTHDT,
    WEIGHTBL, HEIGHTBL, BMIBL, BMIGR1
  ) |> 
  arrange(USUBJID)  |> 
  # Apply labels / types / formats
  xportr_df_label(dlm, domain   = "adsl") |> 
  xportr_label(vlm,   domain   = "adsl") |> 
  xportr_type(vlm,    domain   = "adsl") |> 
  xportr_format(vlm,  domain   = "adsl") |> 
  # Replace character NA with ""
  mutate(across(where(is.character), ~ replace(., is.na(.), "")))

##----------------------------------------------------------------------
## 7. QC: label + metadata + export
##----------------------------------------------------------------------

# Check dataframe label
attr(adsl, "label")

# Check metadata
adsl |> 
  contents()

# Output to specified path
xportr_write(adsl, "./adsl.xpt")

Note: Real ADSL creation must follow ADaM IG (derive flags, dates, imputations, populations). This example is educational only.

14 ADVS program

# ADVS Dataset Creation Program
# Analysis Dataset for Vital Signs
# Following CDISC ADaM Standards

# Load required libraries
library(dplyr)
library(lubridate)
library(haven)

# Function to create ADVS dataset
create_advs <- function(vs_data, adsl_data) {
  
  # Step 1: Start with VS domain and merge with ADSL
  advs <- vs_data %>%
    # Filter for vital signs records
    filter(! is.na(vsstresn) | !is.na(vsstresc)) %>%
    # Merge with ADSL to get treatment information
    left_join(adsl_data, by = c("studyid" = "STUDYID", "usubjid" = "USUBJID"))
  
  # Step 2: Create basic variables
  advs <- advs %>%
    mutate(
      # Study and subject identifiers
      STUDYID = studyid,
      USUBJID = usubjid,
      
      # Treatment variables from ADSL
      TRTP = TRT01P,
      TRTPN = TRT01PN,
      TRTA = TRT01A,
      TRTAN = TRT01AN,
      
      # Parameter information
      PARAM = case_when(
        vstestcd == "SYSBP" ~ "Systolic Blood Pressure (mmHg)",
        vstestcd == "DIABP" ~ "Diastolic Blood Pressure (mmHg)",
        vstestcd == "PULSE" ~ "Pulse Rate (beats/min)",
        vstestcd == "RESP" ~ "Respiratory Rate (breaths/min)",
        vstestcd == "TEMP" ~ "Body Temperature (C)",
        vstestcd == "WEIGHT" ~ "Weight (kg)",
        vstestcd == "HEIGHT" ~ "Height (cm)",
        TRUE ~ vstest
      ),
      PARAMCD = vstestcd,
      PARAMN = case_when(
        vstestcd == "SYSBP" ~ 1,
        vstestcd == "DIABP" ~ 2,
        vstestcd == "PULSE" ~ 3,
        vstestcd == "RESP" ~ 4,
        vstestcd == "TEMP" ~ 5,
        vstestcd == "WEIGHT" ~ 6,
        vstestcd == "HEIGHT" ~ 7,
        TRUE ~ as.numeric(NA)
      ),
      
      # Analysis value
      AVAL = as.numeric(vsstresn),
      
      # Date/time variables
      ADT = as.Date(substr(vsdtc, 1, 10)),
      ADY = as.numeric(ADT - as.Date(TRTSDT) + (ADT >= as.Date(TRTSDT))),
      
      # Visit information
      VISIT = visit,
      ATPT = vstpt,
      ATPTN = vstptnum
    ) %>%
    # Filter to keep only records with non-missing AVAL
    filter(! is.na(AVAL))
  
  # Step 3: Create analysis visit windows (this would typically come from a separate mapping)
  # For demonstration, creating basic visit windows
  advs <- advs %>%
    mutate(
      AVISIT = case_when(
        grepl("SCREENING|SCREEN", toupper(VISIT)) ~ "Screening",
        grepl("BASELINE|DAY 1", toupper(VISIT)) ~ "Baseline",
        grepl("WEEK 2|DAY 14", toupper(VISIT)) ~ "Week 2",
        grepl("WEEK 4|DAY 28", toupper(VISIT)) ~ "Week 4",
        grepl("WEEK 8|DAY 56", toupper(VISIT)) ~ "Week 8",
        grepl("WEEK 12|DAY 84", toupper(VISIT)) ~ "Week 12",
        grepl("FOLLOW", toupper(VISIT)) ~ "Follow-up",
        TRUE ~ VISIT
      ),
      AVISITN = case_when(
        AVISIT == "Screening" ~ -1,
        AVISIT == "Baseline" ~ 0,
        AVISIT == "Week 2" ~ 2,
        AVISIT == "Week 4" ~ 4,
        AVISIT == "Week 8" ~ 8,
        AVISIT == "Week 12" ~ 12,
        AVISIT == "Follow-up" ~ 99,
        TRUE ~ as.numeric(NA)
      ),
      
      # Analysis window variables (example values)
      AWTARGET = AVISITN * 7, # Target day
      AWU = "DAYS",
      AWLO = AWTARGET - 3, # 3-day window
      AWHI = AWTARGET + 3,
      AWTDIFF = abs(ADY - AWTARGET)
    )
  
  # Step 4:  Determine baseline records
  advs <- advs %>%
    group_by(USUBJID, PARAMCD) %>%
    mutate(
      # Baseline flag - last assessment on or before treatment start date
      baseline_candidate = case_when(
        ADT <= as.Date(TRTSDT) ~ ADT,
        TRUE ~ as.Date(NA)
      )
    ) %>%
    group_by(USUBJID, PARAMCD) %>%
    mutate(
      max_baseline_date = max(baseline_candidate, na.rm = TRUE),
      ABLFL = case_when(
        ! is.na(baseline_candidate) & baseline_candidate == max_baseline_date ~ "Y",
        TRUE ~ ""
      )
    ) %>%
    ungroup()
  
  # Step 5: Create BASE, CHG, and PCHG
  advs <- advs %>%
    group_by(USUBJID, PARAMCD) %>%
    mutate(
      BASE = case_when(
        any(ABLFL == "Y") ~ AVAL[ABLFL == "Y"][1],
        TRUE ~ as. numeric(NA)
      ),
      CHG = case_when(
        ! is.na(BASE) & ADT > as.Date(TRTSDT) ~ AVAL - BASE,
        TRUE ~ as. numeric(NA)
      ),
      PCHG = case_when(
        !is.na(BASE) & BASE != 0 & ADT > as. Date(TRTSDT) ~ ((AVAL - BASE) / BASE) * 100,
        TRUE ~ as.numeric(NA)
      )
    ) %>%
    ungroup()
  
  # Step 6: Create analysis flags
  advs <- advs %>%
    mutate(
      # ANL01FL - Analysis Flag 01 (include all valid records)
      ANL01FL = "Y",
      
      # DTYPE - Derivation Type (empty for observed values)
      DTYPE = ""
    ) %>%
    group_by(USUBJID, PARAMCD, AVISIT) %>%
    mutate(
      # ANL02FL - One record per visit per parameter
      visit_rank = rank(AWTDIFF, ties.method = "first"),
      ANL02FL = case_when(
        ANL01FL == "Y" & visit_rank == 1 ~ "Y",
        TRUE ~ ""
      )
    ) %>%
    ungroup()
  
  # Step 7: Create criterion variables for blood pressure parameters
  advs <- advs %>%
    mutate(
      # Criterion 1: Low BP
      CRIT1 = case_when(
        PARAMCD == "SYSBP" ~ "SYSBP <= 90 mmHg",
        PARAMCD == "DIABP" ~ "DIABP <= 60 mmHg",
        TRUE ~ ""
      ),
      CRIT1FL = case_when(
        PARAMCD == "SYSBP" & AVAL <= 90 ~ "Y",
        PARAMCD == "DIABP" & AVAL <= 60 ~ "Y",
        PARAMCD %in% c("SYSBP", "DIABP") ~ "",
        TRUE ~ ""
      ),
      
      # Criterion 2: High BP
      CRIT2 = case_when(
        PARAMCD == "SYSBP" ~ "SYSBP >= 140 mmHg",
        PARAMCD == "DIABP" ~ "DIABP >= 90 mmHg",
        TRUE ~ ""
      ),
      CRIT2FL = case_when(
        PARAMCD == "SYSBP" & AVAL >= 140 ~ "Y",
        PARAMCD == "DIABP" & AVAL >= 90 ~ "Y",
        PARAMCD %in% c("SYSBP", "DIABP") ~ "",
        TRUE ~ ""
      ),
      
      # Criterion 3: >20% increase from baseline
      CRIT3 = case_when(
        PARAMCD == "SYSBP" ~ "SYSBP change from baseline > 20% increase",
        PARAMCD == "DIABP" ~ "DIABP change from baseline > 20% increase",
        TRUE ~ ""
      ),
      CRIT3FL = case_when(
        PARAMCD == "SYSBP" & ! is.na(PCHG) & PCHG > 20 ~ "Y",
        PARAMCD == "DIABP" & !is.na(PCHG) & PCHG > 20 ~ "Y",
        PARAMCD %in% c("SYSBP", "DIABP") & !is.na(PCHG) ~ "",
        TRUE ~ ""
      ),
      
      # Criterion 4: >20% decrease from baseline
      CRIT4 = case_when(
        PARAMCD == "SYSBP" ~ "SYSBP change from baseline > 20% decrease",
        PARAMCD == "DIABP" ~ "DIABP change from baseline > 20% decrease",
        TRUE ~ ""
      ),
      CRIT4FL = case_when(
        PARAMCD == "SYSBP" & !is.na(PCHG) & PCHG < -20 ~ "Y",
        PARAMCD == "DIABP" & !is. na(PCHG) & PCHG < -20 ~ "Y",
        PARAMCD %in% c("SYSBP", "DIABP") & !is.na(PCHG) ~ "",
        TRUE ~ ""
      )
    )
  
  # Step 8: Select final variables in correct order
  advs_final <- advs %>%
    select(
      STUDYID, USUBJID, TRTP, TRTPN, TRTA, TRTAN,
      PARAM, PARAMCD, PARAMN, AVAL, BASE, CHG, PCHG,
      DTYPE, ADT, ADY, AVISIT, AVISITN, AWTARGET, AWTDIFF,
      AWLO, AWHI, AWU, VISIT, ATPT, ATPTN,
      ABLFL, ANL01FL, ANL02FL,
      CRIT1, CRIT1FL, CRIT2, CRIT2FL, CRIT3, CRIT3FL, CRIT4, CRIT4FL,
      # Include core ADSL variables
      AGE, AGEU, AGEGR1, AGEGR1N, SEX, RACE, RACEN, SAFFL, ITTFL
    ) %>%
    arrange(STUDYID, USUBJID, PARAMN, ADT, ATPTN)
  
  return(advs_final)
}

# Example usage: 
# Load your data
# vs <- read_sas("path/to/vs.sas7bdat")
# adsl <- read_sas("path/to/adsl. sas7bdat")

# Create ADVS dataset
# advs <- create_advs(vs, adsl)

# Write to file
# write_sas(advs, "path/to/advs.sas7bdat")

# Print summary
# cat("ADVS dataset created with", nrow(advs), "records\n")
# cat("Parameters included:", paste(unique(advs$PARAMCD), collapse = ", "), "\n")
# cat("Subjects included:", length(unique(advs$USUBJID)), "\n")

Exercises 1. Add analysis populations (e.g., SAFFL, FASFL) based on simple rules. 2. Derive AGEGR1 as <65 / ≥65 and use ordered factor. 3. Add a treatment end date TRTEDT and compute treatment duration.