Assignment 1

R Markdown

source("datday1.pck")

#creating starting points as 0, 0 alongside appropriate group number
new1 = c(0, 0, 120)
new2 = c(0, 0, 85)
new3 = c(0, 0, 65)

#Binding starting points to their respective matrix
UCdat[[1]] <- rbind(UCdat[[1]], new1)
UCdat[[2]] <- rbind(UCdat[[2]], new2)
UCdat[[3]] <- rbind(UCdat[[3]], new3)

#there seems to be three different groups of data, group 120, group 85, and group 65
#group v1 - time differs from 0.0 to 2.4
#group v2 and v3 - time differs from, 0 to 10,000: 10,000 is presumably censored events

#organizing into order by time for start to be at zero 
UCdat <- lapply(UCdat, function(df) {
  df <- df[order(df[,1]),]
  #adding sequential numbers to replace new1/new2/new3 column
  rownames(df) <- seq_len(nrow(df))
  return(df)
})

#test to see if organization worked as intended
print(UCdat[[2]])

##          v2     
## 1      0.00 0 85
## 2      0.48 1 85
## 3      5.12 1 85
## 4      6.94 1 85
## 5     11.78 1 85
## 6     13.33 1 85
## 7     22.49 1 85
## 8     32.11 1 85
## 9     32.37 1 85
## 10    35.66 1 85
## 11    45.35 1 85
## 12    81.76 1 85
## 13 10000.00 0 85
## 14 10000.00 0 85
## 15 10000.00 0 85
## 16 10000.00 0 85

#1.Build KM estimator

KM_Estim <- function(x) {

  #ordered by time already done in setup chunk
  
  #subtracting 1, because technically, first row for each matrix is just a starting point, not an observation
  n <- nrow(x) - 1
  
  #identifying failure events, where x2 is subset to keep only where failures occurred
  Failure_Count <- x[, 2] == 1
  
  #not directly used in survival probabilities
  Censor_Count <- x[, 2] == 0
  
  #keeping only failure events
  x_Only_Failures <- x[Failure_Count, , drop = FALSE]
  
  #tracks # of at-risk pop
  n1 <- n
  
  #survival probability starts at 1
  Survival_Prob_Rate <- 1
  
   #vector to store survival probabilities and corresponding event times
  Survival_Prob_Vec <- c(1)
  Event_Times_Vec <- c(0)
  
  #loop through each time
  Unique_Times <- unique(x[, 1])
  
  #loop through each time point where a death occurred
  for(i in Unique_Times){
  #calculating # of deaths for time i
    d1 <- sum(x[,1] == i & x[, 2] == 1) #failures
    c1 <- sum(x[,1] == i & x[, 2] == 0) #censors
    
    #Kaplan Meier formula calculation
    if(n1 > 0){
      Survival_Prob_Rate <- Survival_Prob_Rate * (n1 - d1) / n1
      
      #accounting for extra censored event due to input of starting points
      if(!(i == 0 && length(Event_Times_Vec) > 0 && Event_Times_Vec[1] ==0)){
        #storing survival rate and their corresponding event times
        Event_Times_Vec <-c(Event_Times_Vec, i)
        Survival_Prob_Vec <-c(Survival_Prob_Vec, Survival_Prob_Rate)
      }
    }
    
    #reducing # of individuals at risk prior to next iteration
    n1 <- n1 - d1 - c1
    
    if(n1 == 0){
      break
    }

  }
  #returning list of survival estimates to their corresponding time points
    return(list(time = Event_Times_Vec, surv = Survival_Prob_Vec))
}

library(tibble)

#applying Kaplan Meier function to each matrix
KM_Result1 <- KM_Estim(UCdat[[1]])
KM_Result2 <- KM_Estim(UCdat[[2]])
KM_Result3 <- KM_Estim(UCdat[[3]])

#turning to tibble in order to be easier to read
KM_Tibble1 <- tibble(time = KM_Result1$time, survival = KM_Result1$surv)
KM_Tibble2 <- tibble(time = KM_Result2$time, survival = KM_Result2$surv)
KM_Tibble3 <- tibble(time = KM_Result3$time, survival = KM_Result3$surv)

#printing to see output
print(KM_Tibble1)

## # A tibble: 15 × 2
##     time survival
##    <dbl>    <dbl>
##  1 0       1     
##  2 0.082   0.929 
##  3 0.105   0.857 
##  4 0.258   0.786 
##  5 0.306   0.714 
##  6 0.318   0.643 
##  7 0.414   0.571 
##  8 0.417   0.5   
##  9 0.642   0.429 
## 10 0.645   0.357 
## 11 1.09    0.286 
## 12 1.43    0.214 
## 13 1.48    0.143 
## 14 1.57    0.0714
## 15 2.16    0

print(KM_Tibble2)

## # A tibble: 13 × 2
##        time survival
##       <dbl>    <dbl>
##  1     0       1    
##  2     0.48    0.929
##  3     5.12    0.857
##  4     6.94    0.786
##  5    11.8     0.714
##  6    13.3     0.643
##  7    22.5     0.571
##  8    32.1     0.5  
##  9    32.4     0.429
## 10    35.7     0.357
## 11    45.4     0.286
## 12    81.8     0.214
## 13 10000       0.214

print(KM_Tibble3)

## # A tibble: 6 × 2
##      time survival
##     <dbl>    <dbl>
## 1     0      1    
## 2    13      0.929
## 3    17      0.857
## 4    25.7    0.786
## 5    55.6    0.714
## 6 10000      0.714

Plotting Kaplan-Meier survival curves for the datasets given

#office hours notes
#Yes, so allow log scaling of the x axis in your kmplot function and make sure it includes the last censoring point when it is further out, the linear plot worked fine and is sufficient
#3 potential ways to approach the km plot
#1. brute force set xlim after looking at data, to be large enough
#2. plot a long file first (that goes to 10000) so it is long enough
#3. have your program look through to find the min and max of all your kaplan meier estimates to set xlim
#The issue of log x axis is just to make

adjust_value <- 0.1  
KM_Result1$log_time <- log10(KM_Result1$time + adjust_value)
KM_Result2$log_time <- log10(KM_Result2$time + adjust_value)
KM_Result3$log_time <- log10(KM_Result3$time + adjust_value)

min_time <- min(c(KM_Result1$log_time, KM_Result2$log_time, KM_Result3$log_time))
max_time <- max(c(KM_Result1$log_time, KM_Result2$log_time, KM_Result3$log_time))

#to check if order worked
print(KM_Result1)

## $time
##  [1] 0.0000 0.0820 0.1050 0.2580 0.3060 0.3180 0.4140 0.4170 0.6420 0.6450
## [11] 1.0914 1.4322 1.4820 1.5720 2.1600
## 
## $surv
##  [1] 1.00000000 0.92857143 0.85714286 0.78571429 0.71428571 0.64285714
##  [7] 0.57142857 0.50000000 0.42857143 0.35714286 0.28571429 0.21428571
## [13] 0.14285714 0.07142857 0.00000000
## 
## $log_time
##  [1] -1.0000000 -0.7399286 -0.6882461 -0.4461170 -0.3914740 -0.3788237
##  [7] -0.2890369 -0.2865095 -0.1295961 -0.1278437  0.0760576  0.1853155
## [13]  0.1992065  0.2232363  0.3541084

#2.Plot (and over plot with additional plots in different colors) KM estimators

qqplot_exclude_10000 <- function(x, y, title, xlab, ylab, color = "blue") {
  #quantiles will help us avoid - Error in xy.coords(x, y) : 'x' and 'y' lengths differ
  x_quantiles <- quantile(x, probs = 
                            #seq(0,1, length.out = n) generates a sequence of values between 0 and 1
                            #min length x and max length y to ensure we generate only as much as we need per plot
                            seq(0, 1,length.out = min(length(x),length(y))), na.rm = TRUE)
  #removing y-values of 10000 but maintaining equal quantiles
  y_quantiles <- quantile(y[y < 10000], probs = seq(0, 1, length.out = min(length(x), length(y))), na.rm = TRUE)

  #QQ-plot generator - establishing order of input
  qqplot(x_quantiles, y_quantiles, xlab = xlab, ylab = ylab, main = title, col = color)

  #connecting points
  lines(sort(x_quantiles), sort(y_quantiles), col = color, lwd = 2)

  #indicate presence of values at 10,000 so 
  if (any(y >= 10000)) {
    #highest valid y before 10,000
    max_y_below_10000 <- max(y[y < 10000], na.rm = TRUE)
    #dashed reference line to inform about point at 10000
    abline(h = max_y_below_10000, col = "red", lty = 3, lwd = 3)
    text(min(x_quantiles), max_y_below_10000 + 2, labels = "10,000+", col = "red", pos = 4)
  }
}

#3.Calculate Q-Q or time transformation plot for survival curves: show for example data UCdat

#generating the three QQ plots
qqplot_exclude_10000(KM_Result1$time, KM_Result2$time, "QQ Plot: Group 120 vs Group 85", "Group 120", "Group 85", color = "cyan")

qqplot_exclude_10000(KM_Result1$time, KM_Result3$time, "QQ Plot: Group 120 vs Group 65", "Group 120", "Group 65", color = "blue")

qqplot_exclude_10000(KM_Result2$time, KM_Result3$time, "QQ Plot: Group 85 vs Group 65", "Group 85", "Group 65", color = "magenta")

#2/7/2025 2:24 PM • log=“xy” #extra extra plot for futher analysis

qqplot_log_xy_combined <- function(x1, y1, x2, y2, x3, y3, 
                                   title, xlab, ylab, colors = c("pink", "blue", "magenta")) {
  #computing log10 of product: x & y (for X-axis)
  log_x1_product <- log10((x1 * y1) + adjust_value)
  log_x2_product <- log10((x2 * y2) + adjust_value)
  log_x3_product <- log10((x3 * y3) + adjust_value)

  #computing log10(y) (for Y-axis)
  log_y1 <- log10(y1 + adjust_value)
  log_y2 <- log10(y2 + adjust_value)
  log_y3 <- log10(y3 + adjust_value)

  #computing quantiles to fit
  n1 <- min(length(log_x1_product), length(log_y1))
  n2 <- min(length(log_x2_product), length(log_y2))
  n3 <- min(length(log_x3_product), length(log_y3))

  x1_quantiles <- quantile(log_x1_product, probs = seq(0, 1, length.out = n1), na.rm = TRUE)
  y1_quantiles <- quantile(log_y1, probs = seq(0, 1, length.out = n1), na.rm = TRUE)

  x2_quantiles <- quantile(log_x2_product, probs = seq(0, 1, length.out = n2), na.rm = TRUE)
  y2_quantiles <- quantile(log_y2, probs = seq(0, 1, length.out = n2), na.rm = TRUE)

  x3_quantiles <- quantile(log_x3_product, probs = seq(0, 1, length.out = n3), na.rm = TRUE)
  y3_quantiles <- quantile(log_y3, probs = seq(0, 1, length.out = n3), na.rm = TRUE)

  #defining limits for transformed axes
  min_x <- floor(min(c(x1_quantiles, x2_quantiles, x3_quantiles), na.rm = TRUE))
  max_x <- ceiling(max(c(x1_quantiles, x2_quantiles, x3_quantiles), na.rm = TRUE))
  min_y <- floor(min(c(y1_quantiles, y2_quantiles, y3_quantiles), na.rm = TRUE))
  max_y <- ceiling(max(c(y1_quantiles, y2_quantiles, y3_quantiles), na.rm = TRUE))

  #generating main QQ plot with log scaling
  qqplot(x1_quantiles, y1_quantiles, xlab = xlab, ylab = ylab, main = title, col = colors[1], log = "xy", xaxt = "n", yaxt = "n")

  lines(sort(x1_quantiles), sort(y1_quantiles), col = colors[1], lwd = 2)
  lines(sort(x2_quantiles), sort(y2_quantiles), col = colors[2], lwd = 2)
  lines(sort(x3_quantiles), sort(y3_quantiles), col = colors[3], lwd = 2)

  #creating log-based ticks for both x & y axes
  ticks_x <- 10^seq(min_x, max_x, by = 1)
  ticks_y <- 10^seq(min_y, max_y, by = 1)

  #formatting x-axis in appropriate notation
  axis(1, at = log10(ticks_x), labels = formatC(ticks_x, format = "e", digits = 0))

  #formatting y-axis in appropriate notation
  axis(2, at = log10(ticks_y), labels = formatC(ticks_y, format = "e", digits = 0), las = 2)

  legend("topleft", legend = c("120 vs 85", "120 vs 65", "85 vs 65"), col = colors, lwd = 2)
}

qqplot_log_xy_combined(KM_Result1$time, KM_Result2$time, 
                       KM_Result1$time, KM_Result3$time, 
                       KM_Result2$time, KM_Result3$time, 
                       "Combined QQ Plot: log(x * y) vs log(y)", 
                       "log10(Product of Groups)", "log10(y Values)")

## Warning in x1 * y1: longer object length is not a multiple of shorter object
## length

## Warning in x2 * y2: longer object length is not a multiple of shorter object
## length

## Warning in x3 * y3: longer object length is not a multiple of shorter object
## length

## Warning in xy.coords(x, y, xlabel, ylabel, log): 4 x values <= 0 omitted from
## logarithmic plot

## Warning in xy.coords(x, y, xlabel, ylabel, log): 2 y values <= 0 omitted from
## logarithmic plot

Assignment 1

Zahir Roman, David Tohon, Raghav Paratkar, Cem Demirkoparan

2025-02-07

R Markdown

Plotting Kaplan-Meier survival curves for the datasets given