source("datday1.pck")
#creating starting points as 0, 0 alongside appropriate group number
new1 = c(0, 0, 120)
new2 = c(0, 0, 85)
new3 = c(0, 0, 65)
#Binding starting points to their respective matrix
UCdat[[1]] <- rbind(UCdat[[1]], new1)
UCdat[[2]] <- rbind(UCdat[[2]], new2)
UCdat[[3]] <- rbind(UCdat[[3]], new3)
#there seems to be three different groups of data, group 120, group 85, and group 65
#group v1 - time differs from 0.0 to 2.4
#group v2 and v3 - time differs from, 0 to 10,000: 10,000 is presumably censored events
#organizing into order by time for start to be at zero
UCdat <- lapply(UCdat, function(df) {
df <- df[order(df[,1]),]
#adding sequential numbers to replace new1/new2/new3 column
rownames(df) <- seq_len(nrow(df))
return(df)
})
#test to see if organization worked as intended
print(UCdat[[2]])
## v2
## 1 0.00 0 85
## 2 0.48 1 85
## 3 5.12 1 85
## 4 6.94 1 85
## 5 11.78 1 85
## 6 13.33 1 85
## 7 22.49 1 85
## 8 32.11 1 85
## 9 32.37 1 85
## 10 35.66 1 85
## 11 45.35 1 85
## 12 81.76 1 85
## 13 10000.00 0 85
## 14 10000.00 0 85
## 15 10000.00 0 85
## 16 10000.00 0 85
#1.Build KM estimator
KM_Estim <- function(x) {
#ordered by time already done in setup chunk
#subtracting 1, because technically, first row for each matrix is just a starting point, not an observation
n <- nrow(x) - 1
#identifying failure events, where x2 is subset to keep only where failures occurred
Failure_Count <- x[, 2] == 1
#not directly used in survival probabilities
Censor_Count <- x[, 2] == 0
#keeping only failure events
x_Only_Failures <- x[Failure_Count, , drop = FALSE]
#tracks # of at-risk pop
n1 <- n
#survival probability starts at 1
Survival_Prob_Rate <- 1
#vector to store survival probabilities and corresponding event times
Survival_Prob_Vec <- c(1)
Event_Times_Vec <- c(0)
#loop through each time
Unique_Times <- unique(x[, 1])
#loop through each time point where a death occurred
for(i in Unique_Times){
#calculating # of deaths for time i
d1 <- sum(x[,1] == i & x[, 2] == 1) #failures
c1 <- sum(x[,1] == i & x[, 2] == 0) #censors
#Kaplan Meier formula calculation
if(n1 > 0){
Survival_Prob_Rate <- Survival_Prob_Rate * (n1 - d1) / n1
#accounting for extra censored event due to input of starting points
if(!(i == 0 && length(Event_Times_Vec) > 0 && Event_Times_Vec[1] ==0)){
#storing survival rate and their corresponding event times
Event_Times_Vec <-c(Event_Times_Vec, i)
Survival_Prob_Vec <-c(Survival_Prob_Vec, Survival_Prob_Rate)
}
}
#reducing # of individuals at risk prior to next iteration
n1 <- n1 - d1 - c1
if(n1 == 0){
break
}
}
#returning list of survival estimates to their corresponding time points
return(list(time = Event_Times_Vec, surv = Survival_Prob_Vec))
}
library(tibble)
#applying Kaplan Meier function to each matrix
KM_Result1 <- KM_Estim(UCdat[[1]])
KM_Result2 <- KM_Estim(UCdat[[2]])
KM_Result3 <- KM_Estim(UCdat[[3]])
#turning to tibble in order to be easier to read
KM_Tibble1 <- tibble(time = KM_Result1$time, survival = KM_Result1$surv)
KM_Tibble2 <- tibble(time = KM_Result2$time, survival = KM_Result2$surv)
KM_Tibble3 <- tibble(time = KM_Result3$time, survival = KM_Result3$surv)
#printing to see output
print(KM_Tibble1)
## # A tibble: 15 × 2
## time survival
## <dbl> <dbl>
## 1 0 1
## 2 0.082 0.929
## 3 0.105 0.857
## 4 0.258 0.786
## 5 0.306 0.714
## 6 0.318 0.643
## 7 0.414 0.571
## 8 0.417 0.5
## 9 0.642 0.429
## 10 0.645 0.357
## 11 1.09 0.286
## 12 1.43 0.214
## 13 1.48 0.143
## 14 1.57 0.0714
## 15 2.16 0
print(KM_Tibble2)
## # A tibble: 13 × 2
## time survival
## <dbl> <dbl>
## 1 0 1
## 2 0.48 0.929
## 3 5.12 0.857
## 4 6.94 0.786
## 5 11.8 0.714
## 6 13.3 0.643
## 7 22.5 0.571
## 8 32.1 0.5
## 9 32.4 0.429
## 10 35.7 0.357
## 11 45.4 0.286
## 12 81.8 0.214
## 13 10000 0.214
print(KM_Tibble3)
## # A tibble: 6 × 2
## time survival
## <dbl> <dbl>
## 1 0 1
## 2 13 0.929
## 3 17 0.857
## 4 25.7 0.786
## 5 55.6 0.714
## 6 10000 0.714
#office hours notes
#Yes, so allow log scaling of the x axis in your kmplot function and make sure it includes the last censoring point when it is further out, the linear plot worked fine and is sufficient
#3 potential ways to approach the km plot
#1. brute force set xlim after looking at data, to be large enough
#2. plot a long file first (that goes to 10000) so it is long enough
#3. have your program look through to find the min and max of all your kaplan meier estimates to set xlim
#The issue of log x axis is just to make
adjust_value <- 0.1
KM_Result1$log_time <- log10(KM_Result1$time + adjust_value)
KM_Result2$log_time <- log10(KM_Result2$time + adjust_value)
KM_Result3$log_time <- log10(KM_Result3$time + adjust_value)
min_time <- min(c(KM_Result1$log_time, KM_Result2$log_time, KM_Result3$log_time))
max_time <- max(c(KM_Result1$log_time, KM_Result2$log_time, KM_Result3$log_time))
#to check if order worked
print(KM_Result1)
## $time
## [1] 0.0000 0.0820 0.1050 0.2580 0.3060 0.3180 0.4140 0.4170 0.6420 0.6450
## [11] 1.0914 1.4322 1.4820 1.5720 2.1600
##
## $surv
## [1] 1.00000000 0.92857143 0.85714286 0.78571429 0.71428571 0.64285714
## [7] 0.57142857 0.50000000 0.42857143 0.35714286 0.28571429 0.21428571
## [13] 0.14285714 0.07142857 0.00000000
##
## $log_time
## [1] -1.0000000 -0.7399286 -0.6882461 -0.4461170 -0.3914740 -0.3788237
## [7] -0.2890369 -0.2865095 -0.1295961 -0.1278437 0.0760576 0.1853155
## [13] 0.1992065 0.2232363 0.3541084
#2.Plot (and over plot with additional plots in different colors) KM estimators
qqplot_exclude_10000 <- function(x, y, title, xlab, ylab, color = "blue") {
#quantiles will help us avoid - Error in xy.coords(x, y) : 'x' and 'y' lengths differ
x_quantiles <- quantile(x, probs =
#seq(0,1, length.out = n) generates a sequence of values between 0 and 1
#min length x and max length y to ensure we generate only as much as we need per plot
seq(0, 1,length.out = min(length(x),length(y))), na.rm = TRUE)
#removing y-values of 10000 but maintaining equal quantiles
y_quantiles <- quantile(y[y < 10000], probs = seq(0, 1, length.out = min(length(x), length(y))), na.rm = TRUE)
#QQ-plot generator - establishing order of input
qqplot(x_quantiles, y_quantiles, xlab = xlab, ylab = ylab, main = title, col = color)
#connecting points
lines(sort(x_quantiles), sort(y_quantiles), col = color, lwd = 2)
#indicate presence of values at 10,000 so
if (any(y >= 10000)) {
#highest valid y before 10,000
max_y_below_10000 <- max(y[y < 10000], na.rm = TRUE)
#dashed reference line to inform about point at 10000
abline(h = max_y_below_10000, col = "red", lty = 3, lwd = 3)
text(min(x_quantiles), max_y_below_10000 + 2, labels = "10,000+", col = "red", pos = 4)
}
}
#3.Calculate Q-Q or time transformation plot for survival curves: show for example data UCdat
#generating the three QQ plots
qqplot_exclude_10000(KM_Result1$time, KM_Result2$time, "QQ Plot: Group 120 vs Group 85", "Group 120", "Group 85", color = "cyan")
qqplot_exclude_10000(KM_Result1$time, KM_Result3$time, "QQ Plot: Group 120 vs Group 65", "Group 120", "Group 65", color = "blue")
qqplot_exclude_10000(KM_Result2$time, KM_Result3$time, "QQ Plot: Group 85 vs Group 65", "Group 85", "Group 65", color = "magenta")
#2/7/2025 2:24 PM • log=“xy” #extra extra plot for futher analysis
qqplot_log_xy_combined <- function(x1, y1, x2, y2, x3, y3,
title, xlab, ylab, colors = c("pink", "blue", "magenta")) {
#computing log10 of product: x & y (for X-axis)
log_x1_product <- log10((x1 * y1) + adjust_value)
log_x2_product <- log10((x2 * y2) + adjust_value)
log_x3_product <- log10((x3 * y3) + adjust_value)
#computing log10(y) (for Y-axis)
log_y1 <- log10(y1 + adjust_value)
log_y2 <- log10(y2 + adjust_value)
log_y3 <- log10(y3 + adjust_value)
#computing quantiles to fit
n1 <- min(length(log_x1_product), length(log_y1))
n2 <- min(length(log_x2_product), length(log_y2))
n3 <- min(length(log_x3_product), length(log_y3))
x1_quantiles <- quantile(log_x1_product, probs = seq(0, 1, length.out = n1), na.rm = TRUE)
y1_quantiles <- quantile(log_y1, probs = seq(0, 1, length.out = n1), na.rm = TRUE)
x2_quantiles <- quantile(log_x2_product, probs = seq(0, 1, length.out = n2), na.rm = TRUE)
y2_quantiles <- quantile(log_y2, probs = seq(0, 1, length.out = n2), na.rm = TRUE)
x3_quantiles <- quantile(log_x3_product, probs = seq(0, 1, length.out = n3), na.rm = TRUE)
y3_quantiles <- quantile(log_y3, probs = seq(0, 1, length.out = n3), na.rm = TRUE)
#defining limits for transformed axes
min_x <- floor(min(c(x1_quantiles, x2_quantiles, x3_quantiles), na.rm = TRUE))
max_x <- ceiling(max(c(x1_quantiles, x2_quantiles, x3_quantiles), na.rm = TRUE))
min_y <- floor(min(c(y1_quantiles, y2_quantiles, y3_quantiles), na.rm = TRUE))
max_y <- ceiling(max(c(y1_quantiles, y2_quantiles, y3_quantiles), na.rm = TRUE))
#generating main QQ plot with log scaling
qqplot(x1_quantiles, y1_quantiles, xlab = xlab, ylab = ylab, main = title, col = colors[1], log = "xy", xaxt = "n", yaxt = "n")
lines(sort(x1_quantiles), sort(y1_quantiles), col = colors[1], lwd = 2)
lines(sort(x2_quantiles), sort(y2_quantiles), col = colors[2], lwd = 2)
lines(sort(x3_quantiles), sort(y3_quantiles), col = colors[3], lwd = 2)
#creating log-based ticks for both x & y axes
ticks_x <- 10^seq(min_x, max_x, by = 1)
ticks_y <- 10^seq(min_y, max_y, by = 1)
#formatting x-axis in appropriate notation
axis(1, at = log10(ticks_x), labels = formatC(ticks_x, format = "e", digits = 0))
#formatting y-axis in appropriate notation
axis(2, at = log10(ticks_y), labels = formatC(ticks_y, format = "e", digits = 0), las = 2)
legend("topleft", legend = c("120 vs 85", "120 vs 65", "85 vs 65"), col = colors, lwd = 2)
}
qqplot_log_xy_combined(KM_Result1$time, KM_Result2$time,
KM_Result1$time, KM_Result3$time,
KM_Result2$time, KM_Result3$time,
"Combined QQ Plot: log(x * y) vs log(y)",
"log10(Product of Groups)", "log10(y Values)")
## Warning in x1 * y1: longer object length is not a multiple of shorter object
## length
## Warning in x2 * y2: longer object length is not a multiple of shorter object
## length
## Warning in x3 * y3: longer object length is not a multiple of shorter object
## length
## Warning in xy.coords(x, y, xlabel, ylabel, log): 4 x values <= 0 omitted from
## logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 2 y values <= 0 omitted from
## logarithmic plot