Directory Setup

knitr::opts_chunk$set(echo = T)
knitr::opts_chunk$set(fig.width = 12, fig.height = 8)
knitr::opts_knit$set(root.dir = "C:/Users/Collin/Desktop/Academic/Courses/MATH 629/HW1")

Chapter 1

Problem 1

Given the following survival time data (in weeks) for \(n=15\) subjects,

1+, 2,2,2+,2+,3,3,3,3+,4+,4+,5,5,7,8+
where + denotes right censored data, complete the following table:

\(t_{(f)}\)	\(m_{f}\)	\(q_{f}\)	\(R(t_{(f)})\)
0	?	?	15
2	?	?	?
3	?	?	?
5	?	?	?
7	?	?	?

Fill in the missing entries in the table.

Solution:

To help myself understand the process fully, I decided to stack both tables together, similar to what is completed on page 25 in the book.

require(kableExtra)
id <- 1:15
t_weeks <- c(1,2,2,2,2,3,3,3,3,4,4,5,5,7,8)
d <- c(0, 1,1,0,0,1,1,1,0,0,0,1,1,1,0)
X <- rep(1, 15)
p1 <- data.frame(t_weeks, d, X)
sortedp1 <- p1[order(-p1$d, p1$t_weeks), ]
kbl(cbind(id, sortedp1), row.names = F, col.names = c("#", "t(weeks)", "d", "X(group)")) %>%
  kable_styling(position = "center")

#	t(weeks)	d	X(group)
1	2	1	1
2	2	1	1
3	3	1	1
4	3	1	1
5	3	1	1
6	5	1	1
7	5	1	1
8	7	1	1
9	1	0	1
10	2	0	1
11	2	0	1
12	3	0	1
13	4	0	1
14	4	0	1
15	8	0	1

\(t_{(f)}\)	\(m_{f}\)	\(q_{f}\)	\(R(t_{(f)})\)
0	0	0	15
2	2	3	14
3	3	1	10
5	2	2	4
7	1	1	2

Use the table to estimate the following five probabilities \(P(T > t_{(f)} | T \ge t_{(f)})\) for \(t = 0,2,4,5,7\)

Solution (see the table below for values):

\(\hat{S}(0) = 1\)
\(\hat{S}(2) = \frac{14-2}{14}1 = 0.857\)
\(\hat{S}(4) = \frac{10-3}{10}\frac{12}{14}0.857 = 0.6\)
\(\hat{S}(5) = \frac{4-2}{4}0.6 = 0.3\)
\(\hat{S}(7) = \frac{2-1}{2}0.3 = 0.15\)

require(ggplot2)
require(survival)
require(survminer)
require(ggpubr)
require(kableExtra)
survFunc <- Surv(sortedp1$t_weeks, sortedp1$d==1)
kmfit <- survfit(survFunc ~ 1)
prob2_results <- summary(kmfit, times = c(0,2,4,5,7))
prob2Table <- data.frame(prob2_results$time, prob2_results$surv)
ggplot(data = NULL, aes(x = prob2_results$time, y = prob2_results$surv )) + geom_step() + theme_classic() + ylab("S(t)") + xlab("t")

Visual representation of the calculated probabilities using ggplot().

kbl(prob2Table, row.names = F, col.names = c("Time", "Probability"), caption = " Kaplan-Meier Estimates") %>%
  kable_styling(position = "center")

Kaplan-Meier Estimates
Time	Probability
0	1.0000000
2	0.8571429
4	0.6000000
5	0.3000000
7	0.1500000

Compute the average survival time \((\overline{T})\) and the average hazard rate \((\overline{h})\) .

Solution:

t_bar1 <- sum(sortedp1$t_weeks[which(sortedp1$d==1)])/length(sortedp1$t_weeks) # = 2
h_bar1 <- sum(sortedp1$d[which(sortedp1$d==1)])/sum(sortedp1$t_weeks) # = 0.148

\(\overline{T} = 2\)
\(\overline{h} = 0.148\)

Problem 2

Consider the comparison of the following survival curves for groups \(A\), \(B\), and \(C\).

Which group has the worst survival prognosis before time \(t*\) ?

Solution:
Group A

Which group has the best survival prognosis after time \(t*\) ?

Solution:
Group C

Which group has the longest median survival time?

Solution:
Group C

Problem 3

Survival times (in years) are given for two study groups, each with 25 participants. Group 1 has no history of chronic disease (CHR = 0), and group 2 has a positive history of chronic disease (CHR = 1):

Group 1 (CHR = 0):

11.3+, 6.4, 8.5, 13.7+, 10.1, 10.0, 5.7, 8.2, 2.0, 12.0, 9.9, 13.6+, 8.8, 2.2, 1.6, 10.2, 10.7, 11.1+, 5.3, 2.5, 10.6, 2.5, 5.7, 4.8, 2.7

Group 2 (CHR = 1):

4.8, 2.9, 10.7, 9.3, 9.1, 3.9, 5.0, 1.2, 4.4, 12.0, 2.9, 1.2, 6.9, 1.4, 3.0, 3.9, 3.5, 6.5, 9.9, 3.6, 5.2, 9.8, 6.8, 4.7, 3.9

a) Complete the following survival tables for both Group 1 and Group 2:

	\(t_{(f)}\)	\(m_{f}\)	\(q_{f}\)	\(R(t_{(f)})\)
Group 1:	0.0	0	0	25 persons survive \(\ge\) 0 weeks
	1.6	1	0	25 persons survive \(\ge\) 1.6 weeks
	.	.	.	.
	.	.	.	.
	.	.	.	.

	\(t_{(f)}\)	\(m_{f}\)	\(q_{f}\)	\(R(t_{(f)})\)
Group 2:	0.0	0	0	25 persons survive \(\ge\) 0 weeks
	1.2	2	0	25 persons survive \(\ge\) 1.2 weeks
	.	.	.	.
	.	.	.	.
	.	.	.	.

Problem 3 a.1) Code

require(kableExtra)
require(ggplot2)
require(survival)
require(survminer)
require(ggpubr)
require(kableExtra)
t_weeks3.1 <- c(11.3, 6.4, 8.5, 13.7, 10.1, 10.0, 5.7, 8.2, 2.0, 12.0, 9.9, 13.6,
8.8, 2.2, 1.6, 10.2, 10.7, 11.1, 5.3, 2.5, 10.6, 2.5, 5.7, 4.8, 2.7)
d3.1 <- c(0,1,1,0,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,1,1,1,1,1, 1)
p3.1 <- data.frame(t_weeks3.1, d3.1)
sortedp3.1 <- p3.1[order(-p3.1$d3.1, p3.1$t_weeks3.1), ]
group1 <- rep(0, 25)
survFunc3.1 <- Surv(sortedp3.1$t_weeks3.1, sortedp3.1$d3.1==1)
kmfit3.1 <- survfit(survFunc3.1 ~ 1)
results3.1 <- summary(kmfit3.1)
prob3.1Table <- data.frame(results3.1$time, results3.1$surv)
ggplot(data = NULL, aes(results3.1$time, results3.1$surv)) + geom_step() + theme_classic() + ylab("S(t)") + xlab("t")

kbl(prob3.1Table, row.names = F, col.names = c("Time", "Probability"), caption = " Kaplan-Meier Estimates") %>%
  kable_styling(position = "center")

Kaplan-Meier Estimates
Time	Probability
1.6	0.9600000
2.0	0.9200000
2.2	0.8800000
2.5	0.8000000
2.7	0.7600000
4.8	0.7200000
5.3	0.6800000
5.7	0.6000000
6.4	0.5600000
8.2	0.5200000
8.5	0.4800000
8.8	0.4400000
9.9	0.4000000
10.0	0.3600000
10.1	0.3200000
10.2	0.2800000
10.6	0.2400000
10.7	0.2000000
12.0	0.1333333

Using the table above, the solution for Problem 3 a.1) is :

	\(t_{(f)}\)	\(m_{f}\)	\(q_{f}\)	\(R(t_{(f)})\)
Group 1:	0.0	0	0	25 persons survive \(\ge\) 0 weeks
	1.6	1	0	25 persons survive \(\ge\) 1.6 weeks
	2.0	1	0	24 persons survive \(\ge\) 2.0 weeks
	2.2	1	0	23 persons survive \(\ge\) 2.2 weeks
	2.5	1	0	22 persons survive \(\ge\) 2.5 weeks
	2.7	2	0	20 persons survive \(\ge\) 2.7 weeks
	4.8	1	0	19 persons survive \(\ge\) 4.8 weeks
	5.3	1	0	18 persons survive \(\ge\) 5.3 weeks
	5.7	1	0	17 persons survive \(\ge\) 5.7 weeks
	6.4	2	0	15 persons survive \(\ge\) 6.4 weeks
	8.2	1	0	14 persons survive \(\ge\) 8.2 weeks
	8.5	1	0	13 persons survive \(\ge\) 8.5 weeks
	8.8	1	0	12 persons survive \(\ge\) 8.8 weeks
	9.9	1	0	11 persons survive \(\ge\) 9.9 weeks
	10.0	1	0	10 persons survive \(\ge\) 10.0 weeks
	10.1	1	0	9 persons survive \(\ge\) 10.1 weeks
	10.2	1	0	8 persons survive \(\ge\) 10.2 weeks
	10.6	1	0	7 persons survive \(\ge\) 10.6 weeks
	10.7	1	2	6 persons survive \(\ge\) 10.7 weeks
	12.0	1	1	3 persons survive \(\ge\) 12.0 weeks

Problem 3 a.2) Code

require(kableExtra)
require(ggplot2)
require(survival)
require(survminer)
require(ggpubr)
require(kableExtra)
t_weeks3.2 <- c(4.8, 2.9, 10.7, 9.3, 9.1, 3.9, 5.0, 1.2, 4.4, 12.0, 2.9, 1.2, 6.9, 1.4,
3.0, 3.9, 3.5, 6.5, 9.9, 3.6, 5.2, 9.8, 6.8, 4.7, 3.9)
d3.2 <- rep(1, 25)
p3.2 <- data.frame(t_weeks3.2, d3.2)
group2 <- rep(1, 25)
sortedp3.2 <- p3.2[order(-p3.2$d3.2, p3.2$t_weeks3.2), ]

survFunc3.2 <- Surv(sortedp3.2$t_weeks3.2, sortedp3.2$d3.2==1)
kmfit3.2 <- survfit(survFunc3.2 ~ 1)
results3.2 <- summary(kmfit3.2)
prob3.2Table <- data.frame(results3.2$time, results3.2$surv)
ggplot(data = NULL, aes(results3.2$time, results3.2$surv)) + geom_step() + theme_classic() + ylab("S(t)") + xlab("t")

kbl(prob3.2Table, row.names = F, col.names = c("Time", "Probability"), caption = " Kaplan-Meier Estimates") %>%
  kable_styling(position = "center")

Kaplan-Meier Estimates
Time	Probability
1.2	0.92
1.4	0.88
2.9	0.80
3.0	0.76
3.5	0.72
3.6	0.68
3.9	0.56
4.4	0.52
4.7	0.48
4.8	0.44
5.0	0.40
5.2	0.36
6.5	0.32
6.8	0.28
6.9	0.24
9.1	0.20
9.3	0.16
9.8	0.12
9.9	0.08
10.7	0.04
12.0	0.00

Using the table above, the solution for Problem 3 a.2) is :

	\(t_{(f)}\)	\(m_{f}\)	\(R(t_{(f)})\)
Group 1:	0.0	0	25 persons survive \(\ge\) 0 weeks
	1.2	2	25 persons survive \(\ge\) 1.2 weeks
	1.4	1	23 persons survive \(\ge\) 1.4 weeks
	2.9	2	22 persons survive \(\ge\) 2.9weeks
	3.0	1	20 persons survive \(\ge\) 3.0 weeks
	3.5	1	19 persons survive \(\ge\) 3.5 weeks
	3.6	1	18 persons survive \(\ge\) 3.6 weeks
	3.9	3	17 persons survive \(\ge\) 3.9 weeks
	4.4	1	14 persons survive \(\ge\) 4.4 weeks
	4.7	1	13 persons survive \(\ge\) 4.7 weeks
	4.8	1	12 persons survive \(\ge\) 4.8 weeks
	5.0	1	11 persons survive \(\ge\) 5.0 weeks
	5.2	1	10 persons survive \(\ge\) 5.2 weeks
	6.5	1	9 persons survive \(\ge\) 6.5 weeks
	6.8	1	8 persons survive \(\ge\) 6.8 weeks
	6.9	1	7 persons survive \(\ge\) 6.9 weeks
	9.1	1	6 persons survive \(\ge\) 9.1 weeks
	9.3	1	5 persons survive \(\ge\) 9.3 weeks
	9.8	1	4 persons survive \(\ge\) 9.8 weeks
	9.9	1	3 persons survive \(\ge\) 9.9 weeks
	10.7	1	2 persons survive \(\ge\) 10.7 weeks

b) Compute the average survival time (\(\overline{T}\)) and the average hazard rate (\(\overline{h}\)) for each group.

Group 1:

t_bar3.1 <- sum(sortedp3.1$t_weeks3.1[which(sortedp3.1$d3.1==1)])/length(sortedp3.1$t_weeks3.1) # = 5.616
h_bar3.1 <- sum(sortedp3.1$d3.1[which(sortedp3.1$d3.1==1)])/sum(sortedp3.1$t_weeks3.1) # = 0.1105

\(\overline{T}_{G1} = 5.616\)
\(\overline{h}_{G1} = 0.1105\)

Group 2:

t_bar3.2 <- sum(sortedp3.2$t_weeks3.2[which(sortedp3.2$d3.2==1)])/length(sortedp3.2$t_weeks3.2) # = 5.616
h_bar3.2 <- sum(sortedp3.2$d3.2[which(sortedp3.2$d3.2==1)])/sum(sortedp3.2$t_weeks3.2) # = 0.1105

\(\overline{T}_{G2} = 5.46\)
\(\overline{h}_{G2} = 0.1832\)

c) Based on your answer in b), briefly explain which group has better overall survival prospects. What might be the problem with your argument?

To better illustrate this issue, I will plot the survival functions by group. A complete dataset with both groups will need to first be constructed.

require(survival)
require(survminer)
require(dplyr)
temp1 <- cbind(sortedp3.1, group1)

temp2 <- cbind(sortedp3.2, group2)

colnames(temp1) <- colnames(temp2)
p3combined <- rbind(temp1, temp2)
cleanedp3comb <- rename(p3combined, t = t_weeks3.2, d = d3.2, Group = group2)


p3.2fit <- survfit(Surv(t, d) ~ Group, data = cleanedp3comb, conf.type = "log-log")
ggsurvplot(p3.2fit, data = cleanedp3comb, risk.table = F,
           pval = T,
           conf.int = T, ncensor.plot = T,
           ncensor.plot.height = 0.25)

Solution

Based on the average survival rates, Group 1 has better overall survival prospects (5.616 > 5.46). However, there are four cases of censoring in group 1, whereas group 2 has no censoring. However, after observing the KM curves for both groups with confidence intervals included, it is clear that there is a lot of overlap. The conclusion cannot be determined unless further analyses are completed.

Problem 4

Give your own example of each of the following types of censoring:

Right Censoring:

Right censoring occurs when an individual or subject leaves before an event occurs.

Left Censoring:

Left censoring occurs when the event has already occurred prior to the individual enrolling in the study.

Interval Censoring:

Interval censoring occurs when the event is known to occur within an interval, rather than observing the event.

Problem 5

Identify the issue presented in each of the following studies as confounding or interaction. Then discuss how each of these issues might be dealt with in a statistical model.

A statistician working for ACME corporation collects data from male and female employees to see if there is a differecne in the time it takes for new male and female employees to pass ACME Corp’s programming certification. ACME Corp tends to hire male employees from technical colleges and female employees from liberal arts programs where students get at most one year of programming experience.

Solution:

Confounding
One way to adjust for confounding effects is to stratify the data by the two groups of people to see if survival times change based on this grouping.

A statistician working for ACME corporation collects data from male and female employees to see if there is a difference in the time it takes for male and female employees to pass ACME Corp’s English Proficiency Exam (EPE). Older male employees tend to take more time to pass the EPE than female employees and younger male employees tend to take less time to pass the EPE than female employees.

Solution:

Interaction
A cox-proportional hazards model can be used to describe how survival times differ by age and sex.

Present your own examples of confounding and interaction.

Confounding Example

In a study involving discovering the time it takes for athletes to develop a major injury based on the number of hours the athletes complete intense training per week could have confounding variables. The confounding effects could occur because there is also the type of training factor, which certainly has an effect on the outcome and the exposure (intense training).

Interaction Example

In the same example above, interaction could also exist if factors like age, weight, and height are included in the model.

Problem 6

A group of 2000 patients are given different types of heart stents from Advanced Bionics and Juno Devices. A study is conducted concerning time, \(T\), until major heart failure or death. Their survival information collected is given below:

Based on the information above, and assuming independent censoring, summarize the 10 and 20 year survival experiences of the two groups.

Solution:

Advanced Bionics

10-year survival \(= \frac{520}{600} = 0.8\overline{66}\)
20-year survival \(= 1 - \frac{80 + 120 + \frac{120}{400}280}{600} = 0.52\overline{6}\)

Juno Devices

10-year survival \(= \frac{300}{400} = 0.75\)
20-year survival\(= 1 - \frac{100 + 50 + \frac{50}{250}150}{400} = 0.55\)

Problem 7

Given the pdfs below find the survival \((S(t))\) and the hazard function \((h(t))\). Use the fact that:

\(S(t) = P(T > t) = \int_{t}^{\infty}f(u)du\)

Use integration by parts (IBP)

After you find the hazard function, identify a type of survival population that might have such a hazard.

Solution for (a)

Steps:

\(S(t) = P(T > t) = \int_{t}^{\infty}f(u)du = \int_{t}^{\infty}\frac{3}{2}(\frac{t}{2})^{2}e^{{-(\frac{t}{2}})^{3}}du\)
use u-substitution
\(u = -\frac{t^{3}}{8}\)
\(dt = -\frac{8}{3t^{2}}du\)
fractions cancel, and when using u in the integration, the integral becomes \(\int e^{u}du\)
\(\int e^{u}du = e^{u}\)
plug \(u = -\frac{t^{3}}{8}\) into \(e^{u}\)
\(S(t) = -e^{\frac{t^{3}}{8}}\) <—- solution

Recall:

\(h(t) = \frac{f(t)}{s(t)}\)
using the values we have:
\(h(t) = \frac{\frac{3}{2}(\frac{t}{2})^{2}e^{{-(\frac{t}{2}})^{3}}}{-e^{\frac{t^{3}}{8}}}\)
\(h(t) = -\frac{3t^{2}}{8}\) <—- solution

Solution for (b)

\(S(t) = P(T > t) = \int_{t}^{\infty}f(u)du = \int_{t}^{\infty}\frac{1}{9}te^{-\frac{t}{3}}\)
use IBP
Recall:
\(\int udv = uv - \int duv\)
In this case
\(u = t\)
\(du = dt\)
\(dv = e^{-\frac{t}{3}}\)
after integrating \(dv\)
\(v = -3e^{-\frac{t}{3}}\)
the integral becomes
\(S(t) = \frac{1}{9}[-te^{-\frac{t}{3}} + 3\int e^{-\frac{t}{3}}dt]\)
\(S(t) = e^{-\frac{t}{3}}(1 - \frac{t}{9})\) <—- solution
\(h(t) = \frac{f(t)}{s(t)}\)
\(h(t) = \frac{\frac{1}{9}te^{-\frac{t}{3}}}{e^{-\frac{t}{3}}(1 - \frac{t}{9})}\)
which simplifies to:
\(h(t) = \frac{t}{9-t}\) <—- solution

solution for (c)

\(S(t) = P(T > t) = \int_{t}^{\infty}f(u)du = \int_{t}^{\infty}\frac{18}{(t+3)^3}\)
use u-substitution
\(u = t+3\) and \(du = dt\)
the integral becomes:
\(S(t) = 18\int \frac{1}{u^{3}}du\)
\(S(t) = 18( \frac{1}{-2u^{2}})\)
\(S(t) = -\frac{9}{u^{2}}\)
after plugging in the value for u
\(S(t) = -\frac{9}{(t+3)^2}\) <—- solution
\(h(t) = \frac{f(t)}{s(t)}\)
\(h(t) = \frac{\frac{18}{(t+3)^3}}{-\frac{9}{(t+3)^2}}\)
\(h(t) = -\frac{2}{t+3}\) <—- solution

Chapter 2

Problem 1

Multiple myeloma is a malignant disease characterised by the accumulation of abnormal plasma cells, a type of white blood cell, in the bone marrow. Unless treated, the condition is invariably fatal. The aim of a study carried out at the Medical Center of the University of West Virginia, USA, was to examine the association between the values of certain explanatory variables or covariates and the survival time of patients. In the study, the primary response variable was the time, in months, from diagnosis until death from multiple myeloma.

Load the data by using the following command in R:

multmyl <- read.delim("multmylenoma.txt", header = TRUE)

Solution:

multmyl <- read.csv("multmylenoma.csv", header = T)


# I found a case where OW is actually Ow so I will fix that 

multmyl$WS <- replace(multmyl$WS, multmyl$WS == "Ow", "OW")

multmyl

##    Survt Status Clinic WS
## 1      1      1      1 OW
## 2      1      1      2 OW
## 3      1      1      1 RW
## 4      3      0      2 OW
## 5      4      1      1 OW
## 6      4      1      2 RW
## 7      5      1      1 OW
## 8      5      1      1 OW
## 9      5      1      2 OW
## 10     5      1      2 OW
## 11     6      1      1 RW
## 12     6      1      2 RW
## 13     7      0      1 RW
## 14     8      1      2 OW
## 15    10      1      1 OW
## 16    10      0      2 OW
## 17    10      1      1 OW
## 18    10      1      1 OW
## 19    10      1      1 RW
## 20    11      0      2 RW
## 21    12      0      1 OW
## 22    12      1      1 OW
## 23    13      1      2 OW
## 24    14      1      2 RW
## 25    15      0      1 RW
## 26    15      1      1 OW
## 27    16      1      1 OW
## 28    16      1      2 OW
## 29    17      1      1 RW
## 30    18      1      1 RW
## 31    18      0      1 RW
## 32    18      0      2 OW
## 33    18      1      1 RW
## 34    23      1      1 RW
## 35    23      1      1 RW
## 36    36      1      2 RW
## 37    40      1      2 OW
## 38    40      0      1 RW
## 39    40      1      2 RW
## 40    50      1      2 OW
## 41    52      0      2 RW
## 42    55      1      1 OW
## 43    55      0      2 OW
## 44    66      1      2 RW
## 45    66      1      1 RW
## 46    88      0      2 RW
## 47    88      0      2 RW
## 48    88      0      2 RW

Patients are placed in two types of treatment facilities: the myeloma clinic at the City Hospital (coded as Clinic=1) and the private research clinic at the University(coded as Clinic=2). We want to estimate survival experiences of patients in the two clinics from this data set. Derive the Kaplan-Meirer estimates for Clinic 1 and Clinic 2 by filling in the table below.

Soluton:

Note: I rounded to just two decimal points when calculating by hand, so the values slowly lose accuracy compared to the computer-generated results.

		Clinic 1				Clinic 2
\(t_{(f)}\)	\(n_f\)	\(m_F\)	\(q_f\)	\(\hat{S}(t_{(f)})\)	\(t_{(f)}\)	\(n_f\)	\(m_F\)	\(q_f\)	\(\hat{S}(t_{(f)})\)
0.0	25	0	0	1.00	0.0	23	0	0	1.00
1	25	2	0	\(1x\frac{23}{25} = 0.92\)	1	23	1	0	\(1x\frac{22}{23} = 0.96\)
4	23	1	0	\(0.92x\frac{22}{23} = 0.88\)	4	22	1	1	\(0.96x\frac{21}{22} = 0.91\)
5	22	2	0	\(0.88x\frac{20}{22} = 0.80\)	5	20	2	0	\(0.91x\frac{18}{20} = 0.82\)
6	20	1	0	\(0.79x\frac{19}{20} = 0.76\)	6	18	1	0	\(0.82x\frac{17}{18} = 0.78\)
10	19	4	1	\(0.76x\frac{15}{19} = 0.6\)	8	17	1	0	\(0.78x\frac{16}{17} = 0.73\)
12	14	1	1	\(0.6x\frac{13}{14} = 0.56\)	13	16	1	2	\(0.73x\frac{15}{16} = 0.68\)
15	12	1	1	\(0.56x\frac{11}{12} = 0.51\)	14	13	1	0	\(0.68x\frac{12}{13} = 0.63\)
16	10	1	0	\(0.51x\frac{9}{10} = 0.46\)	16	12	1	0	\(0.63x\frac{11}{12} = 0.58\)
17	9	1	0	\(0.46x\frac{8}{9} = 0.41\)	36	10	1	1	\(0.58x\frac{9}{10} = 0.52\)
18	8	2	1	\(0.41x\frac{6}{8} = 0.31\)	40	9	2	0	\(0.52x\frac{7}{9} = 0. 41\)
23	5	2	0	\(0.31x\frac{3}{5} = 0.18\)	50	7	1	0	\(0.41x\frac{6}{7} = 0.35\)
55	2	1	1	\(0.18x\frac{1}{2} = 0.10\)	66	4	1	2	\(0.35x\frac{3}{4} = 0.26\)
66	1	1	0	\(0.10x\frac{0}{1} = 0\)	88	3	0	2	\(0.29x\frac{0}{3} = 0\)

Use R to confirm and then plot your Kaplan-Meirer curves for Clinnic 1 and for Clinic 2. Add a key in the plot.

Solution:

require(survival)
require(survminer)
fit2.c <- survfit(Surv(Survt, Status) ~ Clinic, data = multmyl, conf.int = 0.95, conf.type = "plain")
summary(fit2.c)

## Call: survfit(formula = Surv(Survt, Status) ~ Clinic, data = multmyl, 
##     conf.int = 0.95, conf.type = "plain")
## 
##                 Clinic=1 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     25       2   0.9200  0.0543      0.81366        1.000
##     4     23       1   0.8800  0.0650      0.75262        1.000
##     5     22       2   0.8000  0.0800      0.64320        0.957
##     6     20       1   0.7600  0.0854      0.59259        0.927
##    10     18       4   0.5911  0.0998      0.39551        0.787
##    12     14       1   0.5489  0.1012      0.35052        0.747
##    15     12       1   0.5031  0.1026      0.30207        0.704
##    16     10       1   0.4528  0.1039      0.24911        0.657
##    17      9       1   0.4025  0.1039      0.19896        0.606
##    18      8       2   0.3019  0.0993      0.10722        0.497
##    23      5       2   0.1811  0.0890      0.00664        0.356
##    55      2       1   0.0906  0.0780      0.00000        0.243
##    66      1       1   0.0000     NaN          NaN          NaN
## 
##                 Clinic=2 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##     1     23       1    0.957  0.0425       0.8732        1.000
##     4     21       1    0.911  0.0601       0.7931        1.000
##     5     20       2    0.820  0.0816       0.6599        0.980
##     6     18       1    0.774  0.0889       0.6001        0.949
##     8     17       1    0.729  0.0946       0.5433        0.914
##    13     14       1    0.677  0.1012       0.4784        0.875
##    14     13       1    0.625  0.1059       0.4170        0.832
##    16     12       1    0.573  0.1092       0.3587        0.787
##    36     10       1    0.515  0.1123       0.2953        0.735
##    40      9       2    0.401  0.1128       0.1797        0.622
##    50      7       1    0.344  0.1103       0.1275        0.560
##    66      4       1    0.258  0.1112       0.0397        0.476

ggsurvplot(fit2.c, data = multmyl, risk.table = F, pval = T, conf.int = F, ncensor.plot = T, ncensor.plot.height = 0.25)

Conduct a log-rank test for survival difference between clinics by first filling in the table below.

I will start by creating two tables that represent the event times and number in the risk set at each time for clinic 1 and clinic 2. I did not want to write down each step by just to then formulate a new table.

require(dplyr)
require(data.table)
# To make things easier to fill in the table, I will partition the previous dataset into two datasets; one for each clinic

clinic1 <- filter(multmyl, Clinic == 1)
clinic2 <- filter(multmyl, Clinic == 2)

fit2.c.1 <- survfit(Surv(clinic1$Survt, clinic1$Status==1) ~ 1, conf.int = 0.95, conf.type = "plain")
fit2.c.2 <- survfit(Surv(clinic2$Survt, clinic2$Status==1) ~ 1, conf.int = 0.95, conf.type = "plain")

nriskdf <- data.frame("time" = fit2.c$time,  "nrisk" = fit2.c$n.risk, "nevent" = fit2.c$n.event, "Clinic" = c(rep(1, length(fit2.c.1$n.risk)), rep(2, length(fit2.c.2$n.risk))))



# Re-filter to see both all info for each clinic

table1 <- filter(nriskdf, Clinic == 1) %>% select(-Clinic)
table2 <- filter(nriskdf, Clinic == 2) %>% select(-Clinic)

combTable <- knitr::kable(list(table1, table2))
add_header_above(combTable, c("Clinic 1", "Clinic 2"))

Clinic 1

Clinic 2

time	nrisk	nevent
1	25	2
4	23	1
5	22	2
6	20	1
7	19	0
10	18	4
12	14	1
15	12	1
16	10	1
17	9	1
18	8	2
23	5	2
40	3	0
55	2	1
66	1	1

time	nrisk	nevent
1	23	1
3	22	0
4	21	1
5	20	2
6	18	1
8	17	1
10	16	0
11	15	0
13	14	1
14	13	1
16	12	1
18	11	0
36	10	1
40	9	2
50	7	1
52	6	0
55	5	0
66	4	1
88	3	0

Solution:

require(dplyr)
require(zoo)
require(kableExtra)

temp2 <- data.frame(merge(table1, table2, by = "time", all = T))

# Remove NA's

temp2$nevent.x[is.na(temp2$nevent.x)] = 0

temp2$nevent.y[is.na(temp2$nevent.y)] = 0

# The function below is from the "zoo" package

# It replaces NA's with the previous non-NA value in a column or row

temp2$nrisk.x <- na.locf(temp2$nrisk.x)
temp2$nrisk.y <- na.locf(temp2$nrisk.y)



# Reorder the columns to fit the correct table format

temp2 <- temp2[c("time", "nevent.x", "nevent.y", "nrisk.x", "nrisk.y")]

# Calculate the expected cell counts

e1f <- ((temp2$nrisk.x)/(temp2$nrisk.x + temp2$nrisk.y))*(temp2$nevent.x + temp2$nevent.y)
e2f <- ((temp2$nrisk.y)/(temp2$nrisk.x + temp2$nrisk.y))*(temp2$nevent.x + temp2$nevent.y)

m1f_e1f <- temp2$nevent.x - e1f
m2f_e2f <- temp2$nevent.y - e2f

temp3 <- data.frame(e1f, e2f, m1f_e1f, m2f_e2f)

p2.dfinal <- cbind(temp2, temp3)

p2.cleaned <- rbind(p2.dfinal, colSums(p2.dfinal))


# Present in table form

kbl(p2.cleaned, booktabs = T) %>%
  pack_rows("Totals", 25, 25)

time	nevent.x	nevent.y	nrisk.x	nrisk.y	e1f	e2f	m1f_e1f	m2f_e2f
1	2	1	25	23	1.5625000	1.4375000	0.4375000	-0.4375000
3	0	0	25	22	0.0000000	0.0000000	0.0000000	0.0000000
4	1	1	23	21	1.0454545	0.9545455	-0.0454545	0.0454545
5	2	2	22	20	2.0952381	1.9047619	-0.0952381	0.0952381
6	1	1	20	18	1.0526316	0.9473684	-0.0526316	0.0526316
7	0	0	19	18	0.0000000	0.0000000	0.0000000	0.0000000
8	0	1	19	17	0.5277778	0.4722222	-0.5277778	0.5277778
10	4	0	18	16	2.1176471	1.8823529	1.8823529	-1.8823529
11	0	0	18	15	0.0000000	0.0000000	0.0000000	0.0000000
12	1	0	14	15	0.4827586	0.5172414	0.5172414	-0.5172414
13	0	1	14	14	0.5000000	0.5000000	-0.5000000	0.5000000
14	0	1	14	13	0.5185185	0.4814815	-0.5185185	0.5185185
15	1	0	12	13	0.4800000	0.5200000	0.5200000	-0.5200000
16	1	1	10	12	0.9090909	1.0909091	0.0909091	-0.0909091
17	1	0	9	12	0.4285714	0.5714286	0.5714286	-0.5714286
18	2	0	8	11	0.8421053	1.1578947	1.1578947	-1.1578947
23	2	0	5	11	0.6250000	1.3750000	1.3750000	-1.3750000
36	0	1	5	10	0.3333333	0.6666667	-0.3333333	0.3333333
40	0	2	3	9	0.5000000	1.5000000	-0.5000000	0.5000000
50	0	1	3	7	0.3000000	0.7000000	-0.3000000	0.3000000
52	0	0	3	6	0.0000000	0.0000000	0.0000000	0.0000000
55	1	0	2	5	0.2857143	0.7142857	0.7142857	-0.7142857
66	1	1	1	4	0.4000000	1.6000000	0.6000000	-0.6000000
88	0	0	1	3	0.0000000	0.0000000	0.0000000	0.0000000
Totals
570	20	14	293	315	15.0063414	18.9936586	4.9936586	-4.9936586

State your null hypothesis, \(H_0\), compute your (approximate) test statistic, LRS, and find it’s p-value. Conduct your significance test at at \(\alpha = 0.10\) . State your conclusion.

Solution:

\(H_0\) : There is no difference between the two survival curves.
\(LRS = \chi^2 \approx \sum_i^2\frac{(0_i-E_i)^2}{Ei}\)
Using our values about, this becomes:
\(\chi^2 = \frac{(4.999)^2}{15.006} + \frac{(-4.999)^2}{18.994} = 2.981\)
Using a Chi-square probabilities table, it is revealed that \(p.val \approx 0.08\). Therefore, the null hypothesis is rejected, suggesting there is sufficient evidence indicating there is a difference between the two survival curves.

Vertify your conclusion by conducting your test in R. Give the p-values for both the approximate statistic given in R.

Solution:

compSurv <- survdiff(Surv(Survt, Status) ~ Clinic, data = multmyl)
compSurv

## Call:
## survdiff(formula = Surv(Survt, Status) ~ Clinic, data = multmyl)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## Clinic=1 25       20     14.8      1.80      3.56
## Clinic=2 23       14     19.2      1.39      3.56
## 
##  Chisq= 3.6  on 1 degrees of freedom, p= 0.06

The approximate p-value is 0.08, and the exact p-value is 0.06. The approximate statistic is 2.981, and the exact statistic is 3.56.

Compute the Flemington-Harrington Test with \(q = 0\) and \(p = 0.4, 1, 2\) . According to the p-values returned by R, for for which of these values is the test still significant?

Solution:

pweights <- c(0.4, 1, 2)

for (p in pweights) {
  print(paste("For a rho value of: ", p, "The results are: "))
  print(survdiff(Surv(Survt, Status) ~ Clinic, data = multmyl, rho = p))
}

## [1] "For a rho value of:  0.4 The results are: "
## Call:
## survdiff(formula = Surv(Survt, Status) ~ Clinic, data = multmyl, 
##     rho = p)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## Clinic=1 25     16.6     12.7      1.23       2.9
## Clinic=2 23     11.3     15.2      1.02       2.9
## 
##  Chisq= 2.9  on 1 degrees of freedom, p= 0.09 
## [1] "For a rho value of:  1 The results are: "
## Call:
## survdiff(formula = Surv(Survt, Status) ~ Clinic, data = multmyl, 
##     rho = p)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## Clinic=1 25    13.14     10.4     0.741      2.05
## Clinic=2 23     8.69     11.5     0.671      2.05
## 
##  Chisq= 2.1  on 1 degrees of freedom, p= 0.2 
## [1] "For a rho value of:  2 The results are: "
## Call:
## survdiff(formula = Surv(Survt, Status) ~ Clinic, data = multmyl, 
##     rho = p)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## Clinic=1 25     9.67     7.95     0.368      1.16
## Clinic=2 23     6.33     8.04     0.364      1.16
## 
##  Chisq= 1.2  on 1 degrees of freedom, p= 0.3

For \(p = 0.4\) , the null hypothesis is still rejected and the test-statistic is significant.

Researchers believe that the spread of this type of cancer is facilitated by fat cells and thus affected by the weight status of the subjects. In the data, subjects are classified as underweight or regular weight (WS=RW) or overweight (WS=OW) according the their BMI scores. Use R to conduct a stratified log-rank test with respect to Clinic, stratifying on weight status (WS).

Solution:

stratified <- survdiff(Surv(Survt, Status) ~ Clinic + strata(WS), data = multmyl)
stratified

## Call:
## survdiff(formula = Surv(Survt, Status) ~ Clinic + strata(WS), 
##     data = multmyl)
## 
##           N Observed Expected (O-E)^2/E (O-E)^2/V
## Clinic=1 25       20     14.8      1.84       3.7
## Clinic=2 23       14     19.2      1.42       3.7
## 
##  Chisq= 3.7  on 1 degrees of freedom, p= 0.05

Use R to plot a Kaplan-Meirer curve and conduct a log-rank test for weight status using significance level of \(\alpha = 0.10\)

Solution:

require(survival)
require(survminer)

fit2.i <- survfit(Surv(Survt, Status) ~ WS, data = multmyl)
ggsurvplot(fit2.i, data = multmyl, risk.table = F, pval = T, conf.int = F, ncensor.plot = T, ncensor.plot.height = 0.25)

survdiff(Surv(Survt, Status) ~ WS, data = multmyl)

## Call:
## survdiff(formula = Surv(Survt, Status) ~ WS, data = multmyl)
## 
##        N Observed Expected (O-E)^2/E (O-E)^2/V
## WS=OW 24       19     12.4      3.57      6.28
## WS=RW 24       15     21.6      2.04      6.28
## 
##  Chisq= 6.3  on 1 degrees of freedom, p= 0.01

Comparing the results on your unstratified log-rank test in (e) to your stratified log-rank test in (h), considering your results in (i), and inspecting the data, what conclusions can you reach about the confounding effect of weight status?

Solution:

The estimate improved when stratified by weight status. This indicates the weight of the patients influences the survival function.

Use R to find 95% confidence intervals about the KM estimates for both Clinic 1 and for Clinic 2. Use “plain” confidence intervals (i.e confidence intervals using Greenwood’s formula). Plot the KM curves for both clinics with the confidence intervals. What conclusion can you reach about the differences of the effects of Clinic type on survival time throughout the study?

Solution:

ggsurvplot(fit2.c, data = multmyl, risk.table = F, pval = T, conf.int = T, ncensor.plot = T, ncensor.plot.height = 0.25, conf.int.style = "step", palette = c("#E7B800", "#2E9FDF"), surv.median.line = "hv")

It is clear that individuals at clinic 2 have much longer survival times. This is shown by observing the median survival times for each KM curve (black-dashed line).

Find the 95% median survival probabilities for Clinic 1 and for Clinic 2 in R. Verify your answer for Clinic 1 by using the formula at the end of Chapter 2 and the estimates and their standard errors for survival probabilities given in R. According to your confidence intervals is there a statistically significant difference (at the \(\alpha = 0.05\) level) between the estimated median survival times for Clinic 1 and Clinic 2?

Solution:

fit2.l <- survfit(Surv(Survt, Status) ~ Clinic, data = multmyl, conf.int = 0.95)

fit2.l.1 <- survfit(Surv(Survt, Status) ~ Clinic, data = clinic1, conf.int = 0.95)
fit2.l.2 <- survfit(Surv(Survt, Status) ~ Clinic, data = clinic2, conf.int = 0.95)

# For Clinic 1

c1l<- 0.5 - 1.96*fit2.l.1$std.err < fit2.l.1$surv
c1u <- 0.5 + 1.96*fit2.l.1$std.err > fit2.l.1$surv

present1 <- data.frame(fit2.c.1$time, c1u, c1u)

# For Clinic 2

c2l <- 0.5 - 1.96*fit2.l.2$std.err < fit2.l.2$surv
c2u <- 0.5 + 1.96*fit2.l.2$std.err > fit2.l.2$surv

present2 <- data.frame(fit2.l.2$time, c2l, c2u)

present1

##    fit2.c.1.time   c1u c1u.1
## 1              1 FALSE FALSE
## 2              4 FALSE FALSE
## 3              5 FALSE FALSE
## 4              6 FALSE FALSE
## 5              7 FALSE FALSE
## 6             10  TRUE  TRUE
## 7             12  TRUE  TRUE
## 8             15  TRUE  TRUE
## 9             16  TRUE  TRUE
## 10            17  TRUE  TRUE
## 11            18  TRUE  TRUE
## 12            23  TRUE  TRUE
## 13            40  TRUE  TRUE
## 14            55  TRUE  TRUE
## 15            66  TRUE  TRUE

present2

##    fit2.l.2.time  c2l   c2u
## 1              1 TRUE FALSE
## 2              3 TRUE FALSE
## 3              4 TRUE FALSE
## 4              5 TRUE FALSE
## 5              6 TRUE FALSE
## 6              8 TRUE  TRUE
## 7             10 TRUE  TRUE
## 8             11 TRUE  TRUE
## 9             13 TRUE  TRUE
## 10            14 TRUE  TRUE
## 11            16 TRUE  TRUE
## 12            18 TRUE  TRUE
## 13            36 TRUE  TRUE
## 14            40 TRUE  TRUE
## 15            50 TRUE  TRUE
## 16            52 TRUE  TRUE
## 17            55 TRUE  TRUE
## 18            66 TRUE  TRUE
## 19            88 TRUE  TRUE

# Compare to the computed values

fit2.l

## Call: survfit(formula = Surv(Survt, Status) ~ Clinic, data = multmyl, 
##     conf.int = 0.95)
## 
##           n events median 0.95LCL 0.95UCL
## Clinic=1 25     20     16      10      23
## Clinic=2 23     14     40      14      NA

MATH 629 HW 1

Collin Hoskins

2021-09-16

Directory Setup

Chapter 1

Problem 1

Problem 2

Problem 3

Problem 3 a.1) Code

Problem 3 a.2) Code

Problem 4

Problem 5

Problem 6

Problem 7

Chapter 2

Problem 1

#	t(weeks)	d	X(group)
1	2	1	1
2	2	1	1
3	3	1	1
4	3	1	1
5	3	1	1
6	5	1	1
7	5	1	1
8	7	1	1
9	1	0	1
10	2	0	1
11	2	0	1
12	3	0	1
13	4	0	1
14	4	0	1
15	8	0	1

#	t(weeks)	d	X(group)
1	2	1	1
2	2	1	1
3	3	1	1
4	3	1	1
5	3	1	1
6	5	1	1
7	5	1	1
8	7	1	1
9	1	0	1
10	2	0	1
11	2	0	1
12	3	0	1
13	4	0	1
14	4	0	1
15	8	0	1

#	t(weeks)	d	X(group)
1	2	1	1
2	2	1	1
3	3	1	1
4	3	1	1
5	3	1	1
6	5	1	1
7	5	1	1
8	7	1	1
9	1	0	1
10	2	0	1
11	2	0	1
12	3	0	1
13	4	0	1
14	4	0	1
15	8	0	1