outcomes <- read_csv("data/outcomes99/outcomes99.csv")
## Parsed with column specification:
## cols(
## .default = col_integer(),
## V99ERKDATE = col_date(format = ""),
## V99ELKDATE = col_date(format = ""),
## V99ERHDATE = col_date(format = ""),
## V99ELHDATE = col_date(format = ""),
## V99EDDDATE = col_date(format = "")
## )
## See spec(...) for full column specifications.
dim(outcomes)
## [1] 4796 92
summary(outcomes$V99RNTCNT)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 10.000 11.000 9.817 11.000 11.000 146
for some patients there is no information for the most recent OAI contact we will delete these rows
Select variables of interest from the outcomes table for knee replacement and exclude rows with NaN’s for V99RNTCNT
We will be considering the following variables:
V99RNTCNT: Most recent OAI contactV99ERKVSPR: Closest OAI contact prior to knee replacement (right knee)V99ELKVSPR: Closest OAI contact prior to knee replacement (left knee)V99ERKDATE: Date of right knee replacementV99ELKDATE: Date of left knee replacementtemp <- outcomes %>%
select(id, V99RNTCNT, V99ERKVSPR, V99ELKVSPR, V99ERKDATE, V99ELKDATE) %>%
filter(!is.na(V99RNTCNT))
dim(temp)
## [1] 4650 6
head(temp, 20)
## # A tibble: 20 x 6
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE
## <int> <int> <int> <int> <date> <date>
## 1 9000099 11 NA NA NA NA
## 2 9000296 11 NA NA NA NA
## 3 9000622 1 NA NA NA NA
## 4 9000798 11 NA NA NA NA
## 5 9001104 3 NA NA NA NA
## 6 9001400 11 NA NA NA NA
## 7 9001695 9 NA NA NA NA
## 8 9001897 11 NA NA NA NA
## 9 9002116 11 NA NA NA NA
## 10 9002316 11 NA NA NA NA
## 11 9002411 6 NA NA NA NA
## 12 9002430 11 6 NA 2010-02-15 NA
## 13 9002817 11 NA NA NA NA
## 14 9003126 11 NA NA NA NA
## 15 9003175 11 NA NA NA NA
## 16 9003316 11 NA NA NA NA
## 17 9003380 11 NA NA NA NA
## 18 9003406 11 NA NA NA NA
## 19 9003430 9 NA NA NA NA
## 20 9003658 11 NA NA NA NA
Create time column in months based on info from the V99RNTCNT, V99ERKVSPR and V99ELKVSPR columns
Do we assume here that V99RNTCNT will hold information for knee replacement in case of absent data from V99ERKVSPR and V99ELKVSPR? Why?
temp <- temp %>%
mutate(time = ifelse(
!is.na(temp$V99ERKVSPR),
temp$V99ERKVSPR,
ifelse(!is.na(temp$V99ELKVSPR), temp$V99ELKVSPR,
temp$V99RNTCNT)
))
head(temp, 30)
## # A tibble: 30 x 7
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time
## <int> <int> <int> <int> <date> <date> <int>
## 1 9000099 11 NA NA NA NA 11
## 2 9000296 11 NA NA NA NA 11
## 3 9000622 1 NA NA NA NA 1
## 4 9000798 11 NA NA NA NA 11
## 5 9001104 3 NA NA NA NA 3
## 6 9001400 11 NA NA NA NA 11
## 7 9001695 9 NA NA NA NA 9
## 8 9001897 11 NA NA NA NA 11
## 9 9002116 11 NA NA NA NA 11
## 10 9002316 11 NA NA NA NA 11
## # ... with 20 more rows
transform time column into months
# transform time column into months
temp$time[temp$time == 1] <- 12
temp$time[temp$time == 2] <- 18
temp$time[temp$time == 3] <- 24
temp$time[temp$time == 4] <- 30
temp$time[temp$time == 5] <- 36
temp$time[temp$time == 6] <- 48
temp$time[temp$time == 7] <- 60
temp$time[temp$time == 8] <- 72
temp$time[temp$time == 9] <- 84
temp$time[temp$time == 10] <- 96
temp$time[temp$time == 11] <- 108
head(temp, 30)
## # A tibble: 30 x 7
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time
## <int> <int> <int> <int> <date> <date> <dbl>
## 1 9000099 11 NA NA NA NA 108
## 2 9000296 11 NA NA NA NA 108
## 3 9000622 1 NA NA NA NA 12
## 4 9000798 11 NA NA NA NA 108
## 5 9001104 3 NA NA NA NA 24
## 6 9001400 11 NA NA NA NA 108
## 7 9001695 9 NA NA NA NA 84
## 8 9001897 11 NA NA NA NA 108
## 9 9002116 11 NA NA NA NA 108
## 10 9002316 11 NA NA NA NA 108
## # ... with 20 more rows
Create cens column with censoring information
cens = 0 for censored patientsCens = 1 for patients that experience the event (right or left knee replacement)temp <- temp %>%
mutate(cens = ifelse(!is.na(temp$V99ERKDATE), 1,
ifelse(!is.na(temp$V99ELKDATE), 1, 0)))
head(temp, 30)
## # A tibble: 30 x 8
## id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE time cens
## <int> <int> <int> <int> <date> <date> <dbl> <dbl>
## 1 9.00e6 11 NA NA NA NA 108 0
## 2 9.00e6 11 NA NA NA NA 108 0
## 3 9.00e6 1 NA NA NA NA 12 0
## 4 9.00e6 11 NA NA NA NA 108 0
## 5 9.00e6 3 NA NA NA NA 24 0
## 6 9.00e6 11 NA NA NA NA 108 0
## 7 9.00e6 9 NA NA NA NA 84 0
## 8 9.00e6 11 NA NA NA NA 108 0
## 9 9.00e6 11 NA NA NA NA 108 0
## 10 9.00e6 11 NA NA NA NA 108 0
## # ... with 20 more rows
dim(temp)
## [1] 4650 8
table(temp$time)
##
## 0 12 18 24 30 36 48 60 72 84 96 108
## 26 153 7 137 15 155 368 133 110 164 329 3053
There are patients with time = 0, these patients had a knee replacement before the 12 month visit. Let us remove them because we cannot have time = 0 to perform survival analysis Do we really need to that?
temp <- temp %>% filter(temp$time != 0)
dim (temp)
## [1] 4624 8
Create base data frame with columns id, cens and time
base <- temp %>% select(id, time, cens)
head(base, 30)
## # A tibble: 30 x 3
## id time cens
## <int> <dbl> <dbl>
## 1 9000099 108 0
## 2 9000296 108 0
## 3 9000622 12 0
## 4 9000798 108 0
## 5 9001104 24 0
## 6 9001400 108 0
## 7 9001695 84 0
## 8 9001897 108 0
## 9 9002116 108 0
## 10 9002316 108 0
## # ... with 20 more rows
dim(base)
## [1] 4624 3
Now we build the final dataset to be ready for survival analysis
First we will add STROKE features from Medical History datasets medhitxx, then we will add BMI feature from Physical Exam datasets physexamXX
# read data to add the stroke variable of interest
medical_hist00 <- read.csv('data/medhist/medhist00.csv')
medical_hist03 <- read.csv('data/medhist/medhist03.csv')
medical_hist06 <- read.csv('data/medhist/medhist06.csv')
medical_hist08 <- read.csv('data/medhist/medhist08.csv')
medical_hist10 <- read.csv('data/medhist/medhist10.csv')
Now addd time column to medical_histxx data frames
medical_hist00$time00 <- 0
medical_hist03$time03 <- 24
medical_hist06$time06 <- 48
medical_hist08$time08 <- 72
medical_hist10$time10 <- 96
Now we select only the features we need
medical_hist00 <- medical_hist00 %>% select(ID, V00STROKE, time00)
medical_hist03 <- medical_hist03 %>% select(ID, V03STROKE, time03)
medical_hist06 <- medical_hist06 %>% select(ID, V06STROKE, time06)
medical_hist08 <- medical_hist08 %>% select(ID, V08STROKE, time08)
medical_hist10 <- medical_hist10 %>% select(ID, V10STROKE, time10)
Create stroke data frame with stroke info for all the visits
stroke <- medical_hist00 %>%
full_join(medical_hist03) %>%
full_join(medical_hist06) %>%
full_join(medical_hist08) %>%
full_join(medical_hist10)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(medical_hist00)
## [1] 4796 3
dim(stroke)
## [1] 4796 11
Define stroke_sa data frame with info to perform survival analysis
# merge with base to have only the same id's
stroke_sa <- stroke %>%
right_join(base, by =c("ID"= "id")) %>%
select (c(1:11))
dim(stroke_sa)
## [1] 4624 11
head(stroke_sa)
## ID V00STROKE time00 V03STROKE time03 V06STROKE time06 V08STROKE
## 1 9000099 0 0 0 24 0 48 0
## 2 9000296 0 0 NA 24 0 48 0
## 3 9000622 1 0 NA 24 NA 48 NA
## 4 9000798 0 0 0 24 0 48 0
## 5 9001104 0 0 0 24 NA 48 NA
## 6 9001400 0 0 0 24 0 48 0
## time08 V10STROKE time10
## 1 72 0 96
## 2 72 0 96
## 3 72 NA 96
## 4 72 0 96
## 5 72 NA 96
## 6 72 0 96
# read data to add the stroke variable of interest
physical_exam00 <- read.csv('data/physexam/physexam00.csv')
physical_exam01 <- read.csv('data/physexam/physexam01.csv')
physical_exam03 <- read.csv('data/physexam/physexam03.csv')
physical_exam05 <- read.csv('data/physexam/physexam05.csv')
physical_exam06 <- read.csv('data/physexam/physexam06.csv')
physical_exam08 <- read.csv('data/physexam/physexam08.csv')
physical_exam10 <- read.csv('data/physexam/physexam10.csv')
Now addd time column to physical_examxx data frames
physical_exam00$time00 <- 0
physical_exam01$time01 <- 12
physical_exam03$time03 <- 24
physical_exam05$time05 <- 36
physical_exam06$time06 <- 48
physical_exam08$time08 <- 72
# V08KIKBALL did not change in this visit
physical_exam08$V08KIKBALL <- physical_exam06$V06KIKBALL
physical_exam10$time10 <- 96
# V10KIKBALL did not change in this visit
physical_exam10$V10KIKBALL <- physical_exam06$V06KIKBALL
Now we select only the features we need
xxxBMI -> Body mass indexbmicat -> Body mass index categories
xxxKIKBALL-> Either knee, avoid/reduce pain, aching or stiffness by changing or cutting back on normal activities, past 30 days (calc)
xxxWLKAID -> 20-meter walk: using walking aid such as cane
VxxKPRK20D and VxxKPLK20D that represents knee pain, maximum amount experienced during walk
VxxKPLK20B and VxxKPRK20B that represents knee pain, current severity
physical_exam00 <- physical_exam00 %>%
select(ID, P01BMI, V00WLKAID, V00KIKBALL, time00) %>%
mutate(V00BMICAT = ifelse(P01BMI <= 18.5, "Underweight",
ifelse(P01BMI <= 24.9, "Healthy",
ifelse(P01BMI <= 29.9, "Overweight",
ifelse(P01BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam01 <- physical_exam01 %>%
select(ID, V01BMI, V01WLKAID, V01KIKBALL, V01KPLK20B,
V01KPRK20B, V01KPLK20D, V01KPRK20D, time01) %>%
mutate(V01BMICAT = ifelse(V01BMI <= 18.5, "Underweight",
ifelse(V01BMI <= 24.9, "Healthy",
ifelse(V01BMI <= 29.9, "Overweight",
ifelse(V01BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam03 <- physical_exam03 %>%
select(ID, V03BMI, V03WLKAID, V03KIKBALL, V03KPLK20B,
V03KPRK20B, V03KPLK20D, V03KPRK20D, time03) %>%
mutate(V03BMICAT = ifelse(V03BMI <= 18.5, "Underweight",
ifelse(V03BMI <= 24.9, "Healthy",
ifelse(V03BMI <= 29.9, "Overweight",
ifelse(V03BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam05 <- physical_exam05 %>%
select(ID, V05BMI, V05WLKAID, V05KIKBALL, V05KPLK20B,
V05KPRK20B, V05KPLK20D, V05KPRK20D, time05) %>%
mutate(V05BMICAT = ifelse(V05BMI <= 18.5, "Underweight",
ifelse(V05BMI <= 24.9, "Healthy",
ifelse(V05BMI <= 29.9, "Overweight",
ifelse(V05BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam06 <- physical_exam06 %>%
select(ID, V06BMI, V06WLKAID, V06KIKBALL, V06KPLK20B,
V06KPRK20B, V06KPLK20D, V06KPRK20D, time06) %>%
mutate(V06BMICAT = ifelse(V06BMI <= 18.5, "Underweight",
ifelse(V06BMI <= 24.9, "Healthy",
ifelse(V06BMI <= 29.9, "Overweight",
ifelse(V06BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam08 <- physical_exam08 %>%
select(ID, V08BMI, V08WLKAID, V08KIKBALL, V08KPLK20B,
V08KPRK20B, V08KPLK20D, V08KPRK20D, time08) %>%
mutate(V08BMICAT = ifelse(V08BMI <= 18.5, "Underweight",
ifelse(V08BMI <= 24.9, "Healthy",
ifelse(V08BMI <= 29.9, "Overweight",
ifelse(V08BMI <= 39.9, "Obese",
"Morbidly obese")))))
physical_exam10 <- physical_exam10 %>%
select(ID, V10BMI, V10WLKAID, V10KIKBALL, V10KPLK20B,
V10KPRK20B, V10KPLK20D, V10KPRK20D, time10) %>%
mutate(V10BMICAT = ifelse(V10BMI <= 18.5, "Underweight",
ifelse(V10BMI <= 24.9, "Healthy",
ifelse(V10BMI <= 29.9, "Overweight",
ifelse(V10BMI <= 39.9, "Obese",
"Morbidly obese")))))
Create physical_exam data frame with Physical Exam info for all the visits
physical_exam <- physical_exam00 %>%
full_join(physical_exam01) %>%
full_join(physical_exam03) %>%
full_join(physical_exam05) %>%
full_join(physical_exam06) %>%
full_join(physical_exam08) %>%
full_join(physical_exam10)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(physical_exam)
## [1] 4796 60
Define bmi_sa data frame with info to perform survival analysis
# merge with base to have only the same id's
physical_exam_sa <- physical_exam %>%
right_join(base, by =c("ID"= "id")) %>%
select (c(1:60))
dim(physical_exam_sa)
## [1] 4624 60
head(physical_exam_sa)
## ID P01BMI V00WLKAID V00KIKBALL time00 V00BMICAT V01BMI V01WLKAID
## 1 9000099 23.8 0 1 0 Healthy 24.1 0
## 2 9000296 29.8 0 1 0 Overweight 29.4 0
## 3 9000622 22.7 0 1 0 Healthy 22.2 0
## 4 9000798 32.4 0 1 0 Obese 32.8 0
## 5 9001104 30.7 0 1 0 Obese 28.7 0
## 6 9001400 23.5 0 1 0 Healthy 23.8 0
## V01KIKBALL V01KPLK20B V01KPRK20B V01KPLK20D V01KPRK20D time01 V01BMICAT
## 1 NA 0 0 0 0 12 Healthy
## 2 NA 0 0 0 0 12 Overweight
## 3 NA 0 1 0 1 12 Healthy
## 4 NA 2 0 1 0 12 Obese
## 5 NA 0 0 0 0 12 Overweight
## 6 NA 0 0 0 0 12 Healthy
## V03BMI V03WLKAID V03KIKBALL V03KPLK20B V03KPRK20B V03KPLK20D V03KPRK20D
## 1 23.4 0 1 0 0 0 0
## 2 NA NA NA NA NA NA NA
## 3 NA NA NA NA NA NA NA
## 4 34.3 0 1 1 0 1 0
## 5 NA NA NA NA NA NA NA
## 6 23.8 0 1 0 0 0 0
## time03 V03BMICAT V05BMI V05WLKAID V05KIKBALL V05KPLK20B V05KPRK20B
## 1 24 Healthy 23.7 0 NA 0 0
## 2 24 <NA> 28.8 0 1 0 0
## 3 24 <NA> NA NA NA NA NA
## 4 24 Obese 33.6 0 NA 0 0
## 5 24 <NA> NA NA NA NA NA
## 6 24 Healthy 23.3 0 NA 0 0
## V05KPLK20D V05KPRK20D time05 V05BMICAT V06BMI V06WLKAID V06KIKBALL
## 1 0 0 36 Healthy 24.7 0 1
## 2 0 0 36 Overweight 28.4 0 1
## 3 NA NA 36 <NA> NA NA NA
## 4 0 0 36 Obese 34.2 0 1
## 5 NA NA 36 <NA> NA NA NA
## 6 0 1 36 Healthy NA NA NA
## V06KPLK20B V06KPRK20B V06KPLK20D V06KPRK20D time06 V06BMICAT V08BMI
## 1 0 0 0 0 48 Healthy 24.5
## 2 0 0 0 0 48 Overweight 28.2
## 3 NA NA NA NA 48 <NA> NA
## 4 0 0 1 0 48 Obese 36.2
## 5 NA NA NA NA 48 <NA> NA
## 6 NA NA NA NA 48 <NA> NA
## V08WLKAID V08KIKBALL V08KPLK20B V08KPRK20B V08KPLK20D V08KPRK20D time08
## 1 0 1 2 2 0 0 72
## 2 0 1 0 0 0 0 72
## 3 NA NA NA NA NA NA 72
## 4 0 1 0 0 0 0 72
## 5 NA NA NA NA NA NA 72
## 6 NA NA NA NA NA NA 72
## V08BMICAT V10BMI V10WLKAID V10KIKBALL V10KPLK20B V10KPRK20B V10KPLK20D
## 1 Healthy 21.7 0 1 1 0 1
## 2 Overweight 28.1 0 1 0 0 0
## 3 <NA> NA NA NA NA NA NA
## 4 Obese 34.3 0 1 0 0 0
## 5 <NA> NA NA NA NA NA NA
## 6 <NA> 22.9 0 NA 0 0 0
## V10KPRK20D time10 V10BMICAT
## 1 0 96 Healthy
## 2 0 96 Overweight
## 3 NA 96 <NA>
## 4 0 96 Obese
## 5 NA 96 <NA>
## 6 0 96 Healthy
The JointSx datasets contain questionnaire results regarding arthritis symptoms in the knee, hip, back, and other joints; arthritis-related joint function and disability; and general health-related function and disability. It also contains data from several standardized instruments for arthritis and general health, including: Western Ontario and McMasters Osteoarthritis Index (WOMAC), the Knee Outcomes of Osteoarthritis Scale (KOOS), and the Medical Outcomes Study (SF12).
Our interest is to study pmlkrcv and pmrkrcvthat measure Left/right knee pain severity in the past 30 days:
jointsx00 <- read.csv('data/jointsx/jointsx00.csv')
jointsx01 <- read.csv('data/jointsx/jointsx01.csv')
jointsx02 <- read.csv('data/jointsx/jointsx02.csv')
jointsx03 <- read.csv('data/jointsx/jointsx03.csv')
jointsx04 <- read.csv('data/jointsx/jointsx04.csv')
jointsx05 <- read.csv('data/jointsx/jointsx05.csv')
jointsx06 <- read.csv('data/jointsx/jointsx06.csv')
jointsx07 <- read.csv('data/jointsx/jointsx07.csv')
jointsx08 <- read.csv('data/jointsx/jointsx08.csv')
jointsx09 <- read.csv('data/jointsx/jointsx09.csv')
jointsx10 <- read.csv('data/jointsx/jointsx10.csv')
jointsx11 <- read.csv('data/jointsx/jointsx11.csv')
Now addd time column to jointsxxx data frames
jointsx00$time00 <- 0
jointsx01$time01 <- 12
jointsx02$time02 <- 18
jointsx03$time03 <- 24
jointsx04$time04 <- 30
jointsx05$time05 <- 36
jointsx06$time06 <- 48
jointsx07$time07 <- 60
jointsx08$time08 <- 72
jointsx09$time09 <- 84
jointsx10$time10 <- 96
jointsx11$time11 <- 108
Now we select only the features we need
jointsx00 <- jointsx00 %>%
select(ID, P01PMRKRCV, P01PMLKRCV,time00)
jointsx01 <- jointsx01 %>%
select(ID, V01PMRKRCV, V01PMLKRCV,time01)
jointsx03 <- jointsx03 %>%
select(ID, V03PMRKRCV, V03PMLKRCV,time03)
jointsx05 <- jointsx05 %>%
select(ID, V05PMRKRCV, V05PMLKRCV,time05)
jointsx06 <- jointsx06 %>%
select(ID, V06PMRKRCV, V06PMLKRCV,time06)
jointsx07 <- jointsx07 %>%
select(ID, V07PMRKRCV, V07PMLKRCV,time07)
jointsx08 <- jointsx08 %>%
select(ID, V08PMRKRCV, V08PMLKRCV,time08)
jointsx09 <- jointsx09 %>%
select(ID, V09PMRKRCV, V09PMLKRCV,time09)
jointsx10 <- jointsx10 %>%
select(ID, V10PMRKRCV, V10PMLKRCV,time10)
jointsx11 <- jointsx11 %>%
select(ID, V11PMRKRCV, V11PMLKRCV,time11)
Create jointsx data frame with Joint Symptoms/Function info for all the visits
jointsx <- jointsx00 %>%
full_join(jointsx01) %>%
full_join(jointsx03) %>%
full_join(jointsx05) %>%
full_join(jointsx06) %>%
full_join(jointsx07) %>%
full_join(jointsx08) %>%
full_join(jointsx09) %>%
full_join(jointsx10) %>%
full_join(jointsx11)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(jointsx)
## [1] 4796 31
Define jointsx_sa data frame with info to perform survival analysis
# merge with base to have only the same id's
jointsx_sa <- jointsx %>%
right_join(base, by =c("ID"= "id")) %>%
select (c(1:31))
dim(jointsx_sa)
## [1] 4624 31
head(jointsx_sa)
## ID P01PMRKRCV P01PMLKRCV time00 V01PMRKRCV V01PMLKRCV time01
## 1 9000099 3 2 0 0 0 12
## 2 9000296 0 0 0 0 0 12
## 3 9000622 2 0 0 5 0 12
## 4 9000798 0 4 0 3 7 12
## 5 9001104 0 0 0 4 0 12
## 6 9001400 2 0 0 3 2 12
## V03PMRKRCV V03PMLKRCV time03 V05PMRKRCV V05PMLKRCV time05 V06PMRKRCV
## 1 0 3 24 1 1 36 2
## 2 0 0 24 0 0 36 5
## 3 NA NA 24 NA NA 36 NA
## 4 0 5 24 0 4 36 0
## 5 5 4 24 NA NA 36 NA
## 6 2 0 24 3 3 36 3
## V06PMLKRCV time06 V07PMRKRCV V07PMLKRCV time07 V08PMRKRCV V08PMLKRCV
## 1 2 48 5 2 60 2 2
## 2 0 48 0 0 60 0 0
## 3 NA 48 NA NA 60 NA NA
## 4 3 48 0 4 60 0 5
## 5 NA 48 NA NA 60 NA NA
## 6 1 48 2 0 60 3 2
## time08 V09PMRKRCV V09PMLKRCV time09 V10PMRKRCV V10PMLKRCV time10
## 1 72 0 0 84 1 1 96
## 2 72 0 0 84 0 0 96
## 3 72 NA NA 84 NA NA 96
## 4 72 0 5 84 0 4 96
## 5 72 NA NA 84 2 0 96
## 6 72 3 0 84 0 0 96
## V11PMRKRCV V11PMLKRCV time11
## 1 3 5 108
## 2 0 0 108
## 3 NA NA 108
## 4 0 4 108
## 5 NA NA 108
## 6 0 0 108
Now let us add some time independent feature like race, sex and age (at the study joining time)
tic_col <- read_csv("data/enrollees/enrollees.csv") %>%
select(id = ID, race = P02RACE, sex = P02SEX)
## Parsed with column specification:
## cols(
## .default = col_integer(),
## V00SITE = col_character()
## )
## See spec(...) for full column specifications.
subj <- read_csv("data/subjectchar/subjectchar00.csv") %>%
select(id = ID, age = V00AGE)
## Parsed with column specification:
## cols(
## .default = col_integer(),
## VERSION = col_character(),
## P02DATE = col_date(format = ""),
## P02HR10 = col_character(),
## P02HR11 = col_character(),
## P01SVDATE = col_date(format = ""),
## V00EVDATE = col_date(format = ""),
## V00ENRCR2 = col_character()
## )
## See spec(...) for full column specifications.
tic_col$sex <- factor(tic_col$sex,
levels = c("1", "2"),
labels = c("Male", "Female"))
tic_col$race <- factor(tic_col$race,
levels = c("0", "1", "2", "3"),
labels = c("Other Non-white", "White or Caucasian",
"Black or African American", "Asian"))
tic_col$race <- relevel(tic_col$race, "White or Caucasian")
tic_col <- tic_col %>%
full_join(subj)
## Joining, by = "id"
Finally we use tmerge function from the survival package to get the data in the start/stop format
# Set time range for each patient
base_sa <- tmerge(base, base, id=id, knee_rep = event(time, cens))
head(base_sa, 30)
## id time cens tstart tstop knee_rep
## 1 9000099 108 0 0 108 0
## 2 9000296 108 0 0 108 0
## 3 9000622 12 0 0 12 0
## 4 9000798 108 0 0 108 0
## 5 9001104 24 0 0 24 0
## 6 9001400 108 0 0 108 0
## 7 9001695 84 0 0 84 0
## 8 9001897 108 0 0 108 0
## 9 9002116 108 0 0 108 0
## 10 9002316 108 0 0 108 0
## 11 9002411 48 0 0 48 0
## 12 9002430 48 1 0 48 1
## 13 9002817 108 0 0 108 0
## 14 9003126 108 0 0 108 0
## 15 9003175 108 0 0 108 0
## 16 9003316 108 0 0 108 0
## 17 9003380 108 0 0 108 0
## 18 9003406 108 0 0 108 0
## 19 9003430 84 0 0 84 0
## 20 9003658 108 0 0 108 0
## 21 9003815 108 0 0 108 0
## 22 9003895 108 0 0 108 0
## 23 9004175 108 0 0 108 0
## 24 9004184 108 0 0 108 0
## 25 9004315 108 0 0 108 0
## 26 9004462 12 0 0 12 0
## 27 9004669 108 0 0 108 0
## 28 9004905 108 0 0 108 0
## 29 9005075 36 0 0 36 0
## 30 9005132 24 1 0 24 1
# Add time dependent covariate: stroke
OAI_sa <- tmerge(base_sa, stroke_sa, id = ID,
stroke = tdc(time00, V00STROKE),
stroke = tdc(time03, V03STROKE),
stroke = tdc(time06, V06STROKE),
stroke = tdc(time08, V08STROKE),
stroke = tdc(time10, V10STROKE))
# Add time dependent covariate: bmi
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
bmi = tdc(time00, P01BMI),
bmi = tdc(time01, V01BMI),
bmi = tdc(time03, V03BMI),
bmi = tdc(time05, V05BMI),
bmi = tdc(time06, V06BMI),
bmi = tdc(time08, V08BMI),
bmi = tdc(time10, V10BMI))
# Add time dependent covariate: bmi cateogry
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
bmi_cat = tdc(time00, as_factor(V00BMICAT)),
bmi_cat = tdc(time01, as_factor(V01BMICAT)),
bmi_cat = tdc(time03, as_factor(V03BMICAT)),
bmi_cat = tdc(time05, as_factor(V05BMICAT)),
bmi_cat = tdc(time06, as_factor(V06BMICAT)),
bmi_cat = tdc(time08, as_factor(V08BMICAT)),
bmi_cat = tdc(time10, as_factor(V10BMICAT)))
# Add time dependent covariate: WLKAID
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
wlkaid = tdc(time00, V00WLKAID),
wlkaid = tdc(time01, V01WLKAID),
wlkaid = tdc(time03, V03WLKAID),
wlkaid = tdc(time05, V05WLKAID),
wlkaid = tdc(time06, V06WLKAID),
wlkaid = tdc(time08, V08WLKAID),
wlkaid = tdc(time10, V10WLKAID))
# Add time dependent covariate: KIKBALL
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
kikball = tdc(time00, V00KIKBALL),
kikball = tdc(time01, V01KIKBALL),
kikball = tdc(time03, V03KIKBALL),
kikball = tdc(time05, V05KIKBALL),
kikball = tdc(time06, V06KIKBALL),
kikball = tdc(time08, V08KIKBALL),
kikball = tdc(time10, V10KIKBALL))
# Add time dependent covariate: kplk20b
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
kplk20b = tdc(time01, V01KPLK20B),
kplk20b = tdc(time03, V03KPLK20B),
kplk20b = tdc(time05, V05KPLK20B),
kplk20b = tdc(time06, V06KPLK20B),
kplk20b = tdc(time08, V08KPLK20B),
kplk20b = tdc(time10, V10KPLK20B))
# Add time dependent covariate: kplk20d
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
kplk20d = tdc(time01, V01KPLK20D),
kplk20d = tdc(time03, V03KPLK20D),
kplk20d = tdc(time05, V05KPLK20D),
kplk20d = tdc(time06, V06KPLK20D),
kplk20d = tdc(time08, V08KPLK20D),
kplk20d = tdc(time10, V10KPLK20D))
# Add time dependent covariate: kprk20b
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
kprk20b = tdc(time01, V01KPRK20B),
kprk20b = tdc(time03, V03KPRK20B),
kprk20b = tdc(time05, V05KPRK20B),
kprk20b = tdc(time06, V06KPRK20B),
kprk20b = tdc(time08, V08KPRK20B),
kprk20b = tdc(time10, V10KPRK20B))
# Add time dependent covariate: kprk20r
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
kprk20d = tdc(time01, V01KPRK20D),
kprk20d = tdc(time03, V03KPRK20D),
kprk20d = tdc(time05, V05KPRK20D),
kprk20d = tdc(time06, V06KPRK20D),
kprk20d = tdc(time08, V08KPRK20D),
kprk20d = tdc(time10, V10KPRK20D))
# Add time dependent covariate: PMRKRCV
OAI_sa <- tmerge(OAI_sa, jointsx_sa, id = ID,
PMRKRCV = tdc(time00, P01PMRKRCV),
PMRKRCV = tdc(time01, V01PMRKRCV),
PMRKRCV = tdc(time03, V03PMRKRCV),
PMRKRCV = tdc(time05, V05PMRKRCV),
PMRKRCV = tdc(time06, V06PMRKRCV),
PMRKRCV = tdc(time07, V07PMRKRCV),
PMRKRCV = tdc(time08, V08PMRKRCV),
PMRKRCV = tdc(time09, V09PMRKRCV),
PMRKRCV = tdc(time10, V10PMRKRCV),
PMRKRCV = tdc(time11, V11PMRKRCV))
# Add time dependent covariate: PMLKRCV
OAI_sa <- tmerge(OAI_sa, jointsx_sa, id = ID,
PMLKRCV = tdc(time00, P01PMLKRCV),
PMLKRCV = tdc(time01, V01PMLKRCV),
PMLKRCV = tdc(time03, V03PMLKRCV),
PMLKRCV = tdc(time05, V05PMLKRCV),
PMLKRCV = tdc(time06, V06PMLKRCV),
PMLKRCV = tdc(time07, V07PMLKRCV),
PMLKRCV = tdc(time08, V08PMLKRCV),
PMLKRCV = tdc(time09, V09PMLKRCV),
PMLKRCV = tdc(time10, V10PMLKRCV),
PMLKRCV = tdc(time11, V11PMLKRCV))
OAI_sa <- merge(x = OAI_sa, y = tic_col, by = "id")
head(OAI_sa, 30)
## id time cens tstart tstop knee_rep stroke bmi bmi_cat wlkaid
## 1 9000099 108 0 0 12 0 0 23.8 Healthy 0
## 2 9000099 108 0 12 24 0 0 24.1 Healthy 0
## 3 9000099 108 0 24 36 0 0 23.4 Healthy 0
## 4 9000099 108 0 36 48 0 0 23.7 Healthy 0
## 5 9000099 108 0 48 60 0 0 24.7 Healthy 0
## 6 9000099 108 0 60 72 0 0 24.7 Healthy 0
## 7 9000099 108 0 72 84 0 0 24.5 Healthy 0
## 8 9000099 108 0 84 96 0 0 24.5 Healthy 0
## 9 9000099 108 0 96 108 0 0 21.7 Healthy 0
## 10 9000296 108 0 0 12 0 0 29.8 Overweight 0
## 11 9000296 108 0 12 24 0 0 29.4 Overweight 0
## 12 9000296 108 0 24 36 0 0 29.4 Overweight 0
## 13 9000296 108 0 36 48 0 0 28.8 Overweight 0
## 14 9000296 108 0 48 60 0 0 28.4 Overweight 0
## 15 9000296 108 0 60 72 0 0 28.4 Overweight 0
## 16 9000296 108 0 72 84 0 0 28.2 Overweight 0
## 17 9000296 108 0 84 96 0 0 28.2 Overweight 0
## 18 9000296 108 0 96 108 0 0 28.1 Overweight 0
## 19 9000622 12 0 0 12 0 1 22.7 Healthy 0
## 20 9000798 108 0 0 12 0 0 32.4 Obese 0
## 21 9000798 108 0 12 24 0 0 32.8 Obese 0
## 22 9000798 108 0 24 36 0 0 34.3 Obese 0
## 23 9000798 108 0 36 48 0 0 33.6 Obese 0
## 24 9000798 108 0 48 60 0 0 34.2 Obese 0
## 25 9000798 108 0 60 72 0 0 34.2 Obese 0
## 26 9000798 108 0 72 84 0 0 36.2 Obese 0
## 27 9000798 108 0 84 96 0 0 36.2 Obese 0
## 28 9000798 108 0 96 108 0 0 34.3 Obese 0
## 29 9001104 24 0 0 12 0 0 30.7 Obese 0
## 30 9001104 24 0 12 24 0 0 28.7 Overweight 0
## kikball kplk20b kplk20d kprk20b kprk20d PMRKRCV PMLKRCV
## 1 1 NA NA NA NA 3 2
## 2 1 0 0 0 0 0 0
## 3 1 0 0 0 0 0 3
## 4 1 0 0 0 0 1 1
## 5 1 0 0 0 0 2 2
## 6 1 0 0 0 0 5 2
## 7 1 2 0 2 0 2 2
## 8 1 2 0 2 0 0 0
## 9 1 1 1 0 0 1 1
## 10 1 NA NA NA NA 0 0
## 11 1 0 0 0 0 0 0
## 12 1 0 0 0 0 0 0
## 13 1 0 0 0 0 0 0
## 14 1 0 0 0 0 5 0
## 15 1 0 0 0 0 0 0
## 16 1 0 0 0 0 0 0
## 17 1 0 0 0 0 0 0
## 18 1 0 0 0 0 0 0
## 19 1 NA NA NA NA 2 0
## 20 1 NA NA NA NA 0 4
## 21 1 2 1 0 0 3 7
## 22 1 1 1 0 0 0 5
## 23 1 0 0 0 0 0 4
## 24 1 0 1 0 0 0 3
## 25 1 0 1 0 0 0 4
## 26 1 0 0 0 0 0 5
## 27 1 0 0 0 0 0 5
## 28 1 0 0 0 0 0 4
## 29 1 NA NA NA NA 0 0
## 30 1 0 0 0 0 4 0
## race sex age
## 1 White or Caucasian Male 59
## 2 White or Caucasian Male 59
## 3 White or Caucasian Male 59
## 4 White or Caucasian Male 59
## 5 White or Caucasian Male 59
## 6 White or Caucasian Male 59
## 7 White or Caucasian Male 59
## 8 White or Caucasian Male 59
## 9 White or Caucasian Male 59
## 10 White or Caucasian Male 69
## 11 White or Caucasian Male 69
## 12 White or Caucasian Male 69
## 13 White or Caucasian Male 69
## 14 White or Caucasian Male 69
## 15 White or Caucasian Male 69
## 16 White or Caucasian Male 69
## 17 White or Caucasian Male 69
## 18 White or Caucasian Male 69
## 19 White or Caucasian Female 71
## 20 White or Caucasian Male 56
## 21 White or Caucasian Male 56
## 22 White or Caucasian Male 56
## 23 White or Caucasian Male 56
## 24 White or Caucasian Male 56
## 25 White or Caucasian Male 56
## 26 White or Caucasian Male 56
## 27 White or Caucasian Male 56
## 28 White or Caucasian Male 56
## 29 White or Caucasian Female 72
## 30 White or Caucasian Female 72
# first 20 rows of columns necessary to perform survival analysis
OAI_sa[1:20, c(1, 4:20)]
## id tstart tstop knee_rep stroke bmi bmi_cat wlkaid kikball
## 1 9000099 0 12 0 0 23.8 Healthy 0 1
## 2 9000099 12 24 0 0 24.1 Healthy 0 1
## 3 9000099 24 36 0 0 23.4 Healthy 0 1
## 4 9000099 36 48 0 0 23.7 Healthy 0 1
## 5 9000099 48 60 0 0 24.7 Healthy 0 1
## 6 9000099 60 72 0 0 24.7 Healthy 0 1
## 7 9000099 72 84 0 0 24.5 Healthy 0 1
## 8 9000099 84 96 0 0 24.5 Healthy 0 1
## 9 9000099 96 108 0 0 21.7 Healthy 0 1
## 10 9000296 0 12 0 0 29.8 Overweight 0 1
## 11 9000296 12 24 0 0 29.4 Overweight 0 1
## 12 9000296 24 36 0 0 29.4 Overweight 0 1
## 13 9000296 36 48 0 0 28.8 Overweight 0 1
## 14 9000296 48 60 0 0 28.4 Overweight 0 1
## 15 9000296 60 72 0 0 28.4 Overweight 0 1
## 16 9000296 72 84 0 0 28.2 Overweight 0 1
## 17 9000296 84 96 0 0 28.2 Overweight 0 1
## 18 9000296 96 108 0 0 28.1 Overweight 0 1
## 19 9000622 0 12 0 1 22.7 Healthy 0 1
## 20 9000798 0 12 0 0 32.4 Obese 0 1
## kplk20b kplk20d kprk20b kprk20d PMRKRCV PMLKRCV race
## 1 NA NA NA NA 3 2 White or Caucasian
## 2 0 0 0 0 0 0 White or Caucasian
## 3 0 0 0 0 0 3 White or Caucasian
## 4 0 0 0 0 1 1 White or Caucasian
## 5 0 0 0 0 2 2 White or Caucasian
## 6 0 0 0 0 5 2 White or Caucasian
## 7 2 0 2 0 2 2 White or Caucasian
## 8 2 0 2 0 0 0 White or Caucasian
## 9 1 1 0 0 1 1 White or Caucasian
## 10 NA NA NA NA 0 0 White or Caucasian
## 11 0 0 0 0 0 0 White or Caucasian
## 12 0 0 0 0 0 0 White or Caucasian
## 13 0 0 0 0 0 0 White or Caucasian
## 14 0 0 0 0 5 0 White or Caucasian
## 15 0 0 0 0 0 0 White or Caucasian
## 16 0 0 0 0 0 0 White or Caucasian
## 17 0 0 0 0 0 0 White or Caucasian
## 18 0 0 0 0 0 0 White or Caucasian
## 19 NA NA NA NA 2 0 White or Caucasian
## 20 NA NA NA NA 0 4 White or Caucasian
## sex age
## 1 Male 59
## 2 Male 59
## 3 Male 59
## 4 Male 59
## 5 Male 59
## 6 Male 59
## 7 Male 59
## 8 Male 59
## 9 Male 59
## 10 Male 69
## 11 Male 69
## 12 Male 69
## 13 Male 69
## 14 Male 69
## 15 Male 69
## 16 Male 69
## 17 Male 69
## 18 Male 69
## 19 Female 71
## 20 Male 56