Outcome Dataset

outcomes <- read_csv("data/outcomes99/outcomes99.csv")
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   V99ERKDATE = col_date(format = ""),
##   V99ELKDATE = col_date(format = ""),
##   V99ERHDATE = col_date(format = ""),
##   V99ELHDATE = col_date(format = ""),
##   V99EDDDATE = col_date(format = "")
## )
## See spec(...) for full column specifications.
dim(outcomes)
## [1] 4796   92
summary(outcomes$V99RNTCNT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000  10.000  11.000   9.817  11.000  11.000     146

for some patients there is no information for the most recent OAI contact we will delete these rows

Response Variable

Select variables of interest from the outcomes table for knee replacement and exclude rows with NaN’s for V99RNTCNT

We will be considering the following variables:

temp <- outcomes %>% 
  select(id, V99RNTCNT, V99ERKVSPR, V99ELKVSPR, V99ERKDATE, V99ELKDATE) %>% 
  filter(!is.na(V99RNTCNT))
dim(temp)
## [1] 4650    6
head(temp, 20)
## # A tibble: 20 x 6
##         id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE
##      <int>     <int>      <int>      <int> <date>     <date>    
##  1 9000099        11         NA         NA NA         NA        
##  2 9000296        11         NA         NA NA         NA        
##  3 9000622         1         NA         NA NA         NA        
##  4 9000798        11         NA         NA NA         NA        
##  5 9001104         3         NA         NA NA         NA        
##  6 9001400        11         NA         NA NA         NA        
##  7 9001695         9         NA         NA NA         NA        
##  8 9001897        11         NA         NA NA         NA        
##  9 9002116        11         NA         NA NA         NA        
## 10 9002316        11         NA         NA NA         NA        
## 11 9002411         6         NA         NA NA         NA        
## 12 9002430        11          6         NA 2010-02-15 NA        
## 13 9002817        11         NA         NA NA         NA        
## 14 9003126        11         NA         NA NA         NA        
## 15 9003175        11         NA         NA NA         NA        
## 16 9003316        11         NA         NA NA         NA        
## 17 9003380        11         NA         NA NA         NA        
## 18 9003406        11         NA         NA NA         NA        
## 19 9003430         9         NA         NA NA         NA        
## 20 9003658        11         NA         NA NA         NA

Create time column in months based on info from the V99RNTCNT, V99ERKVSPR and V99ELKVSPR columns

Do we assume here that V99RNTCNT will hold information for knee replacement in case of absent data from V99ERKVSPR and V99ELKVSPR? Why?

temp <- temp %>%
  mutate(time = ifelse(
  !is.na(temp$V99ERKVSPR),
  temp$V99ERKVSPR,
  ifelse(!is.na(temp$V99ELKVSPR), temp$V99ELKVSPR,
  temp$V99RNTCNT)
  ))
head(temp, 30)
## # A tibble: 30 x 7
##         id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE  time
##      <int>     <int>      <int>      <int> <date>     <date>     <int>
##  1 9000099        11         NA         NA NA         NA            11
##  2 9000296        11         NA         NA NA         NA            11
##  3 9000622         1         NA         NA NA         NA             1
##  4 9000798        11         NA         NA NA         NA            11
##  5 9001104         3         NA         NA NA         NA             3
##  6 9001400        11         NA         NA NA         NA            11
##  7 9001695         9         NA         NA NA         NA             9
##  8 9001897        11         NA         NA NA         NA            11
##  9 9002116        11         NA         NA NA         NA            11
## 10 9002316        11         NA         NA NA         NA            11
## # ... with 20 more rows

transform time column into months

# transform time column into months
temp$time[temp$time == 1] <- 12
temp$time[temp$time == 2] <- 18
temp$time[temp$time == 3] <- 24
temp$time[temp$time == 4] <- 30
temp$time[temp$time == 5] <- 36
temp$time[temp$time == 6] <- 48
temp$time[temp$time == 7] <- 60
temp$time[temp$time == 8] <- 72
temp$time[temp$time == 9] <- 84
temp$time[temp$time == 10] <- 96
temp$time[temp$time == 11] <- 108


head(temp, 30)
## # A tibble: 30 x 7
##         id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE  time
##      <int>     <int>      <int>      <int> <date>     <date>     <dbl>
##  1 9000099        11         NA         NA NA         NA           108
##  2 9000296        11         NA         NA NA         NA           108
##  3 9000622         1         NA         NA NA         NA            12
##  4 9000798        11         NA         NA NA         NA           108
##  5 9001104         3         NA         NA NA         NA            24
##  6 9001400        11         NA         NA NA         NA           108
##  7 9001695         9         NA         NA NA         NA            84
##  8 9001897        11         NA         NA NA         NA           108
##  9 9002116        11         NA         NA NA         NA           108
## 10 9002316        11         NA         NA NA         NA           108
## # ... with 20 more rows

Create cens column with censoring information

temp <- temp %>% 
  mutate(cens = ifelse(!is.na(temp$V99ERKDATE), 1, 
                                      ifelse(!is.na(temp$V99ELKDATE), 1, 0)))
head(temp, 30)
## # A tibble: 30 x 8
##        id V99RNTCNT V99ERKVSPR V99ELKVSPR V99ERKDATE V99ELKDATE  time  cens
##     <int>     <int>      <int>      <int> <date>     <date>     <dbl> <dbl>
##  1 9.00e6        11         NA         NA NA         NA           108     0
##  2 9.00e6        11         NA         NA NA         NA           108     0
##  3 9.00e6         1         NA         NA NA         NA            12     0
##  4 9.00e6        11         NA         NA NA         NA           108     0
##  5 9.00e6         3         NA         NA NA         NA            24     0
##  6 9.00e6        11         NA         NA NA         NA           108     0
##  7 9.00e6         9         NA         NA NA         NA            84     0
##  8 9.00e6        11         NA         NA NA         NA           108     0
##  9 9.00e6        11         NA         NA NA         NA           108     0
## 10 9.00e6        11         NA         NA NA         NA           108     0
## # ... with 20 more rows
dim(temp)
## [1] 4650    8
table(temp$time)
## 
##    0   12   18   24   30   36   48   60   72   84   96  108 
##   26  153    7  137   15  155  368  133  110  164  329 3053

There are patients with time = 0, these patients had a knee replacement before the 12 month visit. Let us remove them because we cannot have time = 0 to perform survival analysis Do we really need to that?

temp <- temp %>% filter(temp$time != 0)
dim (temp)
## [1] 4624    8

Create base data frame with columns id, cens and time

base <- temp %>%  select(id, time, cens)
head(base, 30)
## # A tibble: 30 x 3
##         id  time  cens
##      <int> <dbl> <dbl>
##  1 9000099   108     0
##  2 9000296   108     0
##  3 9000622    12     0
##  4 9000798   108     0
##  5 9001104    24     0
##  6 9001400   108     0
##  7 9001695    84     0
##  8 9001897   108     0
##  9 9002116   108     0
## 10 9002316   108     0
## # ... with 20 more rows
dim(base)
## [1] 4624    3

Survival Analysis Dataset

Now we build the final dataset to be ready for survival analysis

First we will add STROKE features from Medical History datasets medhitxx, then we will add BMI feature from Physical Exam datasets physexamXX

Medical History

# read data to add the stroke variable of interest
medical_hist00 <- read.csv('data/medhist/medhist00.csv')
medical_hist03 <- read.csv('data/medhist/medhist03.csv')
medical_hist06 <- read.csv('data/medhist/medhist06.csv')
medical_hist08 <- read.csv('data/medhist/medhist08.csv')
medical_hist10 <- read.csv('data/medhist/medhist10.csv')

Now addd time column to medical_histxx data frames

medical_hist00$time00 <- 0
medical_hist03$time03 <- 24
medical_hist06$time06 <- 48
medical_hist08$time08 <- 72
medical_hist10$time10 <- 96

Now we select only the features we need

medical_hist00 <- medical_hist00 %>%  select(ID, V00STROKE, time00)
medical_hist03 <- medical_hist03 %>%  select(ID, V03STROKE, time03)
medical_hist06 <- medical_hist06 %>%  select(ID, V06STROKE, time06)
medical_hist08 <- medical_hist08 %>%  select(ID, V08STROKE, time08)
medical_hist10 <- medical_hist10 %>%  select(ID, V10STROKE, time10)

Create stroke data frame with stroke info for all the visits

stroke <- medical_hist00 %>% 
  full_join(medical_hist03) %>% 
  full_join(medical_hist06) %>% 
  full_join(medical_hist08) %>% 
  full_join(medical_hist10)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(medical_hist00)
## [1] 4796    3
dim(stroke)
## [1] 4796   11

Define stroke_sa data frame with info to perform survival analysis

# merge with base to have only the same id's
stroke_sa <- stroke %>% 
  right_join(base, by =c("ID"= "id")) %>% 
  select (c(1:11))

dim(stroke_sa)
## [1] 4624   11
head(stroke_sa)
##        ID V00STROKE time00 V03STROKE time03 V06STROKE time06 V08STROKE
## 1 9000099         0      0         0     24         0     48         0
## 2 9000296         0      0        NA     24         0     48         0
## 3 9000622         1      0        NA     24        NA     48        NA
## 4 9000798         0      0         0     24         0     48         0
## 5 9001104         0      0         0     24        NA     48        NA
## 6 9001400         0      0         0     24         0     48         0
##   time08 V10STROKE time10
## 1     72         0     96
## 2     72         0     96
## 3     72        NA     96
## 4     72         0     96
## 5     72        NA     96
## 6     72         0     96

Physical Exam

# read data to add the stroke variable of interest
physical_exam00 <- read.csv('data/physexam/physexam00.csv')
physical_exam01 <- read.csv('data/physexam/physexam01.csv')
physical_exam03 <- read.csv('data/physexam/physexam03.csv')
physical_exam05 <- read.csv('data/physexam/physexam05.csv')
physical_exam06 <- read.csv('data/physexam/physexam06.csv')
physical_exam08 <- read.csv('data/physexam/physexam08.csv')
physical_exam10 <- read.csv('data/physexam/physexam10.csv')

Now addd time column to physical_examxx data frames

physical_exam00$time00 <- 0
physical_exam01$time01 <- 12
physical_exam03$time03 <- 24
physical_exam05$time05 <- 36
physical_exam06$time06 <- 48
physical_exam08$time08 <- 72
# V08KIKBALL did not change in this visit
physical_exam08$V08KIKBALL <- physical_exam06$V06KIKBALL
physical_exam10$time10 <- 96
# V10KIKBALL did not change in this visit
physical_exam10$V10KIKBALL <- physical_exam06$V06KIKBALL

Now we select only the features we need

  • xxxBMI -> Body mass index
  • bmicat -> Body mass index categories
    • 1 = Underweight (BMI below 18.5);
    • 2 = Healthy (BMI of 18.5 to 24.9);
    • 3 = Overweight (BMI of 25.0 to 29.9);
    • 4 = Obese (BMI of 30.0 to 39.9);
    • 5 = Morbidly obese (BMI over 40)
  • xxxKIKBALL-> Either knee, avoid/reduce pain, aching or stiffness by changing or cutting back on normal activities, past 30 days (calc)
    • Isometric strength: which leg use to kick ball
    • 1= Right; 2= Left; 3= Both
  • xxxWLKAID -> 20-meter walk: using walking aid such as cane
    • 0=No; 1=Yes
  • VxxKPRK20D and VxxKPLK20D that represents knee pain, maximum amount experienced during walk
    • 0=No pain;
    • 10=Pain as bad as you can imagine
  • VxxKPLK20B and VxxKPRK20B that represents knee pain, current severity
    • 0=No pain;
    • 10=Pain as bad as you can imagine
physical_exam00 <- physical_exam00 %>%
  select(ID, P01BMI, V00WLKAID, V00KIKBALL, time00) %>% 
  mutate(V00BMICAT = ifelse(P01BMI <= 18.5, "Underweight",
                         ifelse(P01BMI <= 24.9, "Healthy",
                                ifelse(P01BMI <= 29.9, "Overweight",
                                       ifelse(P01BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))
physical_exam01 <- physical_exam01 %>%  
  select(ID, V01BMI, V01WLKAID, V01KIKBALL, V01KPLK20B,
         V01KPRK20B, V01KPLK20D, V01KPRK20D, time01) %>%
  mutate(V01BMICAT = ifelse(V01BMI <= 18.5, "Underweight",
                         ifelse(V01BMI <= 24.9, "Healthy",
                                ifelse(V01BMI <= 29.9, "Overweight",
                                       ifelse(V01BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))
physical_exam03 <- physical_exam03 %>%
  select(ID, V03BMI, V03WLKAID, V03KIKBALL, V03KPLK20B,
         V03KPRK20B, V03KPLK20D, V03KPRK20D, time03) %>%
  mutate(V03BMICAT = ifelse(V03BMI <= 18.5, "Underweight",
                         ifelse(V03BMI <= 24.9, "Healthy",
                                ifelse(V03BMI <= 29.9, "Overweight",
                                       ifelse(V03BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))
physical_exam05 <- physical_exam05 %>%  
  select(ID, V05BMI, V05WLKAID, V05KIKBALL, V05KPLK20B,
         V05KPRK20B, V05KPLK20D, V05KPRK20D, time05) %>%
  mutate(V05BMICAT = ifelse(V05BMI <= 18.5, "Underweight",
                         ifelse(V05BMI <= 24.9, "Healthy",
                                ifelse(V05BMI <= 29.9, "Overweight",
                                       ifelse(V05BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))

physical_exam06 <- physical_exam06 %>%  
  select(ID, V06BMI, V06WLKAID, V06KIKBALL, V06KPLK20B,
         V06KPRK20B, V06KPLK20D, V06KPRK20D, time06) %>%
  mutate(V06BMICAT = ifelse(V06BMI <= 18.5, "Underweight",
                         ifelse(V06BMI <= 24.9, "Healthy",
                                ifelse(V06BMI <= 29.9, "Overweight",
                                       ifelse(V06BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))

physical_exam08 <- physical_exam08 %>%  
  select(ID, V08BMI, V08WLKAID, V08KIKBALL, V08KPLK20B,
         V08KPRK20B, V08KPLK20D, V08KPRK20D, time08) %>%
  mutate(V08BMICAT = ifelse(V08BMI <= 18.5, "Underweight",
                         ifelse(V08BMI <= 24.9, "Healthy",
                                ifelse(V08BMI <= 29.9, "Overweight",
                                       ifelse(V08BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))
physical_exam10 <- physical_exam10 %>%  
  select(ID, V10BMI, V10WLKAID, V10KIKBALL, V10KPLK20B,
         V10KPRK20B, V10KPLK20D, V10KPRK20D, time10) %>%
  mutate(V10BMICAT = ifelse(V10BMI <= 18.5, "Underweight",
                         ifelse(V10BMI <= 24.9, "Healthy",
                                ifelse(V10BMI <= 29.9, "Overweight",
                                       ifelse(V10BMI <= 39.9, "Obese",
                                              "Morbidly obese")))))

Create physical_exam data frame with Physical Exam info for all the visits

 physical_exam <- physical_exam00 %>% 
  full_join(physical_exam01) %>% 
  full_join(physical_exam03) %>% 
  full_join(physical_exam05) %>% 
  full_join(physical_exam06) %>% 
  full_join(physical_exam08) %>% 
  full_join(physical_exam10)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(physical_exam)
## [1] 4796   60

Define bmi_sa data frame with info to perform survival analysis

# merge with base to have only the same id's
physical_exam_sa <- physical_exam %>% 
  right_join(base, by =c("ID"= "id")) %>% 
  select (c(1:60))

dim(physical_exam_sa)
## [1] 4624   60
head(physical_exam_sa)
##        ID P01BMI V00WLKAID V00KIKBALL time00  V00BMICAT V01BMI V01WLKAID
## 1 9000099   23.8         0          1      0    Healthy   24.1         0
## 2 9000296   29.8         0          1      0 Overweight   29.4         0
## 3 9000622   22.7         0          1      0    Healthy   22.2         0
## 4 9000798   32.4         0          1      0      Obese   32.8         0
## 5 9001104   30.7         0          1      0      Obese   28.7         0
## 6 9001400   23.5         0          1      0    Healthy   23.8         0
##   V01KIKBALL V01KPLK20B V01KPRK20B V01KPLK20D V01KPRK20D time01  V01BMICAT
## 1         NA          0          0          0          0     12    Healthy
## 2         NA          0          0          0          0     12 Overweight
## 3         NA          0          1          0          1     12    Healthy
## 4         NA          2          0          1          0     12      Obese
## 5         NA          0          0          0          0     12 Overweight
## 6         NA          0          0          0          0     12    Healthy
##   V03BMI V03WLKAID V03KIKBALL V03KPLK20B V03KPRK20B V03KPLK20D V03KPRK20D
## 1   23.4         0          1          0          0          0          0
## 2     NA        NA         NA         NA         NA         NA         NA
## 3     NA        NA         NA         NA         NA         NA         NA
## 4   34.3         0          1          1          0          1          0
## 5     NA        NA         NA         NA         NA         NA         NA
## 6   23.8         0          1          0          0          0          0
##   time03 V03BMICAT V05BMI V05WLKAID V05KIKBALL V05KPLK20B V05KPRK20B
## 1     24   Healthy   23.7         0         NA          0          0
## 2     24      <NA>   28.8         0          1          0          0
## 3     24      <NA>     NA        NA         NA         NA         NA
## 4     24     Obese   33.6         0         NA          0          0
## 5     24      <NA>     NA        NA         NA         NA         NA
## 6     24   Healthy   23.3         0         NA          0          0
##   V05KPLK20D V05KPRK20D time05  V05BMICAT V06BMI V06WLKAID V06KIKBALL
## 1          0          0     36    Healthy   24.7         0          1
## 2          0          0     36 Overweight   28.4         0          1
## 3         NA         NA     36       <NA>     NA        NA         NA
## 4          0          0     36      Obese   34.2         0          1
## 5         NA         NA     36       <NA>     NA        NA         NA
## 6          0          1     36    Healthy     NA        NA         NA
##   V06KPLK20B V06KPRK20B V06KPLK20D V06KPRK20D time06  V06BMICAT V08BMI
## 1          0          0          0          0     48    Healthy   24.5
## 2          0          0          0          0     48 Overweight   28.2
## 3         NA         NA         NA         NA     48       <NA>     NA
## 4          0          0          1          0     48      Obese   36.2
## 5         NA         NA         NA         NA     48       <NA>     NA
## 6         NA         NA         NA         NA     48       <NA>     NA
##   V08WLKAID V08KIKBALL V08KPLK20B V08KPRK20B V08KPLK20D V08KPRK20D time08
## 1         0          1          2          2          0          0     72
## 2         0          1          0          0          0          0     72
## 3        NA         NA         NA         NA         NA         NA     72
## 4         0          1          0          0          0          0     72
## 5        NA         NA         NA         NA         NA         NA     72
## 6        NA         NA         NA         NA         NA         NA     72
##    V08BMICAT V10BMI V10WLKAID V10KIKBALL V10KPLK20B V10KPRK20B V10KPLK20D
## 1    Healthy   21.7         0          1          1          0          1
## 2 Overweight   28.1         0          1          0          0          0
## 3       <NA>     NA        NA         NA         NA         NA         NA
## 4      Obese   34.3         0          1          0          0          0
## 5       <NA>     NA        NA         NA         NA         NA         NA
## 6       <NA>   22.9         0         NA          0          0          0
##   V10KPRK20D time10  V10BMICAT
## 1          0     96    Healthy
## 2          0     96 Overweight
## 3         NA     96       <NA>
## 4          0     96      Obese
## 5         NA     96       <NA>
## 6          0     96    Healthy

JointSx

The JointSx datasets contain questionnaire results regarding arthritis symptoms in the knee, hip, back, and other joints; arthritis-related joint function and disability; and general health-related function and disability. It also contains data from several standardized instruments for arthritis and general health, including: Western Ontario and McMasters Osteoarthritis Index (WOMAC), the Knee Outcomes of Osteoarthritis Scale (KOOS), and the Medical Outcomes Study (SF12).

Our interest is to study pmlkrcv and pmrkrcvthat measure Left/right knee pain severity in the past 30 days:

  • 0 = No pain;
  • 10 = Pain as bad as you can imagine
jointsx00 <- read.csv('data/jointsx/jointsx00.csv')
jointsx01 <- read.csv('data/jointsx/jointsx01.csv')
jointsx02 <- read.csv('data/jointsx/jointsx02.csv')
jointsx03 <- read.csv('data/jointsx/jointsx03.csv')
jointsx04 <- read.csv('data/jointsx/jointsx04.csv')
jointsx05 <- read.csv('data/jointsx/jointsx05.csv')
jointsx06 <- read.csv('data/jointsx/jointsx06.csv')
jointsx07 <- read.csv('data/jointsx/jointsx07.csv')
jointsx08 <- read.csv('data/jointsx/jointsx08.csv')
jointsx09 <- read.csv('data/jointsx/jointsx09.csv')
jointsx10 <- read.csv('data/jointsx/jointsx10.csv')
jointsx11 <- read.csv('data/jointsx/jointsx11.csv')

Now addd time column to jointsxxx data frames

jointsx00$time00 <- 0
jointsx01$time01 <- 12
jointsx02$time02 <- 18
jointsx03$time03 <- 24
jointsx04$time04 <- 30
jointsx05$time05 <- 36
jointsx06$time06 <- 48
jointsx07$time07 <- 60
jointsx08$time08 <- 72
jointsx09$time09 <- 84
jointsx10$time10 <- 96
jointsx11$time11 <- 108

Now we select only the features we need

jointsx00 <- jointsx00 %>% 
   select(ID, P01PMRKRCV, P01PMLKRCV,time00)
jointsx01 <- jointsx01 %>% 
   select(ID, V01PMRKRCV, V01PMLKRCV,time01)
jointsx03 <- jointsx03 %>% 
   select(ID, V03PMRKRCV, V03PMLKRCV,time03)
jointsx05 <- jointsx05 %>% 
   select(ID, V05PMRKRCV, V05PMLKRCV,time05)
jointsx06 <- jointsx06 %>% 
   select(ID, V06PMRKRCV, V06PMLKRCV,time06)
jointsx07 <- jointsx07 %>% 
   select(ID, V07PMRKRCV, V07PMLKRCV,time07)
jointsx08 <- jointsx08 %>% 
   select(ID, V08PMRKRCV, V08PMLKRCV,time08)
jointsx09 <- jointsx09 %>% 
   select(ID, V09PMRKRCV, V09PMLKRCV,time09)
jointsx10 <- jointsx10 %>% 
   select(ID, V10PMRKRCV, V10PMLKRCV,time10)
jointsx11 <- jointsx11 %>% 
   select(ID, V11PMRKRCV, V11PMLKRCV,time11)

Create jointsx data frame with Joint Symptoms/Function info for all the visits

jointsx <- jointsx00 %>% 
  full_join(jointsx01) %>%
  full_join(jointsx03) %>%
  full_join(jointsx05) %>%
  full_join(jointsx06) %>%
  full_join(jointsx07) %>%
  full_join(jointsx08) %>%
  full_join(jointsx09) %>%
  full_join(jointsx10) %>%
  full_join(jointsx11)
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
## Joining, by = "ID"
dim(jointsx)
## [1] 4796   31

Define jointsx_sa data frame with info to perform survival analysis

# merge with base to have only the same id's
jointsx_sa <- jointsx %>% 
  right_join(base, by =c("ID"= "id")) %>% 
  select (c(1:31))

dim(jointsx_sa)
## [1] 4624   31
head(jointsx_sa)
##        ID P01PMRKRCV P01PMLKRCV time00 V01PMRKRCV V01PMLKRCV time01
## 1 9000099          3          2      0          0          0     12
## 2 9000296          0          0      0          0          0     12
## 3 9000622          2          0      0          5          0     12
## 4 9000798          0          4      0          3          7     12
## 5 9001104          0          0      0          4          0     12
## 6 9001400          2          0      0          3          2     12
##   V03PMRKRCV V03PMLKRCV time03 V05PMRKRCV V05PMLKRCV time05 V06PMRKRCV
## 1          0          3     24          1          1     36          2
## 2          0          0     24          0          0     36          5
## 3         NA         NA     24         NA         NA     36         NA
## 4          0          5     24          0          4     36          0
## 5          5          4     24         NA         NA     36         NA
## 6          2          0     24          3          3     36          3
##   V06PMLKRCV time06 V07PMRKRCV V07PMLKRCV time07 V08PMRKRCV V08PMLKRCV
## 1          2     48          5          2     60          2          2
## 2          0     48          0          0     60          0          0
## 3         NA     48         NA         NA     60         NA         NA
## 4          3     48          0          4     60          0          5
## 5         NA     48         NA         NA     60         NA         NA
## 6          1     48          2          0     60          3          2
##   time08 V09PMRKRCV V09PMLKRCV time09 V10PMRKRCV V10PMLKRCV time10
## 1     72          0          0     84          1          1     96
## 2     72          0          0     84          0          0     96
## 3     72         NA         NA     84         NA         NA     96
## 4     72          0          5     84          0          4     96
## 5     72         NA         NA     84          2          0     96
## 6     72          3          0     84          0          0     96
##   V11PMRKRCV V11PMLKRCV time11
## 1          3          5    108
## 2          0          0    108
## 3         NA         NA    108
## 4          0          4    108
## 5         NA         NA    108
## 6          0          0    108

Time Independent Features

Now let us add some time independent feature like race, sex and age (at the study joining time)

tic_col <- read_csv("data/enrollees/enrollees.csv") %>% 
  select(id = ID, race = P02RACE, sex = P02SEX)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   V00SITE = col_character()
## )
## See spec(...) for full column specifications.
subj <-  read_csv("data/subjectchar/subjectchar00.csv") %>% 
  select(id = ID, age = V00AGE)
## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   VERSION = col_character(),
##   P02DATE = col_date(format = ""),
##   P02HR10 = col_character(),
##   P02HR11 = col_character(),
##   P01SVDATE = col_date(format = ""),
##   V00EVDATE = col_date(format = ""),
##   V00ENRCR2 = col_character()
## )
## See spec(...) for full column specifications.
tic_col$sex <- factor(tic_col$sex, 
                     levels = c("1", "2"), 
                     labels = c("Male", "Female"))
tic_col$race <- factor(tic_col$race, 
                      levels = c("0", "1", "2", "3"), 
                      labels = c("Other Non-white", "White or Caucasian", 
                                 "Black or African American", "Asian"))
tic_col$race <- relevel(tic_col$race, "White or Caucasian")
tic_col <- tic_col %>% 
  full_join(subj)
## Joining, by = "id"

Final Dataset

Finally we use tmerge function from the survival package to get the data in the start/stop format

# Set time range for each patient
base_sa <- tmerge(base, base, id=id, knee_rep = event(time, cens))

head(base_sa, 30)
##         id time cens tstart tstop knee_rep
## 1  9000099  108    0      0   108        0
## 2  9000296  108    0      0   108        0
## 3  9000622   12    0      0    12        0
## 4  9000798  108    0      0   108        0
## 5  9001104   24    0      0    24        0
## 6  9001400  108    0      0   108        0
## 7  9001695   84    0      0    84        0
## 8  9001897  108    0      0   108        0
## 9  9002116  108    0      0   108        0
## 10 9002316  108    0      0   108        0
## 11 9002411   48    0      0    48        0
## 12 9002430   48    1      0    48        1
## 13 9002817  108    0      0   108        0
## 14 9003126  108    0      0   108        0
## 15 9003175  108    0      0   108        0
## 16 9003316  108    0      0   108        0
## 17 9003380  108    0      0   108        0
## 18 9003406  108    0      0   108        0
## 19 9003430   84    0      0    84        0
## 20 9003658  108    0      0   108        0
## 21 9003815  108    0      0   108        0
## 22 9003895  108    0      0   108        0
## 23 9004175  108    0      0   108        0
## 24 9004184  108    0      0   108        0
## 25 9004315  108    0      0   108        0
## 26 9004462   12    0      0    12        0
## 27 9004669  108    0      0   108        0
## 28 9004905  108    0      0   108        0
## 29 9005075   36    0      0    36        0
## 30 9005132   24    1      0    24        1
# Add time dependent covariate: stroke
OAI_sa <- tmerge(base_sa, stroke_sa, id = ID, 
                 stroke = tdc(time00, V00STROKE),
                 stroke = tdc(time03, V03STROKE),
                 stroke = tdc(time06, V06STROKE),
                 stroke = tdc(time08, V08STROKE),
                 stroke = tdc(time10, V10STROKE))

# Add time dependent covariate: bmi
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 bmi = tdc(time00, P01BMI),
                 bmi = tdc(time01, V01BMI),
                 bmi = tdc(time03, V03BMI),
                 bmi = tdc(time05, V05BMI),
                 bmi = tdc(time06, V06BMI),
                 bmi = tdc(time08, V08BMI),
                 bmi = tdc(time10, V10BMI))

# Add time dependent covariate: bmi cateogry
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID,
                 bmi_cat = tdc(time00, as_factor(V00BMICAT)),
                 bmi_cat = tdc(time01, as_factor(V01BMICAT)),
                 bmi_cat = tdc(time03, as_factor(V03BMICAT)),
                 bmi_cat = tdc(time05, as_factor(V05BMICAT)),
                 bmi_cat = tdc(time06, as_factor(V06BMICAT)),
                 bmi_cat = tdc(time08, as_factor(V08BMICAT)),
                 bmi_cat = tdc(time10, as_factor(V10BMICAT)))

# Add time dependent covariate: WLKAID
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 wlkaid = tdc(time00, V00WLKAID),
                 wlkaid = tdc(time01, V01WLKAID),
                 wlkaid = tdc(time03, V03WLKAID),
                 wlkaid = tdc(time05, V05WLKAID),
                 wlkaid = tdc(time06, V06WLKAID),
                 wlkaid = tdc(time08, V08WLKAID),
                 wlkaid = tdc(time10, V10WLKAID))

# Add time dependent covariate: KIKBALL
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 kikball = tdc(time00, V00KIKBALL),
                 kikball = tdc(time01, V01KIKBALL),
                 kikball = tdc(time03, V03KIKBALL),
                 kikball = tdc(time05, V05KIKBALL),
                 kikball = tdc(time06, V06KIKBALL),
                 kikball = tdc(time08, V08KIKBALL),
                 kikball = tdc(time10, V10KIKBALL))

# Add time dependent covariate: kplk20b
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 kplk20b = tdc(time01, V01KPLK20B),
                 kplk20b = tdc(time03, V03KPLK20B),
                 kplk20b = tdc(time05, V05KPLK20B),
                 kplk20b = tdc(time06, V06KPLK20B),
                 kplk20b = tdc(time08, V08KPLK20B),
                 kplk20b = tdc(time10, V10KPLK20B))

# Add time dependent covariate: kplk20d
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 kplk20d = tdc(time01, V01KPLK20D),
                 kplk20d = tdc(time03, V03KPLK20D),
                 kplk20d = tdc(time05, V05KPLK20D),
                 kplk20d = tdc(time06, V06KPLK20D),
                 kplk20d = tdc(time08, V08KPLK20D),
                 kplk20d = tdc(time10, V10KPLK20D))

# Add time dependent covariate: kprk20b
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 kprk20b = tdc(time01, V01KPRK20B),
                 kprk20b = tdc(time03, V03KPRK20B),
                 kprk20b = tdc(time05, V05KPRK20B),
                 kprk20b = tdc(time06, V06KPRK20B),
                 kprk20b = tdc(time08, V08KPRK20B),
                 kprk20b = tdc(time10, V10KPRK20B))

# Add time dependent covariate: kprk20r
OAI_sa <- tmerge(OAI_sa, physical_exam_sa, id = ID, 
                 kprk20d = tdc(time01, V01KPRK20D),
                 kprk20d = tdc(time03, V03KPRK20D),
                 kprk20d = tdc(time05, V05KPRK20D),
                 kprk20d = tdc(time06, V06KPRK20D),
                 kprk20d = tdc(time08, V08KPRK20D),
                 kprk20d = tdc(time10, V10KPRK20D))

# Add time dependent covariate: PMRKRCV
OAI_sa <- tmerge(OAI_sa, jointsx_sa, id = ID,
                 PMRKRCV = tdc(time00, P01PMRKRCV),
                 PMRKRCV = tdc(time01, V01PMRKRCV),
                 PMRKRCV = tdc(time03, V03PMRKRCV),
                 PMRKRCV = tdc(time05, V05PMRKRCV),
                 PMRKRCV = tdc(time06, V06PMRKRCV),
                 PMRKRCV = tdc(time07, V07PMRKRCV),
                 PMRKRCV = tdc(time08, V08PMRKRCV),
                 PMRKRCV = tdc(time09, V09PMRKRCV),
                 PMRKRCV = tdc(time10, V10PMRKRCV),
                 PMRKRCV = tdc(time11, V11PMRKRCV))

# Add time dependent covariate: PMLKRCV
OAI_sa <- tmerge(OAI_sa, jointsx_sa, id = ID,
                 PMLKRCV = tdc(time00, P01PMLKRCV),
                 PMLKRCV = tdc(time01, V01PMLKRCV),
                 PMLKRCV = tdc(time03, V03PMLKRCV),
                 PMLKRCV = tdc(time05, V05PMLKRCV),
                 PMLKRCV = tdc(time06, V06PMLKRCV),
                 PMLKRCV = tdc(time07, V07PMLKRCV),
                 PMLKRCV = tdc(time08, V08PMLKRCV),
                 PMLKRCV = tdc(time09, V09PMLKRCV),
                 PMLKRCV = tdc(time10, V10PMLKRCV),
                 PMLKRCV = tdc(time11, V11PMLKRCV))

OAI_sa <- merge(x = OAI_sa, y = tic_col, by = "id")
head(OAI_sa, 30)
##         id time cens tstart tstop knee_rep stroke  bmi    bmi_cat wlkaid
## 1  9000099  108    0      0    12        0      0 23.8    Healthy      0
## 2  9000099  108    0     12    24        0      0 24.1    Healthy      0
## 3  9000099  108    0     24    36        0      0 23.4    Healthy      0
## 4  9000099  108    0     36    48        0      0 23.7    Healthy      0
## 5  9000099  108    0     48    60        0      0 24.7    Healthy      0
## 6  9000099  108    0     60    72        0      0 24.7    Healthy      0
## 7  9000099  108    0     72    84        0      0 24.5    Healthy      0
## 8  9000099  108    0     84    96        0      0 24.5    Healthy      0
## 9  9000099  108    0     96   108        0      0 21.7    Healthy      0
## 10 9000296  108    0      0    12        0      0 29.8 Overweight      0
## 11 9000296  108    0     12    24        0      0 29.4 Overweight      0
## 12 9000296  108    0     24    36        0      0 29.4 Overweight      0
## 13 9000296  108    0     36    48        0      0 28.8 Overweight      0
## 14 9000296  108    0     48    60        0      0 28.4 Overweight      0
## 15 9000296  108    0     60    72        0      0 28.4 Overweight      0
## 16 9000296  108    0     72    84        0      0 28.2 Overweight      0
## 17 9000296  108    0     84    96        0      0 28.2 Overweight      0
## 18 9000296  108    0     96   108        0      0 28.1 Overweight      0
## 19 9000622   12    0      0    12        0      1 22.7    Healthy      0
## 20 9000798  108    0      0    12        0      0 32.4      Obese      0
## 21 9000798  108    0     12    24        0      0 32.8      Obese      0
## 22 9000798  108    0     24    36        0      0 34.3      Obese      0
## 23 9000798  108    0     36    48        0      0 33.6      Obese      0
## 24 9000798  108    0     48    60        0      0 34.2      Obese      0
## 25 9000798  108    0     60    72        0      0 34.2      Obese      0
## 26 9000798  108    0     72    84        0      0 36.2      Obese      0
## 27 9000798  108    0     84    96        0      0 36.2      Obese      0
## 28 9000798  108    0     96   108        0      0 34.3      Obese      0
## 29 9001104   24    0      0    12        0      0 30.7      Obese      0
## 30 9001104   24    0     12    24        0      0 28.7 Overweight      0
##    kikball kplk20b kplk20d kprk20b kprk20d PMRKRCV PMLKRCV
## 1        1      NA      NA      NA      NA       3       2
## 2        1       0       0       0       0       0       0
## 3        1       0       0       0       0       0       3
## 4        1       0       0       0       0       1       1
## 5        1       0       0       0       0       2       2
## 6        1       0       0       0       0       5       2
## 7        1       2       0       2       0       2       2
## 8        1       2       0       2       0       0       0
## 9        1       1       1       0       0       1       1
## 10       1      NA      NA      NA      NA       0       0
## 11       1       0       0       0       0       0       0
## 12       1       0       0       0       0       0       0
## 13       1       0       0       0       0       0       0
## 14       1       0       0       0       0       5       0
## 15       1       0       0       0       0       0       0
## 16       1       0       0       0       0       0       0
## 17       1       0       0       0       0       0       0
## 18       1       0       0       0       0       0       0
## 19       1      NA      NA      NA      NA       2       0
## 20       1      NA      NA      NA      NA       0       4
## 21       1       2       1       0       0       3       7
## 22       1       1       1       0       0       0       5
## 23       1       0       0       0       0       0       4
## 24       1       0       1       0       0       0       3
## 25       1       0       1       0       0       0       4
## 26       1       0       0       0       0       0       5
## 27       1       0       0       0       0       0       5
## 28       1       0       0       0       0       0       4
## 29       1      NA      NA      NA      NA       0       0
## 30       1       0       0       0       0       4       0
##                  race    sex age
## 1  White or Caucasian   Male  59
## 2  White or Caucasian   Male  59
## 3  White or Caucasian   Male  59
## 4  White or Caucasian   Male  59
## 5  White or Caucasian   Male  59
## 6  White or Caucasian   Male  59
## 7  White or Caucasian   Male  59
## 8  White or Caucasian   Male  59
## 9  White or Caucasian   Male  59
## 10 White or Caucasian   Male  69
## 11 White or Caucasian   Male  69
## 12 White or Caucasian   Male  69
## 13 White or Caucasian   Male  69
## 14 White or Caucasian   Male  69
## 15 White or Caucasian   Male  69
## 16 White or Caucasian   Male  69
## 17 White or Caucasian   Male  69
## 18 White or Caucasian   Male  69
## 19 White or Caucasian Female  71
## 20 White or Caucasian   Male  56
## 21 White or Caucasian   Male  56
## 22 White or Caucasian   Male  56
## 23 White or Caucasian   Male  56
## 24 White or Caucasian   Male  56
## 25 White or Caucasian   Male  56
## 26 White or Caucasian   Male  56
## 27 White or Caucasian   Male  56
## 28 White or Caucasian   Male  56
## 29 White or Caucasian Female  72
## 30 White or Caucasian Female  72
# first 20 rows of columns necessary to perform survival analysis
OAI_sa[1:20, c(1, 4:20)]
##         id tstart tstop knee_rep stroke  bmi    bmi_cat wlkaid kikball
## 1  9000099      0    12        0      0 23.8    Healthy      0       1
## 2  9000099     12    24        0      0 24.1    Healthy      0       1
## 3  9000099     24    36        0      0 23.4    Healthy      0       1
## 4  9000099     36    48        0      0 23.7    Healthy      0       1
## 5  9000099     48    60        0      0 24.7    Healthy      0       1
## 6  9000099     60    72        0      0 24.7    Healthy      0       1
## 7  9000099     72    84        0      0 24.5    Healthy      0       1
## 8  9000099     84    96        0      0 24.5    Healthy      0       1
## 9  9000099     96   108        0      0 21.7    Healthy      0       1
## 10 9000296      0    12        0      0 29.8 Overweight      0       1
## 11 9000296     12    24        0      0 29.4 Overweight      0       1
## 12 9000296     24    36        0      0 29.4 Overweight      0       1
## 13 9000296     36    48        0      0 28.8 Overweight      0       1
## 14 9000296     48    60        0      0 28.4 Overweight      0       1
## 15 9000296     60    72        0      0 28.4 Overweight      0       1
## 16 9000296     72    84        0      0 28.2 Overweight      0       1
## 17 9000296     84    96        0      0 28.2 Overweight      0       1
## 18 9000296     96   108        0      0 28.1 Overweight      0       1
## 19 9000622      0    12        0      1 22.7    Healthy      0       1
## 20 9000798      0    12        0      0 32.4      Obese      0       1
##    kplk20b kplk20d kprk20b kprk20d PMRKRCV PMLKRCV               race
## 1       NA      NA      NA      NA       3       2 White or Caucasian
## 2        0       0       0       0       0       0 White or Caucasian
## 3        0       0       0       0       0       3 White or Caucasian
## 4        0       0       0       0       1       1 White or Caucasian
## 5        0       0       0       0       2       2 White or Caucasian
## 6        0       0       0       0       5       2 White or Caucasian
## 7        2       0       2       0       2       2 White or Caucasian
## 8        2       0       2       0       0       0 White or Caucasian
## 9        1       1       0       0       1       1 White or Caucasian
## 10      NA      NA      NA      NA       0       0 White or Caucasian
## 11       0       0       0       0       0       0 White or Caucasian
## 12       0       0       0       0       0       0 White or Caucasian
## 13       0       0       0       0       0       0 White or Caucasian
## 14       0       0       0       0       5       0 White or Caucasian
## 15       0       0       0       0       0       0 White or Caucasian
## 16       0       0       0       0       0       0 White or Caucasian
## 17       0       0       0       0       0       0 White or Caucasian
## 18       0       0       0       0       0       0 White or Caucasian
## 19      NA      NA      NA      NA       2       0 White or Caucasian
## 20      NA      NA      NA      NA       0       4 White or Caucasian
##       sex age
## 1    Male  59
## 2    Male  59
## 3    Male  59
## 4    Male  59
## 5    Male  59
## 6    Male  59
## 7    Male  59
## 8    Male  59
## 9    Male  59
## 10   Male  69
## 11   Male  69
## 12   Male  69
## 13   Male  69
## 14   Male  69
## 15   Male  69
## 16   Male  69
## 17   Male  69
## 18   Male  69
## 19 Female  71
## 20   Male  56