Load data

### run this script on local cuz it is slow when runing on rmd

rm(list = ls())
library("rio")
library(magrittr)
x_2015 <- import('LLCP2015.XPT')
x_2013 <- import('LLCP2013.XPT')
x_2011 <- import('LLCP2011.XPT')
## merge
names(x_2011)[85] = c('EXRACT11')
names(x_2011)[88] = 'EXRACT21'
inter1500 <- Reduce(intersect, list(names(x_2015),
                                   names(x_2013),
                                    names(x_2011)
                                    ))


index_2015 <- which(names(x_2015) %in% inter1500)
index_2013 <- which(names(x_2013) %in% inter1500)
index_2011 <- which(names(x_2011) %in% inter1500)

total <- rbind(x_2015[, index_2015],
               x_2013[, index_2013],
               x_2011[, index_2011])

Processing some variables

#names(total)
###GENeral health
## change general health in to categorial
GENH_map = c('Excellent',
             'Very Good',
             'Good',
             'Fair',
             'Poor',
             'Don\'t know\\not sure',
             'Refused')
total$GENHLTH_F = as.factor(GENH_map[total$GENHLTH])
total$GENHLTH_F = factor(total$GENHLTH_F,
                         levels(total$GENHLTH_F)[c(1,6,3,2,4,5,7)])

###AGE
AGE_map = c('18-24',
            '25-29',
            '30-34',
            '35-39',
            '40-44',
            '45-49',
            '50-54',
            '55-59',
            '60-64',
            '65-69',
            '70-74',
            '75-79',
            '80 to older',
            'Missing')

total$X_AGEG5YR_F = as.factor(AGE_map[total$X_AGEG5YR])

##Physical health
total$PHYSHLTH_N = total$PHYSHLTH
total$PHYSHLTH_N[total$PHYSHLTH_N > 31] = 0

#mental health
total$MENTHLTH_N = total$MENTHLTH
total$MENTHLTH_N[total$MENTHLTH_N > 31] = 0

#Income map
Income_map <- c('Less than $10,000',
                'Less than $15,000',
                'Less than $20,000',
                'Less than $25,000',
                'Less than $35,000',
                'Less than $50,000',
                'Less than $75,000',
                '$75,000 or more',
                'Don\'t know/not sure',
                'Refused',
                NA)
total$INCOME2_F = as.factor(total$INCOME2)
total$INCOME2_F = as.factor(Income_map[total$INCOME2])
total$INCOME2_F = factor(total$INCOME2_F,
                    levels(total$INCOME2_F)[c(2,3,4,5,6,7,8,1,9,10)])

#BMI5
total$X_BMI_n = total$X_BMI5
total$X_BMI_n[total$X_BMI5 > 9998] = NA
total$X_BMI_n_100 = total$X_BMI_n/100

### Get the date right
total$IYEAR_F = as.factor(total$IYEAR)

##PAINDX and PACAT1
X_PAINDX1_N = c(x_2015$X_PAINDX1,
        x_2013$X_PAINDX1,
        x_2011$X_PAINDEX)

X_PAINDX1_F_map = c('Meet Aerobic Recommendations',
            'Did Not Meet Aerobic Recommendations',
            'Don’t know/Not Sure/Refused/Missing')


X_PAINDX1_F = as.factor(X_PAINDX1_F_map[X_PAINDX1_N])


X_PACAT1_N = c(x_2015$X_PACAT1,
               x_2013$X_PACAT1,
               x_2011$X_PACAT)

X_PACAT1_F_map = c('Highly Active',
                   'Active',
                   'Insufficiently Active',
                   'Inactive',
                   'Don\'t know, not sure')

X_PACAT1_F = as.factor(X_PACAT1_F_map[X_PACAT1_N])
total$X_PAINDX1_F = X_PAINDX1_F
total$X_PACAT1_F = X_PACAT1_F

###PAFRE and PARDUR
PADUR1_ = as.numeric(total$PADUR1_)
PAFREQ1_ = as.numeric(total$PAFREQ1_)

brk1 = c(0, 30, 45, 60, 270)
brk2 = c(233, 2000, 3000, 5000, 98000)

cutPADUR = cut(PADUR1_, breaks = brk1, include.lowest = T)
cutPAFRE = cut(PAFREQ1_, breaks = brk2, include.lowest = T)

total$cutPADUR = cutPADUR
total$cutPAFRE = cutPAFRE

total$cutPADUR %>% summary()
total$cutPAFRE %>% summary()

###dependent variables 
X_PAINDX1_F_map = c(FALSE, TRUE)
total$X_PAINDX1_F_logic = as.factor(X_PAINDX1_F_map[as.numeric(total$X_PAINDX1_F)]) %>% as.logical()

X_PASTRNG_F_map = c(FALSE, TRUE)
total$X_PASTRNG_F_logic = 
    as.factor(X_PASTRNG_F_map[as.numeric(total$X_PASTRNG)]) %>% as.logical()

###health care coverage
X_HCVU651_F_map = c(TRUE, FALSE, NULL)
total$X_HCVU651_F = as.factor(X_HCVU651_F_map[as.numeric(total$X_HCVU651)])

## Chronic diseases CVDCRHD4(heart disease), CHCOCNCR(cancer), DIABETE3(diabetes)
CVDCRHD4_F_map = c(TRUE, FALSE, NULL, NULL)
total$CVDCRHD4_F = as.factor(CVDCRHD4_F_map[as.numeric(total$CVDCRHD4)])

CHCOCNCR_F_map = c(TRUE, FALSE, NULL, NULL)
total$CHCOCNCR_F = as.factor(CHCOCNCR_F_map[as.numeric(total$CHCOCNCR)])

DIABETE3_F_map = c(TRUE, FALSE, FALSE, FALSE, NULL, NULL)
total$DIABETE3_F= as.factor(DIABETE3_F_map[as.numeric(total$DIABETE3)])

Subset into data that is used for modeling

#total = readRDS('total-05-31.rds')
#saveRDS(total, 'total-05-31.rds')
select = c('X_PAINDX1_F_logic', 'X_PASTRNG_F_logic','X_HCVU651_F', 
               'cutPADUR', 'cutPAFRE', 'X_AGE_G',
               'SEX',  'EXRACT11',
               'X_BMI5CAT', 'CVDCRHD4_F',
           'CHCOCNCR_F', 'DIABETE3_F',
           'PADUR1_', 'PAFREQ1_',
           'X_BMI5'
           )
exer_data1 <- total[,select]
exer_data1[,3][1:10]
exer_data1[,(3:12)] = lapply(exer_data1[, (3:12)], factor)
exer_data1 <- exer_data1[complete.cases(exer_data1),]

dim(exer_data1)
#saveRDS(exer_data1, 'exer_data1.rds')
Make a table to extract the met value
metDf <- cbind(x_2015$EXRACT11, x_2015$METVL11_)
metDf <- metDf[!duplicated(metDf),]
metDF <- data.frame(type = metDf[,1], met = metDf[, 2])
saveRDS(metDF, 'met-table.rds')
#exer_data1 = readRDS('exer_data1.rds')
met_table = readRDS('met-table.rds')
exer_type = as.numeric(exer_data1$EXRACT11)
exer_met = rep(0, length(exer_type))
for (i in 1:length(exer_met)) {
    if (length(which(met_table$type== exer_type[i])) != 0) {
    exer_met[i] = met_table$met[which(met_table$type== exer_type[i])]
    }
}
exer_data1$met = as.factor(exer_met)
saveRDS(exer_data1, 'exer_data1.rds')