Processing some variables
#names(total)
###GENeral health
## change general health in to categorial
GENH_map = c('Excellent',
'Very Good',
'Good',
'Fair',
'Poor',
'Don\'t know\\not sure',
'Refused')
total$GENHLTH_F = as.factor(GENH_map[total$GENHLTH])
total$GENHLTH_F = factor(total$GENHLTH_F,
levels(total$GENHLTH_F)[c(1,6,3,2,4,5,7)])
###AGE
AGE_map = c('18-24',
'25-29',
'30-34',
'35-39',
'40-44',
'45-49',
'50-54',
'55-59',
'60-64',
'65-69',
'70-74',
'75-79',
'80 to older',
'Missing')
total$X_AGEG5YR_F = as.factor(AGE_map[total$X_AGEG5YR])
##Physical health
total$PHYSHLTH_N = total$PHYSHLTH
total$PHYSHLTH_N[total$PHYSHLTH_N > 31] = 0
#mental health
total$MENTHLTH_N = total$MENTHLTH
total$MENTHLTH_N[total$MENTHLTH_N > 31] = 0
#Income map
Income_map <- c('Less than $10,000',
'Less than $15,000',
'Less than $20,000',
'Less than $25,000',
'Less than $35,000',
'Less than $50,000',
'Less than $75,000',
'$75,000 or more',
'Don\'t know/not sure',
'Refused',
NA)
total$INCOME2_F = as.factor(total$INCOME2)
total$INCOME2_F = as.factor(Income_map[total$INCOME2])
total$INCOME2_F = factor(total$INCOME2_F,
levels(total$INCOME2_F)[c(2,3,4,5,6,7,8,1,9,10)])
#BMI5
total$X_BMI_n = total$X_BMI5
total$X_BMI_n[total$X_BMI5 > 9998] = NA
total$X_BMI_n_100 = total$X_BMI_n/100
### Get the date right
total$IYEAR_F = as.factor(total$IYEAR)
##PAINDX and PACAT1
X_PAINDX1_N = c(x_2015$X_PAINDX1,
x_2013$X_PAINDX1,
x_2011$X_PAINDEX)
X_PAINDX1_F_map = c('Meet Aerobic Recommendations',
'Did Not Meet Aerobic Recommendations',
'Don’t know/Not Sure/Refused/Missing')
X_PAINDX1_F = as.factor(X_PAINDX1_F_map[X_PAINDX1_N])
X_PACAT1_N = c(x_2015$X_PACAT1,
x_2013$X_PACAT1,
x_2011$X_PACAT)
X_PACAT1_F_map = c('Highly Active',
'Active',
'Insufficiently Active',
'Inactive',
'Don\'t know, not sure')
X_PACAT1_F = as.factor(X_PACAT1_F_map[X_PACAT1_N])
total$X_PAINDX1_F = X_PAINDX1_F
total$X_PACAT1_F = X_PACAT1_F
###PAFRE and PARDUR
PADUR1_ = as.numeric(total$PADUR1_)
PAFREQ1_ = as.numeric(total$PAFREQ1_)
brk1 = c(0, 30, 45, 60, 270)
brk2 = c(233, 2000, 3000, 5000, 98000)
cutPADUR = cut(PADUR1_, breaks = brk1, include.lowest = T)
cutPAFRE = cut(PAFREQ1_, breaks = brk2, include.lowest = T)
total$cutPADUR = cutPADUR
total$cutPAFRE = cutPAFRE
total$cutPADUR %>% summary()
total$cutPAFRE %>% summary()
###dependent variables
X_PAINDX1_F_map = c(FALSE, TRUE)
total$X_PAINDX1_F_logic = as.factor(X_PAINDX1_F_map[as.numeric(total$X_PAINDX1_F)]) %>% as.logical()
X_PASTRNG_F_map = c(FALSE, TRUE)
total$X_PASTRNG_F_logic =
as.factor(X_PASTRNG_F_map[as.numeric(total$X_PASTRNG)]) %>% as.logical()
###health care coverage
X_HCVU651_F_map = c(TRUE, FALSE, NULL)
total$X_HCVU651_F = as.factor(X_HCVU651_F_map[as.numeric(total$X_HCVU651)])
## Chronic diseases CVDCRHD4(heart disease), CHCOCNCR(cancer), DIABETE3(diabetes)
CVDCRHD4_F_map = c(TRUE, FALSE, NULL, NULL)
total$CVDCRHD4_F = as.factor(CVDCRHD4_F_map[as.numeric(total$CVDCRHD4)])
CHCOCNCR_F_map = c(TRUE, FALSE, NULL, NULL)
total$CHCOCNCR_F = as.factor(CHCOCNCR_F_map[as.numeric(total$CHCOCNCR)])
DIABETE3_F_map = c(TRUE, FALSE, FALSE, FALSE, NULL, NULL)
total$DIABETE3_F= as.factor(DIABETE3_F_map[as.numeric(total$DIABETE3)])
Subset into data that is used for modeling
#total = readRDS('total-05-31.rds')
#saveRDS(total, 'total-05-31.rds')
select = c('X_PAINDX1_F_logic', 'X_PASTRNG_F_logic','X_HCVU651_F',
'cutPADUR', 'cutPAFRE', 'X_AGE_G',
'SEX', 'EXRACT11',
'X_BMI5CAT', 'CVDCRHD4_F',
'CHCOCNCR_F', 'DIABETE3_F',
'PADUR1_', 'PAFREQ1_',
'X_BMI5'
)
exer_data1 <- total[,select]
exer_data1[,3][1:10]
exer_data1[,(3:12)] = lapply(exer_data1[, (3:12)], factor)
exer_data1 <- exer_data1[complete.cases(exer_data1),]
dim(exer_data1)
#saveRDS(exer_data1, 'exer_data1.rds')