# import required packages
library(caTools)
library(VIM)
library(DMwR)
library(tidyverse)
library(ROCR)
library(sqldf)
# model packages
library(randomForest)
library(e1071)
# mice package
library(mice)
# get data
path <- "/Users/cy/Desktop/dazuoye/sas/framingham_heart_disease.csv"
raw_data <- read.csv(path)
# 这边可以考虑增加变量收缩压与舒张压之差、描述收缩压、舒张压与高血压等级的变量
# reference: https://zh.wikipedia.org/wiki/%E8%A1%80%E5%A3%93
# reference: http://blog.sina.com.cn/s/blog_6406a7740102v4mz.html
# looking at the structure of data
str(raw_data)
'data.frame': 4238 obs. of 16 variables:
$ male : int 1 0 1 0 0 0 0 0 1 1 ...
$ age : int 39 46 48 61 46 43 63 45 52 43 ...
$ education : int 4 2 1 3 3 2 1 2 1 1 ...
$ currentSmoker : int 0 0 1 1 1 0 0 1 0 1 ...
$ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
$ BPMeds : int 0 0 0 0 0 0 0 0 0 0 ...
$ prevalentStroke: int 0 0 0 0 0 0 0 0 0 0 ...
$ prevalentHyp : int 0 0 0 1 0 1 0 0 1 1 ...
$ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
$ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
$ sysBP : num 106 121 128 150 130 ...
$ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
$ BMI : num 27 28.7 25.3 28.6 23.1 ...
$ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
$ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
$ TenYearCHD : int 0 0 0 1 0 0 1 0 0 0 ...
# 考虑增加变量bplevel
raw_data <- sqldf("select *, case when sysBP < 90 and diaBP < 60 then 1
when (sysBP between 90 and 139) and (diaBP between 60 and 89) then 2
when sysBP >=140 and diaBP >= 90 then 3 else 4 end as bplevel
from raw_data")
# raw_data <- sqldf("select *, case when sysBP is not null and diaBP is not null
# then sysBP - diaBP else null end as sd from raw_data")
# 对变量类别进行区分
names_to_factor <- c("male", "education", "currentSmoker", "BPMeds",'prevalentStroke','prevalentHyp','diabetes','TenYearCHD', 'bplevel')
raw_data[names_to_factor] <- map(raw_data[names_to_factor], as.factor)
str(raw_data)
'data.frame': 4238 obs. of 17 variables:
$ male : Factor w/ 2 levels "0","1": 2 1 2 1 1 1 1 1 2 2 ...
$ age : int 39 46 48 61 46 43 63 45 52 43 ...
$ education : Factor w/ 4 levels "1","2","3","4": 4 2 1 3 3 2 1 2 1 1 ...
$ currentSmoker : Factor w/ 2 levels "0","1": 1 1 2 2 2 1 1 2 1 2 ...
$ cigsPerDay : int 0 0 20 30 23 0 0 20 0 30 ...
$ BPMeds : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ prevalentStroke: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ prevalentHyp : Factor w/ 2 levels "0","1": 1 1 1 2 1 2 1 1 2 2 ...
$ diabetes : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ totChol : int 195 250 245 225 285 228 205 313 260 225 ...
$ sysBP : num 106 121 128 150 130 ...
$ diaBP : num 70 81 80 95 84 110 71 71 89 107 ...
$ BMI : num 27 28.7 25.3 28.6 23.1 ...
$ heartRate : int 80 95 75 65 85 77 60 79 76 93 ...
$ glucose : int 77 76 70 103 85 99 85 78 79 88 ...
$ TenYearCHD : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 2 1 1 1 ...
$ bplevel : Factor w/ 4 levels "1","2","3","4": 2 2 2 3 2 3 2 2 4 3 ...
变量名 | 描述 | |
---|---|---|
male | 0 = Female; 1 = Male | |
age | Age at exam time | |
education | 1 = Some High School; 2 = High School or GED; 3 = Some College or Vocational School; 4 = college | |
currentSmoker | 0 = nonsmoker; 1 = smoker | |
cigsPerDay | number of cigarettes smoked per day (estimated average) | |
BPMeds | 0 = Not on Blood Pressure medications; 1 = Is on Blood Pressure medications | |
prevalentStroke | whether or not the patient had previously had a stroke | |
prevalentHyp | whether or not the patient was hypertensive(高血压) | |
diabetes | whether or not the patient had diabetes 0 = No; 1 = Yes | |
totChol | total cholesterol level 总胆固醇水平 mg/dL | |
sysBP | systolic blood pressure 收缩压 mmHg | |
diaBP | diastolic blood pressure 舒张压 mmHg | |
BMI | Body Mass Index calculated as: Weight (kg) / Height(meter-squared) | |
heartRate | Beats/Min (Ventricular 心室) | |
glucose | glucose level 糖尿病水平 | |
bplevel | 1 = 低血糖; 2 = 正常血压; 3 = 高血压; 4 = 其它 血压等级 | |
TenYearCHD | 10 year risk of coronary heart disease (Target) |
# 这里我们使用mice包进行缺失值处理
aggr(raw_data, prop=T, cex.lab = 0.9, cex.axis = 0.7)
matrixplot(raw_data, cex.axis = 0.7)
Click in a column to sort by the corresponding variable.
To regain use of the VIM GUI and the R console, click outside the plot region.
由上图可以看出,除了glucose变量,其它变量的缺失比例都低于5%,而glucose变量缺失率超过了10%。对此的处理策略是保留glucose变量的缺失值,直接删除其它变量的缺失值。 现在处理glucose的缺失值,
# 处理glucose列
filtered_data <- subset(raw_data, !is.na(education) & !is.na(cigsPerDay) & !is.na(BPMeds) & !is.na(totChol) & !is.na(BMI) & !is.na(heartRate))
# 查看glucose与其它变量的线性相关性确定mice的填充策略
glucoseLog = glm(glucose ~ ., data = filtered_data)
summary(glucoseLog)
Call:
glm(formula = glucose ~ ., data = filtered_data)
Deviance Residuals:
Min 1Q Median 3Q Max
-125.912 -8.542 -1.517 6.864 222.839
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 79.249824 11.558924 6.856 8.28e-12 ***
male1 0.872646 0.690526 1.264 0.206404
age 0.040159 0.043142 0.931 0.351990
education2 0.435531 0.766161 0.568 0.569758
education3 1.558096 0.915870 1.701 0.088987 .
education4 0.061647 1.049741 0.059 0.953174
currentSmoker1 0.091473 0.994228 0.092 0.926700
cigsPerDay -0.072377 0.042893 -1.687 0.091617 .
BPMeds1 0.740322 1.908862 0.388 0.698161
prevalentStroke1 1.585779 4.134256 0.384 0.701319
prevalentHyp1 -0.974976 1.106982 -0.881 0.378510
diabetes1 87.728554 1.936045 45.313 < 2e-16 ***
totChol -0.001776 0.007414 -0.240 0.810712
sysBP 0.124314 0.028570 4.351 1.39e-05 ***
diaBP -0.119063 0.048042 -2.478 0.013246 *
BMI 0.089097 0.084933 1.049 0.294235
heartRate 0.114831 0.026970 4.258 2.12e-05 ***
TenYearCHD1 3.378054 0.906749 3.725 0.000198 ***
bplevel2 -19.042551 10.887942 -1.749 0.080383 .
bplevel3 -20.095204 11.066460 -1.816 0.069473 .
bplevel4 -19.624199 10.965356 -1.790 0.073593 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for gaussian family taken to be 349.1509)
Null deviance: 2089542 on 3655 degrees of freedom
Residual deviance: 1269164 on 3635 degrees of freedom
(331 observations deleted due to missingness)
AIC: 31806
Number of Fisher Scoring iterations: 2
# mice包填充,排除不重要的变量。至于为什么不选diaBP,主要是后面的相关性分析中,这两个变量会造成多重共线性
mice_mod <- mice(filtered_data[, names(filtered_data) %in% c('diabetes', 'sysBP', 'heartRate', 'TenYearCHD', 'glucose')], m=5, method = "pmm", maxit = 50, seed=2333, print = FALSE)
#查看填充结果
summary(mice_mod)
Class: mids
Number of multiple imputations: 5
Imputation methods:
diabetes sysBP heartRate glucose TenYearCHD
"" "" "" "pmm" ""
PredictorMatrix:
diabetes sysBP heartRate glucose TenYearCHD
diabetes 0 1 1 1 1
sysBP 1 0 1 1 1
heartRate 1 1 0 1 1
glucose 1 1 1 0 1
TenYearCHD 1 1 1 1 0
# 查看原始数据和插补后的数据分布情况
densityplot(mice_mod)
stripplot(mice_mod, pch=12)
# 填充数据
mice_output <- complete(mice_mod, 4)
filtered_data$glucose <- mice_output$glucose
sum(is.na(filtered_data))
[1] 0
completed_data <- filtered_data
# 查看有无重复行并删除重复行
sum(duplicated(completed_data))
[1] 0
completed_data <- completed_data[!duplicated(completed_data), ]
# look into outliers
ggplot(completed_data)+geom_boxplot(aes(factor(1),age))
ggplot(completed_data)+geom_boxplot(aes(factor(1),cigsPerDay))
ggplot(completed_data)+geom_boxplot(aes(factor(1),totChol))
ggplot(completed_data)+geom_boxplot(aes(factor(1),sysBP))
ggplot(completed_data)+geom_boxplot(aes(factor(1),diaBP))
ggplot(completed_data)+geom_boxplot(aes(factor(1),BMI))
ggplot(completed_data)+geom_boxplot(aes(factor(1),heartRate))
ggplot(completed_data)+geom_boxplot(aes(factor(1),glucose))
# 查看cigsPerDay
cigs_sub <- completed_data[which(completed_data$cigsPerDay>=60),]
# 查看totChol,删除异常点
tot_sub <- completed_data[which(completed_data$totChol>=500),]
# 查看sysBP, 删除异常点
sysbp_sub <- completed_data[which(completed_data$sysBP>=250),]
# 查看BMI
bmi_sub <- completed_data[which(completed_data$BMI>=50),]
totChol: 查看网络资料,总胆固醇水平大于240mg/dl已属于非常高,故删去水平值为600mg/dl的记录。 sysBP: 去掉收缩压为295mg/dl的记录
# 删除各变量离群点
completed_data <- completed_data[which(completed_data$totChol!=600),]
completed_data <- completed_data[which(completed_data$sysBP!=295),]
# 分类型变量列联分析
ggplot(completed_data)+geom_boxplot(aes(factor(1),age,fill=completed_data$TenYearCHD))
ggplot(completed_data)+geom_boxplot(aes(factor(1),completed_data$totChol,fill=completed_data$TenYearCHD))
completed_data %>% filter(glucose < 200) %>%
ggplot(aes(x=factor(1),y=glucose,fill=TenYearCHD))+geom_boxplot(aes(x=factor(1),y=glucose,fill=TenYearCHD))
ggplot(completed_data)+geom_boxplot(aes(factor(1),completed_data$heartRate,fill=completed_data$TenYearCHD))
ggplot(completed_data)+geom_boxplot(aes(factor(1),completed_data$BMI,fill=completed_data$TenYearCHD))
ggplot(completed_data)+geom_boxplot(aes(factor(1),completed_data$diaBP,fill=completed_data$TenYearCHD))
ggplot(completed_data)+geom_boxplot(aes(factor(1),completed_data$sysBP,fill=completed_data$TenYearCHD))
由图像知,glucose和hearRate变量有不显著的风险
table1=table(completed_data$male,completed_data$TenYearCHD)
chisq.test(table1)
Pearson's Chi-squared test with Yates' continuity correction
data: table1
X-squared = 33.573, df = 1, p-value = 6.863e-09
table1
0 1
0 1987 271
1 1405 322
table2=table(completed_data$education,completed_data$TenYearCHD)
chisq.test(table2)
Pearson's Chi-squared test
data: table2
X-squared = 31.018, df = 3, p-value = 8.425e-07
table3=table(completed_data$currentSmoker,completed_data$TenYearCHD)
chisq.test(table3)
Pearson's Chi-squared test with Yates' continuity correction
data: table3
X-squared = 2.0634, df = 1, p-value = 0.1509
table4=table(completed_data$BPMeds,completed_data$TenYearCHD)
chisq.test(table4)
Pearson's Chi-squared test with Yates' continuity correction
data: table4
X-squared = 30.92, df = 1, p-value = 2.689e-08
table5=table(completed_data$prevalentStroke,completed_data$TenYearCHD)
chisq.test(table5)
Chi-squared近似算法有可能不准
Pearson's Chi-squared test with Yates' continuity correction
data: table5
X-squared = 6.4451, df = 1, p-value = 0.01113
table6=table(completed_data$prevalentHyp,completed_data$TenYearCHD)
chisq.test(table6)
Pearson's Chi-squared test with Yates' continuity correction
data: table6
X-squared = 120.91, df = 1, p-value < 2.2e-16
table7=table(completed_data$diabetes,completed_data$TenYearCHD)
chisq.test(table7)
Pearson's Chi-squared test with Yates' continuity correction
data: table7
X-squared = 28.074, df = 1, p-value = 1.168e-07
table8=table(completed_data$bplevel,completed_data$TenYearCHD)
chisq.test(table8)
Chi-squared近似算法有可能不准
Pearson's Chi-squared test
data: table8
X-squared = 127.61, df = 3, p-value < 2.2e-16
require(GGally)
载入需要的程辑包:GGally
载入程辑包:‘GGally’
The following object is masked from ‘package:dplyr’:
nasa
full_data <- completed_data
ggpairs(full_data[,c(2,5,10:15)])
diaBP和sysBP有多重共线性的危险。 (解释下它的实际意义)
currentSmoker变量可能不显著,下面进入模型 ### model
# 划分数据集
full_data <- completed_data
set.seed(500)
split = sample.split(full_data$TenYearCHD, SplitRatio = 0.70)
train = subset(full_data, split==TRUE)
test = subset(full_data, split==FALSE)
# Logistic Regression Model - Using all variables
fulldataLog = glm(TenYearCHD ~ ., data = train, family=binomial)
summary(fulldataLog)
Call:
glm(formula = TenYearCHD ~ ., family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.9425 -0.5856 -0.4301 -0.2918 2.8495
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -4.9507254 1.6527924 -2.995 0.00274 **
male1 0.5175404 0.1256533 4.119 3.81e-05 ***
age 0.0566658 0.0078252 7.241 4.44e-13 ***
education2 -0.3009831 0.1439599 -2.091 0.03655 *
education3 -0.0789876 0.1673850 -0.472 0.63700
education4 -0.1223822 0.1886704 -0.649 0.51656
currentSmoker1 0.0737162 0.1795557 0.411 0.68140
cigsPerDay 0.0148933 0.0073097 2.037 0.04160 *
BPMeds1 0.4818730 0.2658025 1.813 0.06985 .
prevalentStroke1 -0.1777638 0.7073075 -0.251 0.80156
prevalentHyp1 0.1362271 0.1869772 0.729 0.46626
diabetes1 -0.0482980 0.3944725 -0.122 0.90255
totChol 0.0020074 0.0013064 1.537 0.12440
sysBP 0.0160675 0.0047292 3.398 0.00068 ***
diaBP -0.0003059 0.0084883 -0.036 0.97125
BMI 0.0087659 0.0150549 0.582 0.56039
heartRate -0.0020778 0.0047899 -0.434 0.66445
glucose 0.0072516 0.0027270 2.659 0.00783 **
bplevel2 -3.4026272 1.4687897 -2.317 0.02052 *
bplevel3 -3.4861383 1.5108769 -2.307 0.02103 *
bplevel4 -3.3888626 1.4863921 -2.280 0.02261 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2346.2 on 2788 degrees of freedom
Residual deviance: 2087.0 on 2768 degrees of freedom
AIC: 2129
Number of Fisher Scoring iterations: 5
fulldataLog = glm(TenYearCHD ~ male + age + cigsPerDay + sysBP + glucose + bplevel, data = train, family=binomial)
summary(fulldataLog)
Call:
glm(formula = TenYearCHD ~ male + age + cigsPerDay + sysBP +
glucose + bplevel, family = binomial, data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.0141 -0.5899 -0.4362 -0.3017 2.7885
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -5.054670 1.477346 -3.421 0.000623 ***
male1 0.512933 0.120903 4.242 2.21e-05 ***
age 0.061775 0.007360 8.394 < 2e-16 ***
cigsPerDay 0.016747 0.004855 3.449 0.000562 ***
sysBP 0.018185 0.003993 4.554 5.27e-06 ***
glucose 0.006832 0.002086 3.275 0.001056 **
bplevel2 -3.333911 1.402487 -2.377 0.017447 *
bplevel3 -3.334127 1.432283 -2.328 0.019921 *
bplevel4 -3.252923 1.417910 -2.294 0.021781 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 2346.2 on 2788 degrees of freedom
Residual deviance: 2098.7 on 2780 degrees of freedom
AIC: 2116.7
Number of Fisher Scoring iterations: 5
predictTest = predict(fulldataLog, type="response", newdata=test)
glm_table <- table(test$TenYearCHD, predictTest > 0.5)
ACCU = sum(diag(glm_table)) / sum(glm_table)
ACCU
[1] 0.8528428
set.seed(22)
rf_model <- randomForest(TenYearCHD ~ ., data = train)
# get importance
importance <- importance(rf_model)
importance
MeanDecreaseGini
male 13.6772726
age 82.5789715
education 29.4778562
currentSmoker 6.7740002
cigsPerDay 35.8671039
BPMeds 5.4417504
prevalentStroke 0.6992751
prevalentHyp 10.1434309
diabetes 3.6040110
totChol 86.0396068
sysBP 94.9900044
diaBP 77.9668212
BMI 90.6283445
heartRate 66.3866265
glucose 82.3949862
bplevel 13.6431956
# 选择重要的因素
set.seed(33)
rf_model <- randomForest(TenYearCHD ~ age + totChol + sysBP + diaBP + BMI + heartRate + glucose, data = train, proximity = TRUE)
# show model error
plot(rf_model)
legend('topright', colnames(rf_model$err.rate), col=1:3, fill=1:3)
# get importance
importance <- importance(rf_model)
varImportance <- data.frame(Variables = row.names(importance),
Importance = round(importance[ ,'MeanDecreaseGini'], 2))
rankImportance <- varImportance %>% mutate(Rank = paste0('#', dense_rank(desc(Importance))))
ggplot(rankImportance, aes(x = reorder(Variables, Importance),
y = importance, fill = Importance)) +
geom_bar(stat = 'identity') +
geom_text(aes(x = Variables, y = 0.5, label = Rank),
hjust=0, vjust=0.55, size=4, colour='red') +
labs(x = 'Variables') +
coord_flip() +
theme_bw()
# 这里有患病风险的error不降反升,需要探究其中原因
# 绘制分类图像
# MDSplot(rf_model, fac, k=2, palette=NULL, pch=20)
pred<-predict(rf_model,newdata=test)
pred_out_1<-predict(object=rf_model,newdata=test,type="prob") #输出概率
table <- table(pred,test$TenYearCHD)
sum(diag(table))/sum(table) #预测准确率
[1] 0.84699
plot(margin(rf_model,test$TenYearCHD))
minimal value for n is 3, returning requested palette with 3 different levels
# 参考资料 http://studio.galaxystatistics.com/report/svm/article3/
# 先进行模型调优
tuned <- tune.svm(TenYearCHD ~ .,data = train,gamma = 10^(-6:-1),cost = 10^(1:2))
summary(tuned)
Parameter tuning of ‘svm’:
- sampling method: 10-fold cross validation
- best parameters:
- best performance: 0.1459503
- Detailed performance results:
# 使用turning函数得到最佳参数设置支持向量机
model.tuned <- svm(TenYearCHD ~ ., data=train, gamma=tuned$best.parameters$gamma, cost=tuned$best.parameters$cost)
summary(model.tuned)
Call:
svm(formula = TenYearCHD ~ ., data = train, gamma = tuned$best.parameters$gamma,
cost = tuned$best.parameters$cost)
Parameters:
SVM-Type: C-classification
SVM-Kernel: radial
cost: 100
Number of Support Vectors: 989
( 578 411 )
Number of Classes: 2
Levels:
0 1
# 调用predict函数基于刚配置好的SVM模型进行类标号的预测:
svm.tuned.pred <- predict(model.tuned, test[,!names(test) %in% c("TenYearCHD")])
svm.tuned.table <- table(svm.tuned.pred, test$TenYearCHD)
svm.tuned.table
svm.tuned.pred 0 1
0 1011 172
1 7 6
accuracy.test.svm <- sum(diag(svm.tuned.table))/sum(svm.tuned.table)
list('测试集预测混淆矩阵'=svm.tuned.table, '测试集预测正确率'=accuracy.test.svm)
$测试集预测混淆矩阵
svm.tuned.pred 0 1
0 1011 172
1 7 6
$测试集预测正确率
[1] 0.8503344
根据上面三个模型的结果,可以看出预测结果的类别数量分布非常不均衡
sum(full_data$TenYearCHD == 1)
[1] 593
sum(full_data$TenYearCHD == 0)
[1] 3392
# 针对这一现象,需要采取措施平衡数据集