Package cần làm quen/sử dụng: BMA
, visreg
, GGally
, table1
, caret
, pROC
Dữ liệu thực hành: “wdbc.csv” Mục tiêu: tiên lượng ung thư dùng các biến radius, texture, perimeter, area, smooth, compact, concavity, concave.points, symmetry, fractal.
Đọc dữ liệu “wdbc.csv” vào R và gọi là ca.
ca = read.csv("C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Wdbc.csv")
ca$group[ca$diagnosis=="M"] <- 1
ca$group[ca$diagnosis=="B"] <- 0
ca$group = as.factor(ca$group)
names(ca)
## [1] "id" "diagnosis" "radius"
## [4] "texture" "perimeter" "area"
## [7] "smooth" "compact" "concavity"
## [10] "concave.points" "symmetry" "fractal"
## [13] "radius_se" "texture_se" "perimeter_se"
## [16] "area_se" "smooth_se" "compact_se"
## [19] "concavity_se" "concave.points_se" "symmetry_se"
## [22] "fractal_se" "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst" "smooth_worst"
## [28] "compact_worst" "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_worst" "group"
Lọc ra các biến số cần để phân tích, đặt tên cho bộ dữ liệu mới tên ca.
ca = ca[, c("group", "radius", "texture" , "perimeter", "area" , "smooth" , "compact" , "concavity" , "concave.points" , "symmetry" , "fractal")]
names(ca)
## [1] "group" "radius" "texture" "perimeter"
## [5] "area" "smooth" "compact" "concavity"
## [9] "concave.points" "symmetry" "fractal"
Mô tả dữ liệu phân nhóm theo group
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~radius + texture + perimeter + area + smooth + compact + concavity + concave.points + symmetry + fractal | group, data=ca)
0 (n=357) |
1 (n=212) |
Overall (n=569) |
|
---|---|---|---|
radius | |||
Mean (SD) | 12.1 (1.78) | 17.5 (3.20) | 14.1 (3.52) |
Median [Min, Max] | 12.2 [6.98, 17.9] | 17.3 [11.0, 28.1] | 13.4 [6.98, 28.1] |
texture | |||
Mean (SD) | 17.9 (4.00) | 21.6 (3.78) | 19.3 (4.30) |
Median [Min, Max] | 17.4 [9.71, 33.8] | 21.5 [10.4, 39.3] | 18.8 [9.71, 39.3] |
perimeter | |||
Mean (SD) | 78.1 (11.8) | 115 (21.9) | 92.0 (24.3) |
Median [Min, Max] | 78.2 [43.8, 115] | 114 [71.9, 189] | 86.2 [43.8, 189] |
area | |||
Mean (SD) | 463 (134) | 978 (368) | 655 (352) |
Median [Min, Max] | 458 [144, 992] | 932 [362, 2500] | 551 [144, 2500] |
smooth | |||
Mean (SD) | 0.0925 (0.0134) | 0.103 (0.0126) | 0.0964 (0.0141) |
Median [Min, Max] | 0.0908 [0.0526, 0.163] | 0.102 [0.0737, 0.145] | 0.0959 [0.0526, 0.163] |
compact | |||
Mean (SD) | 0.0801 (0.0337) | 0.145 (0.0540) | 0.104 (0.0528) |
Median [Min, Max] | 0.0753 [0.0194, 0.224] | 0.132 [0.0461, 0.345] | 0.0926 [0.0194, 0.345] |
concavity | |||
Mean (SD) | 0.0461 (0.0434) | 0.161 (0.0750) | 0.0888 (0.0797) |
Median [Min, Max] | 0.0371 [0.00, 0.411] | 0.151 [0.0240, 0.427] | 0.0615 [0.00, 0.427] |
concave.points | |||
Mean (SD) | 0.0257 (0.0159) | 0.0880 (0.0344) | 0.0489 (0.0388) |
Median [Min, Max] | 0.0234 [0.00, 0.0853] | 0.0863 [0.0203, 0.201] | 0.0335 [0.00, 0.201] |
symmetry | |||
Mean (SD) | 0.174 (0.0248) | 0.193 (0.0276) | 0.181 (0.0274) |
Median [Min, Max] | 0.171 [0.106, 0.274] | 0.190 [0.131, 0.304] | 0.179 [0.106, 0.304] |
fractal | |||
Mean (SD) | 0.0629 (0.00675) | 0.0627 (0.00757) | 0.0628 (0.00706) |
Median [Min, Max] | 0.0615 [0.0519, 0.0958] | 0.0616 [0.0500, 0.0974] | 0.0615 [0.0500, 0.0974] |
# Mô tả bằng biểu đồ
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(ca, mapping = aes(color = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Bạn có nhận xét gì khi xem qua biểu đồ này?
Trong task này, học viên sẽ dùng package BMA
để tìm các biến số liên quan đến ung thư
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = ca[, -1]
yvar = ca[, 1]
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = F, OR = 20)
##
##
## 19 models were selected
## Best 5 models (cumulative posterior probability = 0.5967 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -14.46144 10.48166 -1.675e+01 -3.153e+00
## radius 26.5 -0.41976 1.17434 . .
## texture 100.0 0.34766 0.06231 3.255e-01 3.380e-01
## perimeter 26.7 -0.07614 0.15863 . -3.252e-01
## area 89.1 0.01974 0.01669 7.776e-03 3.194e-02
## smooth 39.5 26.35500 38.40039 . .
## compact 1.2 -0.08537 1.18771 . .
## concavity 6.3 1.09586 4.48508 . .
## concave.points 95.0 91.60054 30.01639 1.016e+02 1.216e+02
## symmetry 16.5 3.11164 8.13363 . .
## fractal 0.0 0.00000 0.00000 . .
##
## nVar 3 4
## BIC -3.423e+03 -3.422e+03
## post prob 0.198 0.126
## model 3 model 4 model 5
## Intercept -2.368e+01 -4.643e+00 -5.328e-01
## radius . -2.888e+00 -2.382e+00
## texture 3.627e-01 3.732e-01 3.278e-01
## perimeter . . .
## area 1.034e-02 4.400e-02 3.537e-02
## smooth 5.947e+01 6.318e+01 .
## compact . . .
## concavity . . .
## concave.points 7.657e+01 8.111e+01 1.054e+02
## symmetry . . .
## fractal . . .
##
## nVar 4 5 4
## BIC -3.422e+03 -3.421e+03 -3.421e+03
## post prob 0.115 0.086 0.072
# Biểu đồ cho các mô hình khả dĩ
imageplot.bma(m)
Đánh giá tầm quan trọng của các biến số dùng caret
m = glm(group ~ texture + area + concave.points, family=binomial, data=ca)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
varImp(m, scale=F)
## Overall
## texture 5.847354
## area 5.359023
## concave.points 7.740414
library(caret)
sample = createDataPartition(ca$group, p=0.6, list=FALSE)
training = ca[ sample, ]
testing = ca[ -sample, ]
mod.fit = train(group ~ texture + area + concave.points, data=training, method="glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod.fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.97132 -0.16906 -0.04325 0.01287 2.81994
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -18.103130 2.667870 -6.786 1.16e-11 ***
## texture 0.364108 0.072945 4.992 5.99e-07 ***
## area 0.009176 0.002030 4.519 6.20e-06 ***
## concave.points 102.002817 17.023641 5.992 2.07e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 453.189 on 342 degrees of freedom
## Residual deviance: 98.916 on 339 degrees of freedom
## AIC: 106.92
##
## Number of Fisher Scoring iterations: 8
Tính giá trị tiên lượng
pred = predict(mod.fit, newdata=testing, type="raw")
pred
## [1] 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 0
## [36] 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
## [71] 1 1 0 1 0 0 1 1 1 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1
## [106] 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1
## [141] 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
## [176] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0
## [211] 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1
## Levels: 0 1
Tính giá độ nhạy, đặc hiệu, v.v.
confusionMatrix(testing$group, pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 132 10
## 1 5 79
##
## Accuracy : 0.9336
## 95% CI : (0.8929, 0.9624)
## No Information Rate : 0.6062
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8596
##
## Mcnemar's Test P-Value : 0.3017
##
## Sensitivity : 0.9635
## Specificity : 0.8876
## Pos Pred Value : 0.9296
## Neg Pred Value : 0.9405
## Prevalence : 0.6062
## Detection Rate : 0.5841
## Detection Prevalence : 0.6283
## Balanced Accuracy : 0.9256
##
## 'Positive' Class : 0
##
pROC
Cách tính AUC và vẽ biểu đồ ROC
# Tính xác suất xảy ra các biến cố ở bộ dữ liệu testing từ phương trình hồi quy
prob = predict(mod.fit, newdata=testing, type="prob")
# Chuyển cách trình bày các cố hạng như bình thường
options(scipen = 999)
head(prob)
## 0 1
## 2 0.000456352077 0.9995436
## 5 0.000063898211 0.9999361
## 10 0.023409693707 0.9765903
## 12 0.089854660053 0.9101453
## 13 0.000003255543 0.9999967
## 14 0.036592628202 0.9634074
# Gán xác suất xảy ra biến cố theo phương trình tiên lượng từ bộ dữ liệu testing và giá trị testing.group (biến cố xảy ra trên thực tế)
pred = data.frame(prob, testing$group)
head(pred)
## X0 X1 testing.group
## 2 0.000456352077 0.9995436 1
## 5 0.000063898211 0.9999361 1
## 10 0.023409693707 0.9765903 1
## 12 0.089854660053 0.9101453 1
## 13 0.000003255543 0.9999967 1
## 14 0.036592628202 0.9634074 1
Tính ROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(pred$testing.group, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$testing.group, predictor = pred$X1)
##
## Data: pred$X1 in 142 controls (pred$testing.group 0) < 84 cases (pred$testing.group 1).
## Area under the curve: 0.9884
auc = smooth(roc(pred$testing.group, pred$X1))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(auc)
Calibration mô hình ở bộ dữ liệu testing
# Fit mô hình hồi quy logistic
library(rms)
## Loading required package: Hmisc
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
f = lrm(group ~ texture + area + concave.points, data= training)
# Dùng mô hình đã training để testing trên bộ dữ liệu mới
pred.logit = predict(f, newdata= testing)
reg= lrm (testing$group ~ pred.logit, x=T, y=T)
cal= calibrate (reg, B=1000, m= 500, method= 'boot')
# Vẽ biểu đồ calibration
plot(cal)
##
## n=226 Mean absolute error=0.007 Mean squared error=0.00005
## 0.9 Quantile of absolute error=0.009
# Dùng package rms
library(rms)
ddist = datadist(ca)
options(datadist="ddist")
m = lrm( group ~ texture + area + concave.points, data=ca)
p = nomogram(m, fun=function(x)1/(1+exp( -x)), fun.at=c(0.001, 0.01, 0.05, seq(0.1, 1, by=0.1)), funlabel="Risk of cancer", lp=F)
plot(p, cex.axis=0.6, lmgp=0.2)
# Dùng package DynNom (không tải lên rpub được)
# m = glm( group ~ texture + area + concave.points, family=binomial, data=training)
# library(DynNom); library(shiny)
# DynNom(m, training)