Prediction Model

Package cần làm quen/sử dụng: BMA, visreg, GGally, table1, caret, pROC Dữ liệu thực hành: “wdbc.csv” Mục tiêu: tiên lượng ung thư dùng các biến radius, texture, perimeter, area, smooth, compact, concavity, concave.points, symmetry, fractal.

Task 1: Đọc dữ liệu và phân tích mô tả

Đọc dữ liệu “wdbc.csv” vào R và gọi là ca.

ca = read.csv("C:\\Users\\Nguyen\\Desktop\\Workshop NCKH 2019\\Dataset\\Wdbc.csv")
ca$group[ca$diagnosis=="M"] <- 1
ca$group[ca$diagnosis=="B"] <- 0
ca$group = as.factor(ca$group)
names(ca)
##  [1] "id"                   "diagnosis"            "radius"              
##  [4] "texture"              "perimeter"            "area"                
##  [7] "smooth"               "compact"              "concavity"           
## [10] "concave.points"       "symmetry"             "fractal"             
## [13] "radius_se"            "texture_se"           "perimeter_se"        
## [16] "area_se"              "smooth_se"            "compact_se"          
## [19] "concavity_se"         "concave.points_se"    "symmetry_se"         
## [22] "fractal_se"           "radius_worst"         "texture_worst"       
## [25] "perimeter_worst"      "area_worst"           "smooth_worst"        
## [28] "compact_worst"        "concavity_worst"      "concave.points_worst"
## [31] "symmetry_worst"       "fractal_worst"        "group"

Lọc ra các biến số cần để phân tích, đặt tên cho bộ dữ liệu mới tên ca.

ca = ca[, c("group", "radius", "texture" , "perimeter", "area" , "smooth" , "compact" , "concavity" , "concave.points" , "symmetry" , "fractal")]
names(ca)
##  [1] "group"          "radius"         "texture"        "perimeter"     
##  [5] "area"           "smooth"         "compact"        "concavity"     
##  [9] "concave.points" "symmetry"       "fractal"

Mô tả dữ liệu phân nhóm theo group

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~radius + texture + perimeter + area + smooth + compact + concavity + concave.points + symmetry + fractal | group, data=ca)
0
(n=357)
1
(n=212)
Overall
(n=569)
radius
Mean (SD) 12.1 (1.78) 17.5 (3.20) 14.1 (3.52)
Median [Min, Max] 12.2 [6.98, 17.9] 17.3 [11.0, 28.1] 13.4 [6.98, 28.1]
texture
Mean (SD) 17.9 (4.00) 21.6 (3.78) 19.3 (4.30)
Median [Min, Max] 17.4 [9.71, 33.8] 21.5 [10.4, 39.3] 18.8 [9.71, 39.3]
perimeter
Mean (SD) 78.1 (11.8) 115 (21.9) 92.0 (24.3)
Median [Min, Max] 78.2 [43.8, 115] 114 [71.9, 189] 86.2 [43.8, 189]
area
Mean (SD) 463 (134) 978 (368) 655 (352)
Median [Min, Max] 458 [144, 992] 932 [362, 2500] 551 [144, 2500]
smooth
Mean (SD) 0.0925 (0.0134) 0.103 (0.0126) 0.0964 (0.0141)
Median [Min, Max] 0.0908 [0.0526, 0.163] 0.102 [0.0737, 0.145] 0.0959 [0.0526, 0.163]
compact
Mean (SD) 0.0801 (0.0337) 0.145 (0.0540) 0.104 (0.0528)
Median [Min, Max] 0.0753 [0.0194, 0.224] 0.132 [0.0461, 0.345] 0.0926 [0.0194, 0.345]
concavity
Mean (SD) 0.0461 (0.0434) 0.161 (0.0750) 0.0888 (0.0797)
Median [Min, Max] 0.0371 [0.00, 0.411] 0.151 [0.0240, 0.427] 0.0615 [0.00, 0.427]
concave.points
Mean (SD) 0.0257 (0.0159) 0.0880 (0.0344) 0.0489 (0.0388)
Median [Min, Max] 0.0234 [0.00, 0.0853] 0.0863 [0.0203, 0.201] 0.0335 [0.00, 0.201]
symmetry
Mean (SD) 0.174 (0.0248) 0.193 (0.0276) 0.181 (0.0274)
Median [Min, Max] 0.171 [0.106, 0.274] 0.190 [0.131, 0.304] 0.179 [0.106, 0.304]
fractal
Mean (SD) 0.0629 (0.00675) 0.0627 (0.00757) 0.0628 (0.00706)
Median [Min, Max] 0.0615 [0.0519, 0.0958] 0.0616 [0.0500, 0.0974] 0.0615 [0.0500, 0.0974]
# Mô tả bằng biểu đồ
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(ca, mapping = aes(color = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bạn có nhận xét gì khi xem qua biểu đồ này?

Task 2: Tìm biến liên quan

Trong task này, học viên sẽ dùng package BMA để tìm các biến số liên quan đến ung thư

library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = ca[, -1]
yvar = ca[, 1]
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   19  models were selected
##  Best  5  models (cumulative posterior probability =  0.5967 ): 
## 
##                 p!=0    EV        SD        model 1     model 2   
## Intercept       100    -14.46144  10.48166  -1.675e+01  -3.153e+00
## radius           26.5   -0.41976   1.17434       .           .    
## texture         100.0    0.34766   0.06231   3.255e-01   3.380e-01
## perimeter        26.7   -0.07614   0.15863       .      -3.252e-01
## area             89.1    0.01974   0.01669   7.776e-03   3.194e-02
## smooth           39.5   26.35500  38.40039       .           .    
## compact           1.2   -0.08537   1.18771       .           .    
## concavity         6.3    1.09586   4.48508       .           .    
## concave.points   95.0   91.60054  30.01639   1.016e+02   1.216e+02
## symmetry         16.5    3.11164   8.13363       .           .    
## fractal           0.0    0.00000   0.00000       .           .    
##                                                                   
## nVar                                           3           4      
## BIC                                         -3.423e+03  -3.422e+03
## post prob                                    0.198       0.126    
##                 model 3     model 4     model 5   
## Intercept       -2.368e+01  -4.643e+00  -5.328e-01
## radius               .      -2.888e+00  -2.382e+00
## texture          3.627e-01   3.732e-01   3.278e-01
## perimeter            .           .           .    
## area             1.034e-02   4.400e-02   3.537e-02
## smooth           5.947e+01   6.318e+01       .    
## compact              .           .           .    
## concavity            .           .           .    
## concave.points   7.657e+01   8.111e+01   1.054e+02
## symmetry             .           .           .    
## fractal              .           .           .    
##                                                   
## nVar               4           5           4      
## BIC             -3.422e+03  -3.421e+03  -3.421e+03
## post prob        0.115       0.086       0.072
# Biểu đồ cho các mô hình khả dĩ
imageplot.bma(m)

Task 3: Đánh giá tầm quan trọng

Đánh giá tầm quan trọng của các biến số dùng caret

m = glm(group ~ texture + area + concave.points, family=binomial, data=ca)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
varImp(m, scale=F)
##                 Overall
## texture        5.847354
## area           5.359023
## concave.points 7.740414

Task 4: Training mô hình

library(caret)
sample = createDataPartition(ca$group, p=0.6, list=FALSE)
training = ca[ sample, ]
testing = ca[ -sample, ]
mod.fit = train(group ~ texture + area + concave.points, data=training, method="glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod.fit)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.97132  -0.16906  -0.04325   0.01287   2.81994  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -18.103130   2.667870  -6.786 1.16e-11 ***
## texture          0.364108   0.072945   4.992 5.99e-07 ***
## area             0.009176   0.002030   4.519 6.20e-06 ***
## concave.points 102.002817  17.023641   5.992 2.07e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 453.189  on 342  degrees of freedom
## Residual deviance:  98.916  on 339  degrees of freedom
## AIC: 106.92
## 
## Number of Fisher Scoring iterations: 8

Task 5: Kiểm định (testing) mô hình

Tính giá trị tiên lượng

pred = predict(mod.fit, newdata=testing, type="raw")
pred
##   [1] 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 0
##  [36] 1 1 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0
##  [71] 1 1 0 1 0 0 1 1 1 0 1 0 0 0 1 0 1 0 0 1 1 0 1 0 1 0 0 1 0 0 0 1 0 1 1
## [106] 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1
## [141] 0 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0
## [176] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 0 0 1 0
## [211] 0 0 1 1 0 0 0 0 0 0 0 1 0 1 1 1
## Levels: 0 1

Tính giá độ nhạy, đặc hiệu, v.v.

confusionMatrix(testing$group, pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 132  10
##          1   5  79
##                                           
##                Accuracy : 0.9336          
##                  95% CI : (0.8929, 0.9624)
##     No Information Rate : 0.6062          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8596          
##                                           
##  Mcnemar's Test P-Value : 0.3017          
##                                           
##             Sensitivity : 0.9635          
##             Specificity : 0.8876          
##          Pos Pred Value : 0.9296          
##          Neg Pred Value : 0.9405          
##              Prevalence : 0.6062          
##          Detection Rate : 0.5841          
##    Detection Prevalence : 0.6283          
##       Balanced Accuracy : 0.9256          
##                                           
##        'Positive' Class : 0               
## 

Task 6: Vẽ biểu đồ ROC dùng package pROC

Cách tính AUC và vẽ biểu đồ ROC

# Tính xác suất xảy ra các biến cố ở bộ dữ liệu testing từ phương trình hồi quy
prob = predict(mod.fit, newdata=testing, type="prob")
# Chuyển cách trình bày các cố hạng như bình thường
options(scipen = 999)
head(prob)
##                 0         1
## 2  0.000456352077 0.9995436
## 5  0.000063898211 0.9999361
## 10 0.023409693707 0.9765903
## 12 0.089854660053 0.9101453
## 13 0.000003255543 0.9999967
## 14 0.036592628202 0.9634074
# Gán xác suất xảy ra biến cố theo phương trình tiên lượng từ bộ dữ liệu testing và giá trị testing.group (biến cố xảy ra trên thực tế)
pred = data.frame(prob, testing$group)
head(pred)
##                X0        X1 testing.group
## 2  0.000456352077 0.9995436             1
## 5  0.000063898211 0.9999361             1
## 10 0.023409693707 0.9765903             1
## 12 0.089854660053 0.9101453             1
## 13 0.000003255543 0.9999967             1
## 14 0.036592628202 0.9634074             1

Tính ROC

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(pred$testing.group, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = pred$testing.group, predictor = pred$X1)
## 
## Data: pred$X1 in 142 controls (pred$testing.group 0) < 84 cases (pred$testing.group 1).
## Area under the curve: 0.9884
auc = smooth(roc(pred$testing.group, pred$X1))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(auc)

Calibration mô hình ở bộ dữ liệu testing

# Fit mô hình hồi quy logistic
library(rms)
## Loading required package: Hmisc
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
## 
##     label, label<-, units
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
f = lrm(group ~ texture + area + concave.points, data= training)

# Dùng mô hình đã training để testing trên bộ dữ liệu mới
pred.logit = predict(f, newdata= testing)
reg= lrm (testing$group ~ pred.logit, x=T, y=T)
cal= calibrate (reg, B=1000, m= 500, method= 'boot')

# Vẽ biểu đồ calibration
plot(cal)

## 
## n=226   Mean absolute error=0.007   Mean squared error=0.00005
## 0.9 Quantile of absolute error=0.009

Task 7: Vẽ nomogram

# Dùng package rms
library(rms)
ddist = datadist(ca)
options(datadist="ddist")
m = lrm( group ~ texture + area + concave.points, data=ca)
p = nomogram(m, fun=function(x)1/(1+exp( -x)), fun.at=c(0.001, 0.01, 0.05, seq(0.1, 1, by=0.1)), funlabel="Risk of cancer", lp=F)
plot(p, cex.axis=0.6, lmgp=0.2)

# Dùng package DynNom (không tải lên rpub được)
# m = glm( group ~ texture + area + concave.points, family=binomial, data=training)
# library(DynNom); library(shiny)
# DynNom(m, training)