Packages cần sử dụng BMA, visreg, GGally, table1, caret, pROC

Dữ liệu thực hành wdbc.csv

Mục tiêu: tiên lượng ung thư dùng các biến radius, texture, perimeter, area, smooth, compact, concavity, concave.points, symmetry, fractal.

Task 1: Đọc dữ liệu và phân tích mô tả

Đọc dữ liệu wdbc.csv vào R và gọi là ca.

ca = read.csv("C:/Users/ADMIN/OneDrive/Statistical courses/Benh vien Cho Ray-Aug2019/Datasets/Wdbc.csv")
ca$group [ca$diagnosis == "M"] = 1
ca$group [ca$diagnosis == "B"] = 0
ca$group = as.factor(ca$group)
names (ca)
##  [1] "id"                   "diagnosis"            "radius"              
##  [4] "texture"              "perimeter"            "area"                
##  [7] "smooth"               "compact"              "concavity"           
## [10] "concave.points"       "symmetry"             "fractal"             
## [13] "radius_se"            "texture_se"           "perimeter_se"        
## [16] "area_se"              "smooth_se"            "compact_se"          
## [19] "concavity_se"         "concave.points_se"    "symmetry_se"         
## [22] "fractal_se"           "radius_worst"         "texture_worst"       
## [25] "perimeter_worst"      "area_worst"           "smooth_worst"        
## [28] "compact_worst"        "concavity_worst"      "concave.points_worst"
## [31] "symmetry_worst"       "fractal_worst"        "group"

Lọc ra các biến số cần để phân tích, đặt tên cho bộ dữ liệu mới tên ca

ca = ca[, c("group", "radius", "texture", "perimeter", "area", "smooth", "compact", "concavity", "concave.points", "symmetry", "fractal")]
names (ca)
##  [1] "group"          "radius"         "texture"        "perimeter"     
##  [5] "area"           "smooth"         "compact"        "concavity"     
##  [9] "concave.points" "symmetry"       "fractal"

Mô tả dữ liệu phân nhóm theo group

library(table1)
## Warning: package 'table1' was built under R version 3.6.1
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~radius + texture + perimeter + area + smooth + compact + concavity + concave.points + symmetry + fractal | group, data=ca)
0
(n=357)
1
(n=212)
Overall
(n=569)
radius
Mean (SD) 12.1 (1.78) 17.5 (3.20) 14.1 (3.52)
Median [Min, Max] 12.2 [6.98, 17.9] 17.3 [11.0, 28.1] 13.4 [6.98, 28.1]
texture
Mean (SD) 17.9 (4.00) 21.6 (3.78) 19.3 (4.30)
Median [Min, Max] 17.4 [9.71, 33.8] 21.5 [10.4, 39.3] 18.8 [9.71, 39.3]
perimeter
Mean (SD) 78.1 (11.8) 115 (21.9) 92.0 (24.3)
Median [Min, Max] 78.2 [43.8, 115] 114 [71.9, 189] 86.2 [43.8, 189]
area
Mean (SD) 463 (134) 978 (368) 655 (352)
Median [Min, Max] 458 [144, 992] 932 [362, 2500] 551 [144, 2500]
smooth
Mean (SD) 0.0925 (0.0134) 0.103 (0.0126) 0.0964 (0.0141)
Median [Min, Max] 0.0908 [0.0526, 0.163] 0.102 [0.0737, 0.145] 0.0959 [0.0526, 0.163]
compact
Mean (SD) 0.0801 (0.0337) 0.145 (0.0540) 0.104 (0.0528)
Median [Min, Max] 0.0753 [0.0194, 0.224] 0.132 [0.0461, 0.345] 0.0926 [0.0194, 0.345]
concavity
Mean (SD) 0.0461 (0.0434) 0.161 (0.0750) 0.0888 (0.0797)
Median [Min, Max] 0.0371 [0.00, 0.411] 0.151 [0.0240, 0.427] 0.0615 [0.00, 0.427]
concave.points
Mean (SD) 0.0257 (0.0159) 0.0880 (0.0344) 0.0489 (0.0388)
Median [Min, Max] 0.0234 [0.00, 0.0853] 0.0863 [0.0203, 0.201] 0.0335 [0.00, 0.201]
symmetry
Mean (SD) 0.174 (0.0248) 0.193 (0.0276) 0.181 (0.0274)
Median [Min, Max] 0.171 [0.106, 0.274] 0.190 [0.131, 0.304] 0.179 [0.106, 0.304]
fractal
Mean (SD) 0.0629 (0.00675) 0.0627 (0.00757) 0.0628 (0.00706)
Median [Min, Max] 0.0615 [0.0519, 0.0958] 0.0616 [0.0500, 0.0974] 0.0615 [0.0500, 0.0974]

Mô tả bằng biểu đồ

library(GGally)
## Warning: package 'GGally' was built under R version 3.6.1
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggpairs(ca, mapping = aes(color = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Task 2: Tìm biến liên quan

Trong task này, học viên sẽ dùng package BMA để tìm các biến số liên quan đến ung thư

library(BMA)
## Warning: package 'BMA' was built under R version 3.6.1
## Loading required package: survival
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.6.1
## Loading required package: robustbase
## Warning: package 'robustbase' was built under R version 3.6.1
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Warning: package 'inline' was built under R version 3.6.1
## Loading required package: rrcov
## Warning: package 'rrcov' was built under R version 3.6.1
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = ca[, -1] 
yvar = ca[, 1] 
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   19  models were selected
##  Best  5  models (cumulative posterior probability =  0.5967 ): 
## 
##                 p!=0    EV        SD        model 1     model 2   
## Intercept       100    -14.46144  10.48166  -1.675e+01  -3.153e+00
## radius           26.5   -0.41976   1.17434       .           .    
## texture         100.0    0.34766   0.06231   3.255e-01   3.380e-01
## perimeter        26.7   -0.07614   0.15863       .      -3.252e-01
## area             89.1    0.01974   0.01669   7.776e-03   3.194e-02
## smooth           39.5   26.35500  38.40039       .           .    
## compact           1.2   -0.08537   1.18771       .           .    
## concavity         6.3    1.09586   4.48508       .           .    
## concave.points   95.0   91.60054  30.01639   1.016e+02   1.216e+02
## symmetry         16.5    3.11164   8.13363       .           .    
## fractal           0.0    0.00000   0.00000       .           .    
##                                                                   
## nVar                                           3           4      
## BIC                                         -3.423e+03  -3.422e+03
## post prob                                    0.198       0.126    
##                 model 3     model 4     model 5   
## Intercept       -2.368e+01  -4.643e+00  -5.328e-01
## radius               .      -2.888e+00  -2.382e+00
## texture          3.627e-01   3.732e-01   3.278e-01
## perimeter            .           .           .    
## area             1.034e-02   4.400e-02   3.537e-02
## smooth           5.947e+01   6.318e+01       .    
## compact              .           .           .    
## concavity            .           .           .    
## concave.points   7.657e+01   8.111e+01   1.054e+02
## symmetry             .           .           .    
## fractal              .           .           .    
##                                                   
## nVar               4           5           4      
## BIC             -3.422e+03  -3.421e+03  -3.421e+03
## post prob        0.115       0.086       0.072

Biểu đồ cho các mô hình khả dĩ

imageplot.bma(m) 

Task 3: Đánh giá tầm quan trọng

Đánh giá tầm quan trọng của các biến số dùng caret

m = glm(group ~ texture + area + concave.points, family=binomial, data=ca)
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
varImp(m, scale=F)
##                 Overall
## texture        5.847354
## area           5.359023
## concave.points 7.740414

Task 4: Training mô hình

library(caret) 
sample = createDataPartition(ca$group, p=0.6, list=FALSE)
training = ca [sample, ] 
testing = ca [-sample, ]
mod.fit = train(group ~ texture + area + concave.points,  data=training, method="glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod.fit)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.32631  -0.12665  -0.03701   0.01579   2.88040  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -17.907843   2.858944  -6.264 3.76e-10 ***
## texture          0.369296   0.086122   4.288 1.80e-05 ***
## area             0.007225   0.001896   3.810 0.000139 ***
## concave.points 112.708334  19.523987   5.773 7.80e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 453.19  on 342  degrees of freedom
## Residual deviance:  81.62  on 339  degrees of freedom
## AIC: 89.62
## 
## Number of Fisher Scoring iterations: 8

Task 5: Kiểm định (testing) mô hình

Tính giá trị tiên lượng

pred = predict(mod.fit, newdata=testing, type="raw")
pred
##   [1] 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0
##  [36] 0 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1
##  [71] 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0
## [106] 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0
## [141] 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0
## [176] 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [211] 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0
## Levels: 0 1

Tính độ nhạy, đặc hiệu, v.v.

confusionMatrix (testing$group, pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 135   7
##          1  10  74
##                                           
##                Accuracy : 0.9248          
##                  95% CI : (0.8823, 0.9556)
##     No Information Rate : 0.6416          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8378          
##                                           
##  Mcnemar's Test P-Value : 0.6276          
##                                           
##             Sensitivity : 0.9310          
##             Specificity : 0.9136          
##          Pos Pred Value : 0.9507          
##          Neg Pred Value : 0.8810          
##              Prevalence : 0.6416          
##          Detection Rate : 0.5973          
##    Detection Prevalence : 0.6283          
##       Balanced Accuracy : 0.9223          
##                                           
##        'Positive' Class : 0               
## 

Task 5: Vẽ biểu đồ ROC dùng package pROC

Cách tính AUC và vẽ biểu đồ ROC

#Tính xác suất xảy ra các biến cố ở bộ dữ liệu testing từ phương trình hồi quy
prob = predict(mod.fit, newdata= testing, type="prob")
#chuyển cách trình bày các số hạng như bình thường
options (scipen = 999)
head (prob)
##                 0         1
## 7  0.004844253131 0.9951557
## 8  0.330591556394 0.6694084
## 12 0.143428872260 0.8565711
## 13 0.000006361996 0.9999936
## 19 0.000041882548 0.9999581
## 20 0.957898204338 0.0421018

Gán xác suất xảy ra biến cố theo phương trình tiên lượng từ bộ dữ liệu testing và giá trị testing.group (biến cố xảy ra trên thực tế).

pred = data.frame(prob, testing$group)
head(pred)
##                X0        X1 testing.group
## 7  0.004844253131 0.9951557             1
## 8  0.330591556394 0.6694084             1
## 12 0.143428872260 0.8565711             1
## 13 0.000006361996 0.9999936             1
## 19 0.000041882548 0.9999581             1
## 20 0.957898204338 0.0421018             0

Tính ROC

library(pROC)
## Warning: package 'pROC' was built under R version 3.6.1
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(pred$testing.group, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = pred$testing.group, predictor = pred$X1)
## 
## Data: pred$X1 in 142 controls (pred$testing.group 0) < 84 cases (pred$testing.group 1).
## Area under the curve: 0.9786
auc = smooth(roc(pred$testing.group, pred$X1))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(auc)

calibration mô hình ở bộ dữ liệu testing

# Fit mô hình hồi qui logistic
library (rms)
## Warning: package 'rms' was built under R version 3.6.1
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.1
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
## 
##     label, label<-, units
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
f = lrm(group ~ texture + area + concave.points, data= training)
# Dùng mô hình đã training để testing trên bộ dữ liệu mới
pred.logit = predict(f, newdata= testing)
reg= lrm (testing$group ~ pred.logit, x=T, y=T)
cal= calibrate (reg, B=1000, m= 500, method= 'boot')
# Vẽ biểu đồ calibration
plot (cal)

## 
## n=226   Mean absolute error=0.01   Mean squared error=0.00024
## 0.9 Quantile of absolute error=0.023

Task 6: Vẽ nomogram

library(rms)
ddist = datadist(ca)
options(datadist="ddist")
m = lrm(group ~ texture + area + concave.points, data=ca)
p = nomogram(m, fun=function(x)1/(1+exp(-x)), fun.at=c(0.001, 0.01, 0.05, seq(0.1, 1, by=0.1)), funlabel="Risk of cancer", lp=F)
plot(p, cex.axis=0.6, lmgp=0.2)

Gói DynNom không tải lên Rpub được

#m = glm(group ~ texture + area + concave.points, family=binomial, data=training)
#library(DynNom); library(shiny)
#DynNom(m, training)