Packages cần sử dụng BMA, visreg, GGally, table1, caret, pROC
Dữ liệu thực hành wdbc.csv
Mục tiêu: tiên lượng ung thư dùng các biến radius, texture, perimeter, area, smooth, compact, concavity, concave.points, symmetry, fractal.
Đọc dữ liệu wdbc.csv vào R và gọi là ca.
ca = read.csv("C:/Users/ADMIN/OneDrive/Statistical courses/Benh vien Cho Ray-Aug2019/Datasets/Wdbc.csv")
ca$group [ca$diagnosis == "M"] = 1
ca$group [ca$diagnosis == "B"] = 0
ca$group = as.factor(ca$group)
names (ca)
## [1] "id" "diagnosis" "radius"
## [4] "texture" "perimeter" "area"
## [7] "smooth" "compact" "concavity"
## [10] "concave.points" "symmetry" "fractal"
## [13] "radius_se" "texture_se" "perimeter_se"
## [16] "area_se" "smooth_se" "compact_se"
## [19] "concavity_se" "concave.points_se" "symmetry_se"
## [22] "fractal_se" "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst" "smooth_worst"
## [28] "compact_worst" "concavity_worst" "concave.points_worst"
## [31] "symmetry_worst" "fractal_worst" "group"
Lọc ra các biến số cần để phân tích, đặt tên cho bộ dữ liệu mới tên ca
ca = ca[, c("group", "radius", "texture", "perimeter", "area", "smooth", "compact", "concavity", "concave.points", "symmetry", "fractal")]
names (ca)
## [1] "group" "radius" "texture" "perimeter"
## [5] "area" "smooth" "compact" "concavity"
## [9] "concave.points" "symmetry" "fractal"
Mô tả dữ liệu phân nhóm theo group
library(table1)
## Warning: package 'table1' was built under R version 3.6.1
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~radius + texture + perimeter + area + smooth + compact + concavity + concave.points + symmetry + fractal | group, data=ca)
| 0 (n=357) |
1 (n=212) |
Overall (n=569) |
|
|---|---|---|---|
| radius | |||
| Mean (SD) | 12.1 (1.78) | 17.5 (3.20) | 14.1 (3.52) |
| Median [Min, Max] | 12.2 [6.98, 17.9] | 17.3 [11.0, 28.1] | 13.4 [6.98, 28.1] |
| texture | |||
| Mean (SD) | 17.9 (4.00) | 21.6 (3.78) | 19.3 (4.30) |
| Median [Min, Max] | 17.4 [9.71, 33.8] | 21.5 [10.4, 39.3] | 18.8 [9.71, 39.3] |
| perimeter | |||
| Mean (SD) | 78.1 (11.8) | 115 (21.9) | 92.0 (24.3) |
| Median [Min, Max] | 78.2 [43.8, 115] | 114 [71.9, 189] | 86.2 [43.8, 189] |
| area | |||
| Mean (SD) | 463 (134) | 978 (368) | 655 (352) |
| Median [Min, Max] | 458 [144, 992] | 932 [362, 2500] | 551 [144, 2500] |
| smooth | |||
| Mean (SD) | 0.0925 (0.0134) | 0.103 (0.0126) | 0.0964 (0.0141) |
| Median [Min, Max] | 0.0908 [0.0526, 0.163] | 0.102 [0.0737, 0.145] | 0.0959 [0.0526, 0.163] |
| compact | |||
| Mean (SD) | 0.0801 (0.0337) | 0.145 (0.0540) | 0.104 (0.0528) |
| Median [Min, Max] | 0.0753 [0.0194, 0.224] | 0.132 [0.0461, 0.345] | 0.0926 [0.0194, 0.345] |
| concavity | |||
| Mean (SD) | 0.0461 (0.0434) | 0.161 (0.0750) | 0.0888 (0.0797) |
| Median [Min, Max] | 0.0371 [0.00, 0.411] | 0.151 [0.0240, 0.427] | 0.0615 [0.00, 0.427] |
| concave.points | |||
| Mean (SD) | 0.0257 (0.0159) | 0.0880 (0.0344) | 0.0489 (0.0388) |
| Median [Min, Max] | 0.0234 [0.00, 0.0853] | 0.0863 [0.0203, 0.201] | 0.0335 [0.00, 0.201] |
| symmetry | |||
| Mean (SD) | 0.174 (0.0248) | 0.193 (0.0276) | 0.181 (0.0274) |
| Median [Min, Max] | 0.171 [0.106, 0.274] | 0.190 [0.131, 0.304] | 0.179 [0.106, 0.304] |
| fractal | |||
| Mean (SD) | 0.0629 (0.00675) | 0.0627 (0.00757) | 0.0628 (0.00706) |
| Median [Min, Max] | 0.0615 [0.0519, 0.0958] | 0.0616 [0.0500, 0.0974] | 0.0615 [0.0500, 0.0974] |
Mô tả bằng biểu đồ
library(GGally)
## Warning: package 'GGally' was built under R version 3.6.1
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(ca, mapping = aes(color = group))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Trong task này, học viên sẽ dùng package BMA để tìm các biến số liên quan đến ung thư
library(BMA)
## Warning: package 'BMA' was built under R version 3.6.1
## Loading required package: survival
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.6.1
## Loading required package: robustbase
## Warning: package 'robustbase' was built under R version 3.6.1
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Warning: package 'inline' was built under R version 3.6.1
## Loading required package: rrcov
## Warning: package 'rrcov' was built under R version 3.6.1
## Scalable Robust Estimators with High Breakdown Point (version 1.4-7)
xvars = ca[, -1]
yvar = ca[, 1]
m = bic.glm(xvars, yvar, strict=F, OR=20, glm.family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(m)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = F, OR = 20)
##
##
## 19 models were selected
## Best 5 models (cumulative posterior probability = 0.5967 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -14.46144 10.48166 -1.675e+01 -3.153e+00
## radius 26.5 -0.41976 1.17434 . .
## texture 100.0 0.34766 0.06231 3.255e-01 3.380e-01
## perimeter 26.7 -0.07614 0.15863 . -3.252e-01
## area 89.1 0.01974 0.01669 7.776e-03 3.194e-02
## smooth 39.5 26.35500 38.40039 . .
## compact 1.2 -0.08537 1.18771 . .
## concavity 6.3 1.09586 4.48508 . .
## concave.points 95.0 91.60054 30.01639 1.016e+02 1.216e+02
## symmetry 16.5 3.11164 8.13363 . .
## fractal 0.0 0.00000 0.00000 . .
##
## nVar 3 4
## BIC -3.423e+03 -3.422e+03
## post prob 0.198 0.126
## model 3 model 4 model 5
## Intercept -2.368e+01 -4.643e+00 -5.328e-01
## radius . -2.888e+00 -2.382e+00
## texture 3.627e-01 3.732e-01 3.278e-01
## perimeter . . .
## area 1.034e-02 4.400e-02 3.537e-02
## smooth 5.947e+01 6.318e+01 .
## compact . . .
## concavity . . .
## concave.points 7.657e+01 8.111e+01 1.054e+02
## symmetry . . .
## fractal . . .
##
## nVar 4 5 4
## BIC -3.422e+03 -3.421e+03 -3.421e+03
## post prob 0.115 0.086 0.072
Biểu đồ cho các mô hình khả dĩ
imageplot.bma(m)
Đánh giá tầm quan trọng của các biến số dùng caret
m = glm(group ~ texture + area + concave.points, family=binomial, data=ca)
library(caret)
## Warning: package 'caret' was built under R version 3.6.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
varImp(m, scale=F)
## Overall
## texture 5.847354
## area 5.359023
## concave.points 7.740414
library(caret)
sample = createDataPartition(ca$group, p=0.6, list=FALSE)
training = ca [sample, ]
testing = ca [-sample, ]
mod.fit = train(group ~ texture + area + concave.points, data=training, method="glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod.fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.32631 -0.12665 -0.03701 0.01579 2.88040
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -17.907843 2.858944 -6.264 3.76e-10 ***
## texture 0.369296 0.086122 4.288 1.80e-05 ***
## area 0.007225 0.001896 3.810 0.000139 ***
## concave.points 112.708334 19.523987 5.773 7.80e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 453.19 on 342 degrees of freedom
## Residual deviance: 81.62 on 339 degrees of freedom
## AIC: 89.62
##
## Number of Fisher Scoring iterations: 8
Tính giá trị tiên lượng
pred = predict(mod.fit, newdata=testing, type="raw")
pred
## [1] 1 1 1 1 1 0 0 1 1 1 0 0 1 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 0 0 1 0
## [36] 0 1 1 0 1 1 1 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1
## [71] 1 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 0 0
## [106] 0 1 1 1 0 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 1 0 1 1 0 0 0
## [141] 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0
## [176] 0 0 0 0 0 0 0 1 0 1 1 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1
## [211] 1 0 1 0 1 0 0 0 0 0 0 1 1 1 1 0
## Levels: 0 1
confusionMatrix (testing$group, pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 135 7
## 1 10 74
##
## Accuracy : 0.9248
## 95% CI : (0.8823, 0.9556)
## No Information Rate : 0.6416
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8378
##
## Mcnemar's Test P-Value : 0.6276
##
## Sensitivity : 0.9310
## Specificity : 0.9136
## Pos Pred Value : 0.9507
## Neg Pred Value : 0.8810
## Prevalence : 0.6416
## Detection Rate : 0.5973
## Detection Prevalence : 0.6283
## Balanced Accuracy : 0.9223
##
## 'Positive' Class : 0
##
pROCCách tính AUC và vẽ biểu đồ ROC
#Tính xác suất xảy ra các biến cố ở bộ dữ liệu testing từ phương trình hồi quy
prob = predict(mod.fit, newdata= testing, type="prob")
#chuyển cách trình bày các số hạng như bình thường
options (scipen = 999)
head (prob)
## 0 1
## 7 0.004844253131 0.9951557
## 8 0.330591556394 0.6694084
## 12 0.143428872260 0.8565711
## 13 0.000006361996 0.9999936
## 19 0.000041882548 0.9999581
## 20 0.957898204338 0.0421018
Gán xác suất xảy ra biến cố theo phương trình tiên lượng từ bộ dữ liệu testing và giá trị testing.group (biến cố xảy ra trên thực tế).
pred = data.frame(prob, testing$group)
head(pred)
## X0 X1 testing.group
## 7 0.004844253131 0.9951557 1
## 8 0.330591556394 0.6694084 1
## 12 0.143428872260 0.8565711 1
## 13 0.000006361996 0.9999936 1
## 19 0.000041882548 0.9999581 1
## 20 0.957898204338 0.0421018 0
Tính ROC
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.1
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(pred$testing.group, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$testing.group, predictor = pred$X1)
##
## Data: pred$X1 in 142 controls (pred$testing.group 0) < 84 cases (pred$testing.group 1).
## Area under the curve: 0.9786
auc = smooth(roc(pred$testing.group, pred$X1))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(auc)
# Fit mô hình hồi qui logistic
library (rms)
## Warning: package 'rms' was built under R version 3.6.1
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.6.1
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
f = lrm(group ~ texture + area + concave.points, data= training)
# Dùng mô hình đã training để testing trên bộ dữ liệu mới
pred.logit = predict(f, newdata= testing)
reg= lrm (testing$group ~ pred.logit, x=T, y=T)
cal= calibrate (reg, B=1000, m= 500, method= 'boot')
# Vẽ biểu đồ calibration
plot (cal)
##
## n=226 Mean absolute error=0.01 Mean squared error=0.00024
## 0.9 Quantile of absolute error=0.023
library(rms)
ddist = datadist(ca)
options(datadist="ddist")
m = lrm(group ~ texture + area + concave.points, data=ca)
p = nomogram(m, fun=function(x)1/(1+exp(-x)), fun.at=c(0.001, 0.01, 0.05, seq(0.1, 1, by=0.1)), funlabel="Risk of cancer", lp=F)
plot(p, cex.axis=0.6, lmgp=0.2)
DynNom không tải lên Rpub được#m = glm(group ~ texture + area + concave.points, family=binomial, data=training)
#library(DynNom); library(shiny)
#DynNom(m, training)