t = file.choose()
t = read.csv(t)
str(t)
## 'data.frame': 520 obs. of 17 variables:
## $ Age : int 40 58 41 45 60 55 57 66 67 70 ...
## $ Gender : chr "Male" "Male" "Male" "Male" ...
## $ Polyuria : chr "No" "No" "Yes" "No" ...
## $ Polydipsia : chr "Yes" "No" "No" "No" ...
## $ sudden.weight.loss: chr "No" "No" "No" "Yes" ...
## $ weakness : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Polyphagia : chr "No" "No" "Yes" "Yes" ...
## $ Genital.thrush : chr "No" "No" "No" "Yes" ...
## $ visual.blurring : chr "No" "Yes" "No" "No" ...
## $ Itching : chr "Yes" "No" "Yes" "Yes" ...
## $ Irritability : chr "No" "No" "No" "No" ...
## $ delayed.healing : chr "Yes" "No" "Yes" "Yes" ...
## $ partial.paresis : chr "No" "Yes" "No" "No" ...
## $ muscle.stiffness : chr "Yes" "No" "Yes" "No" ...
## $ Alopecia : chr "Yes" "Yes" "Yes" "No" ...
## $ Obesity : chr "Yes" "No" "No" "No" ...
## $ diabetes : chr "Positive" "Positive" "Positive" "Positive" ...
head(t)
## Age Gender Polyuria Polydipsia sudden.weight.loss weakness Polyphagia
## 1 40 Male No Yes No Yes No
## 2 58 Male No No No Yes No
## 3 41 Male Yes No No Yes Yes
## 4 45 Male No No Yes Yes Yes
## 5 60 Male Yes Yes Yes Yes Yes
## 6 55 Male Yes Yes No Yes Yes
## Genital.thrush visual.blurring Itching Irritability delayed.healing
## 1 No No Yes No Yes
## 2 No Yes No No No
## 3 No No Yes No Yes
## 4 Yes No Yes No Yes
## 5 No Yes Yes Yes Yes
## 6 No Yes Yes No Yes
## partial.paresis muscle.stiffness Alopecia Obesity diabetes
## 1 No Yes Yes Yes Positive
## 2 Yes No Yes No Positive
## 3 No Yes Yes No Positive
## 4 No No No No Positive
## 5 Yes Yes Yes Yes Positive
## 6 No Yes Yes Yes Positive
summary(t)
## Age Gender Polyuria Polydipsia
## Min. :16.00 Length:520 Length:520 Length:520
## 1st Qu.:39.00 Class :character Class :character Class :character
## Median :47.50 Mode :character Mode :character Mode :character
## Mean :48.03
## 3rd Qu.:57.00
## Max. :90.00
## sudden.weight.loss weakness Polyphagia Genital.thrush
## Length:520 Length:520 Length:520 Length:520
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## visual.blurring Itching Irritability delayed.healing
## Length:520 Length:520 Length:520 Length:520
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## partial.paresis muscle.stiffness Alopecia Obesity
## Length:520 Length:520 Length:520 Length:520
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## diabetes
## Length:520
## Class :character
## Mode :character
##
##
##
names(t)
## [1] "Age" "Gender" "Polyuria"
## [4] "Polydipsia" "sudden.weight.loss" "weakness"
## [7] "Polyphagia" "Genital.thrush" "visual.blurring"
## [10] "Itching" "Irritability" "delayed.healing"
## [13] "partial.paresis" "muscle.stiffness" "Alopecia"
## [16] "Obesity" "diabetes"
w=lrm(diabetes ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data=t)
w
## Logistic Regression Model
##
## lrm(formula = diabetes ~ Gender + Polyuria + sudden.weight.loss +
## weakness + Genital.thrush + Itching + Irritability + delayed.healing +
## partial.paresis, data = t)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 520 LR chi2 430.95 R2 0.765 C 0.957
## Negative 200 d.f. 9 g 4.311 Dxy 0.915
## Positive 320 Pr(> chi2) <0.0001 gr 74.533 gamma 0.918
## max |deriv| 8e-12 gp 0.435 tau-a 0.434
## Brier 0.075
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 0.5925 0.3941 1.50 0.1327
## Gender=Male -3.2060 0.4162 -7.70 <0.0001
## Polyuria=Yes 3.8029 0.4307 8.83 <0.0001
## sudden.weight.loss=Yes 0.9844 0.3752 2.62 0.0087
## weakness=Yes 0.9498 0.3672 2.59 0.0097
## Genital.thrush=Yes 1.3886 0.4019 3.45 0.0006
## Itching=Yes -1.2417 0.3870 -3.21 0.0013
## Irritability=Yes 2.1830 0.4502 4.85 <0.0001
## delayed.healing=Yes -1.2103 0.3947 -3.07 0.0022
## partial.paresis=Yes 1.2234 0.3733 3.28 0.0010
##
t$diab = ifelse(t$diabetes == "Positive", 1, 0)
t$diab = as.factor(t$diab)
names(t)
## [1] "Age" "Gender" "Polyuria"
## [4] "Polydipsia" "sudden.weight.loss" "weakness"
## [7] "Polyphagia" "Genital.thrush" "visual.blurring"
## [10] "Itching" "Irritability" "delayed.healing"
## [13] "partial.paresis" "muscle.stiffness" "Alopecia"
## [16] "Obesity" "diabetes" "diab"
train = createDataPartition(t$diabetes, p=0.7, list=FALSE)
training = t[ train, ]
testing = t[ -train, ]
mod_fit = train(diabetes ~ Age + Gender + Polyuria + Polydipsia + sudden.weight.loss + weakness + Polyphagia + Genital.thrush + visual.blurring + Itching + Irritability + delayed.healing + partial.paresis + muscle.stiffness + Alopecia + Obesity, data = training, method = "glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod_fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.06263 -0.23639 0.00181 0.04180 2.44041
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.70161 1.28905 2.096 0.036099 *
## Age -0.04237 0.02972 -1.426 0.153967
## GenderMale -4.46648 0.72650 -6.148 7.85e-10 ***
## PolyuriaYes 4.51898 0.91153 4.958 7.14e-07 ***
## PolydipsiaYes 5.89860 1.18229 4.989 6.06e-07 ***
## sudden.weight.lossYes 0.64551 0.67618 0.955 0.339761
## weaknessYes 1.15610 0.70743 1.634 0.102212
## PolyphagiaYes 1.31278 0.69882 1.879 0.060304 .
## Genital.thrushYes 1.36060 0.73239 1.858 0.063204 .
## visual.blurringYes 0.92294 0.83818 1.101 0.270848
## ItchingYes -3.19500 0.91900 -3.477 0.000508 ***
## IrritabilityYes 2.34829 0.77620 3.025 0.002483 **
## delayed.healingYes -0.39214 0.75322 -0.521 0.602632
## partial.paresisYes 0.88263 0.65136 1.355 0.175402
## muscle.stiffnessYes -1.39432 0.80296 -1.736 0.082481 .
## AlopeciaYes -0.35692 0.76803 -0.465 0.642134
## ObesityYes -0.50606 0.68392 -0.740 0.459329
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 485.05 on 363 degrees of freedom
## Residual deviance: 114.13 on 347 degrees of freedom
## AIC: 148.13
##
## Number of Fisher Scoring iterations: 8
# Tính giá trị tiên lượng cho Testing
prob=predict(mod_fit, newdata=testing, type="prob")
pred = data.frame(prob,testing$diabetes)
head(pred)
## Negative Positive testing.diabetes
## 3 0.8644325267 0.1355675 Positive
## 5 0.0005341836 0.9994658 Positive
## 9 0.0004183163 0.9995817 Positive
## 10 0.0179608525 0.9820391 Positive
## 12 0.0087101329 0.9912899 Positive
## 15 0.0011150518 0.9988849 Positive
roc(pred$testing.diabetes,pred$Positive)
## Setting levels: control = Negative, case = Positive
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$testing.diabetes, predictor = pred$Positive)
##
## Data: pred$Positive in 60 controls (pred$testing.diabetes Negative) < 96 cases (pred$testing.diabetes Positive).
## Area under the curve: 0.9733
auc = smooth(roc(pred$testing.diabetes,pred$Positive))
## Setting levels: control = Negative, case = Positive
## Setting direction: controls < cases
plot(auc, col = "red")

xvars=t[, c("Age","Gender","Polyuria","sudden.weight.loss","weakness","Polyphagia", "Genital.thrush","visual.blurring","Itching","Irritability","delayed.healing","partial.paresis","muscle.stiffness","Alopecia","Obesity")]
yvar=t[,c("diab")]
s=bic.glm(xvars,yvar, strict=F, OR=20, glm.family = "binomial")
summary(s)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial", strict = F, OR = 20)
##
##
## 105 models were selected
## Best 5 models (cumulative posterior probability = 0.2082 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 0.849456 0.66473 0.5925 0.6779
## Age 15.9 -0.005758 0.01513 . .
## Gender 100.0 -3.082734 0.42672 -3.2060 -2.9827
## Polyuria 100.0 3.767369 0.44997 3.8029 3.6971
## sudden.weight.loss 78.9 0.904674 0.57823 0.9844 1.1608
## weakness 43.6 0.432996 0.55848 0.9498 .
## Polyphagia 9.7 0.062986 0.22617 . .
## Genital.thrush 94.6 1.369749 0.55423 1.3886 1.4128
## visual.blurring 53.5 0.644774 0.69409 . 1.3127
## Itching 93.8 -1.297677 0.54867 -1.2417 -1.4224
## Irritability 100.0 2.199678 0.48063 2.1830 2.2412
## delayed.healing 55.7 -0.619196 0.63872 -1.2103 .
## partial.paresis 69.7 0.767240 0.60550 1.2234 .
## muscle.stiffness 0.0 0.000000 0.00000 . .
## Alopecia 38.2 -0.386852 0.55703 . -1.0879
## Obesity 1.6 -0.007413 0.08033 . .
##
## nVar 9 8
## BIC -2927.4764 -2927.2613
## post prob 0.051 0.046
## model 3 model 4 model 5
## Intercept 0.5235 0.7202 0.7774
## Age . . .
## Gender -3.0891 -3.1016 -3.2848
## Polyuria 3.6490 3.6851 3.9569
## sudden.weight.loss 1.2574 1.2243 .
## weakness . . 1.1910
## Polyphagia . . .
## Genital.thrush 1.6071 1.2772 1.5067
## visual.blurring 1.0452 . .
## Itching -1.4100 -1.0008 -1.3217
## Irritability 2.3178 2.3240 2.1056
## delayed.healing -1.0927 -0.9629 -1.2162
## partial.paresis 0.9861 1.2169 1.2353
## muscle.stiffness . . .
## Alopecia . . .
## Obesity . . .
##
## nVar 9 8 8
## BIC -2926.9100 -2926.9000 -2926.7735
## post prob 0.038 0.038 0.036
w=lrm(diabetes ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data=t)
w
## Logistic Regression Model
##
## lrm(formula = diabetes ~ Gender + Polyuria + sudden.weight.loss +
## weakness + Genital.thrush + Itching + Irritability + delayed.healing +
## partial.paresis, data = t)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 520 LR chi2 430.95 R2 0.765 C 0.957
## Negative 200 d.f. 9 g 4.311 Dxy 0.915
## Positive 320 Pr(> chi2) <0.0001 gr 74.533 gamma 0.918
## max |deriv| 8e-12 gp 0.435 tau-a 0.434
## Brier 0.075
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept 0.5925 0.3941 1.50 0.1327
## Gender=Male -3.2060 0.4162 -7.70 <0.0001
## Polyuria=Yes 3.8029 0.4307 8.83 <0.0001
## sudden.weight.loss=Yes 0.9844 0.3752 2.62 0.0087
## weakness=Yes 0.9498 0.3672 2.59 0.0097
## Genital.thrush=Yes 1.3886 0.4019 3.45 0.0006
## Itching=Yes -1.2417 0.3870 -3.21 0.0013
## Irritability=Yes 2.1830 0.4502 4.85 <0.0001
## delayed.healing=Yes -1.2103 0.3947 -3.07 0.0022
## partial.paresis=Yes 1.2234 0.3733 3.28 0.0010
##
train = createDataPartition(t$diab, p=0.7, list=FALSE)
training = t[ train, ]
testing = t[ -train, ]
mod_fit = train(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data = training, method = "glm", family="binomial")
summary(mod_fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.84417 -0.34899 0.03826 0.26687 2.90991
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5541 0.4769 1.162 0.245287
## GenderMale -3.3220 0.5205 -6.382 1.74e-10 ***
## PolyuriaYes 4.0976 0.5350 7.660 1.87e-14 ***
## sudden.weight.lossYes 0.7400 0.4479 1.652 0.098555 .
## weaknessYes 0.9517 0.4617 2.061 0.039271 *
## Genital.thrushYes 1.1928 0.4843 2.463 0.013786 *
## ItchingYes -1.5116 0.4909 -3.079 0.002075 **
## IrritabilityYes 2.5211 0.6017 4.190 2.79e-05 ***
## delayed.healingYes -0.8913 0.4874 -1.829 0.067408 .
## partial.paresisYes 1.6771 0.4893 3.427 0.000609 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 485.05 on 363 degrees of freedom
## Residual deviance: 173.51 on 354 degrees of freedom
## AIC: 193.51
##
## Number of Fisher Scoring iterations: 7
pred = predict(mod_fit,newdata = testing, type = "raw")
confusionMatrix(testing$diab,pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 48 12
## 1 9 87
##
## Accuracy : 0.8654
## 95% CI : (0.8016, 0.9147)
## No Information Rate : 0.6346
## P-Value [Acc > NIR] : 1.058e-10
##
## Kappa : 0.7129
##
## Mcnemar's Test P-Value : 0.6625
##
## Sensitivity : 0.8421
## Specificity : 0.8788
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.9062
## Prevalence : 0.3654
## Detection Rate : 0.3077
## Detection Prevalence : 0.3846
## Balanced Accuracy : 0.8604
##
## 'Positive' Class : 0
##
#xvars=t[, c("Gender","Polyuria","sudden.weight.loss","weakness","Genital.thrush","Itching","Irritability","delayed.healing","partial.paresis")]
#yvar=t[,c("diab")]
#b=glm(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, family=binomial,data=t)
#plot_ly(t, x = xvars, y = yvar)
#DynNom(b,t)
ddist = datadist(t)
options(datadist = "ddist")
m = lrm(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis,data=t)
p = nomogram(m, fun = function(x) 1/(1+exp(-x)), fun.at = c(0.01, 0.05, seq(0.1, 0.9, by = 0.1)), funlabel = "Risk of diabetes",lp = F)
plot(p, cex.axis = 0.7, lmgp = 0.1)
