t = file.choose()
t = read.csv(t)
str(t)
## 'data.frame':    520 obs. of  17 variables:
##  $ Age               : int  40 58 41 45 60 55 57 66 67 70 ...
##  $ Gender            : chr  "Male" "Male" "Male" "Male" ...
##  $ Polyuria          : chr  "No" "No" "Yes" "No" ...
##  $ Polydipsia        : chr  "Yes" "No" "No" "No" ...
##  $ sudden.weight.loss: chr  "No" "No" "No" "Yes" ...
##  $ weakness          : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ Polyphagia        : chr  "No" "No" "Yes" "Yes" ...
##  $ Genital.thrush    : chr  "No" "No" "No" "Yes" ...
##  $ visual.blurring   : chr  "No" "Yes" "No" "No" ...
##  $ Itching           : chr  "Yes" "No" "Yes" "Yes" ...
##  $ Irritability      : chr  "No" "No" "No" "No" ...
##  $ delayed.healing   : chr  "Yes" "No" "Yes" "Yes" ...
##  $ partial.paresis   : chr  "No" "Yes" "No" "No" ...
##  $ muscle.stiffness  : chr  "Yes" "No" "Yes" "No" ...
##  $ Alopecia          : chr  "Yes" "Yes" "Yes" "No" ...
##  $ Obesity           : chr  "Yes" "No" "No" "No" ...
##  $ diabetes          : chr  "Positive" "Positive" "Positive" "Positive" ...
head(t)
##   Age Gender Polyuria Polydipsia sudden.weight.loss weakness Polyphagia
## 1  40   Male       No        Yes                 No      Yes         No
## 2  58   Male       No         No                 No      Yes         No
## 3  41   Male      Yes         No                 No      Yes        Yes
## 4  45   Male       No         No                Yes      Yes        Yes
## 5  60   Male      Yes        Yes                Yes      Yes        Yes
## 6  55   Male      Yes        Yes                 No      Yes        Yes
##   Genital.thrush visual.blurring Itching Irritability delayed.healing
## 1             No              No     Yes           No             Yes
## 2             No             Yes      No           No              No
## 3             No              No     Yes           No             Yes
## 4            Yes              No     Yes           No             Yes
## 5             No             Yes     Yes          Yes             Yes
## 6             No             Yes     Yes           No             Yes
##   partial.paresis muscle.stiffness Alopecia Obesity diabetes
## 1              No              Yes      Yes     Yes Positive
## 2             Yes               No      Yes      No Positive
## 3              No              Yes      Yes      No Positive
## 4              No               No       No      No Positive
## 5             Yes              Yes      Yes     Yes Positive
## 6              No              Yes      Yes     Yes Positive
summary(t)
##       Age           Gender            Polyuria          Polydipsia       
##  Min.   :16.00   Length:520         Length:520         Length:520        
##  1st Qu.:39.00   Class :character   Class :character   Class :character  
##  Median :47.50   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :48.03                                                           
##  3rd Qu.:57.00                                                           
##  Max.   :90.00                                                           
##  sudden.weight.loss   weakness          Polyphagia        Genital.thrush    
##  Length:520         Length:520         Length:520         Length:520        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  visual.blurring      Itching          Irritability       delayed.healing   
##  Length:520         Length:520         Length:520         Length:520        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  partial.paresis    muscle.stiffness     Alopecia           Obesity         
##  Length:520         Length:520         Length:520         Length:520        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    diabetes        
##  Length:520        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
names(t)
##  [1] "Age"                "Gender"             "Polyuria"          
##  [4] "Polydipsia"         "sudden.weight.loss" "weakness"          
##  [7] "Polyphagia"         "Genital.thrush"     "visual.blurring"   
## [10] "Itching"            "Irritability"       "delayed.healing"   
## [13] "partial.paresis"    "muscle.stiffness"   "Alopecia"          
## [16] "Obesity"            "diabetes"
w=lrm(diabetes ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data=t)
w
## Logistic Regression Model
##  
##  lrm(formula = diabetes ~ Gender + Polyuria + sudden.weight.loss + 
##      weakness + Genital.thrush + Itching + Irritability + delayed.healing + 
##      partial.paresis, data = t)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           520    LR chi2     430.95    R2       0.765    C       0.957    
##   Negative     200    d.f.             9    g        4.311    Dxy     0.915    
##   Positive     320    Pr(> chi2) <0.0001    gr      74.533    gamma   0.918    
##  max |deriv| 8e-12                          gp       0.435    tau-a   0.434    
##                                             Brier    0.075                     
##  
##                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept               0.5925 0.3941  1.50  0.1327  
##  Gender=Male            -3.2060 0.4162 -7.70  <0.0001 
##  Polyuria=Yes            3.8029 0.4307  8.83  <0.0001 
##  sudden.weight.loss=Yes  0.9844 0.3752  2.62  0.0087  
##  weakness=Yes            0.9498 0.3672  2.59  0.0097  
##  Genital.thrush=Yes      1.3886 0.4019  3.45  0.0006  
##  Itching=Yes            -1.2417 0.3870 -3.21  0.0013  
##  Irritability=Yes        2.1830 0.4502  4.85  <0.0001 
##  delayed.healing=Yes    -1.2103 0.3947 -3.07  0.0022  
##  partial.paresis=Yes     1.2234 0.3733  3.28  0.0010  
## 
t$diab = ifelse(t$diabetes == "Positive", 1, 0)
t$diab = as.factor(t$diab)
names(t)
##  [1] "Age"                "Gender"             "Polyuria"          
##  [4] "Polydipsia"         "sudden.weight.loss" "weakness"          
##  [7] "Polyphagia"         "Genital.thrush"     "visual.blurring"   
## [10] "Itching"            "Irritability"       "delayed.healing"   
## [13] "partial.paresis"    "muscle.stiffness"   "Alopecia"          
## [16] "Obesity"            "diabetes"           "diab"
train = createDataPartition(t$diabetes, p=0.7, list=FALSE)
training =  t[ train, ]
testing =  t[ -train, ]
mod_fit = train(diabetes ~ Age + Gender + Polyuria + Polydipsia + sudden.weight.loss + weakness + Polyphagia + Genital.thrush + visual.blurring + Itching + Irritability + delayed.healing + partial.paresis + muscle.stiffness + Alopecia + Obesity, data = training, method = "glm", family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mod_fit)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -3.06263  -0.23639   0.00181   0.04180   2.44041  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.70161    1.28905   2.096 0.036099 *  
## Age                   -0.04237    0.02972  -1.426 0.153967    
## GenderMale            -4.46648    0.72650  -6.148 7.85e-10 ***
## PolyuriaYes            4.51898    0.91153   4.958 7.14e-07 ***
## PolydipsiaYes          5.89860    1.18229   4.989 6.06e-07 ***
## sudden.weight.lossYes  0.64551    0.67618   0.955 0.339761    
## weaknessYes            1.15610    0.70743   1.634 0.102212    
## PolyphagiaYes          1.31278    0.69882   1.879 0.060304 .  
## Genital.thrushYes      1.36060    0.73239   1.858 0.063204 .  
## visual.blurringYes     0.92294    0.83818   1.101 0.270848    
## ItchingYes            -3.19500    0.91900  -3.477 0.000508 ***
## IrritabilityYes        2.34829    0.77620   3.025 0.002483 ** 
## delayed.healingYes    -0.39214    0.75322  -0.521 0.602632    
## partial.paresisYes     0.88263    0.65136   1.355 0.175402    
## muscle.stiffnessYes   -1.39432    0.80296  -1.736 0.082481 .  
## AlopeciaYes           -0.35692    0.76803  -0.465 0.642134    
## ObesityYes            -0.50606    0.68392  -0.740 0.459329    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 485.05  on 363  degrees of freedom
## Residual deviance: 114.13  on 347  degrees of freedom
## AIC: 148.13
## 
## Number of Fisher Scoring iterations: 8
# Tính giá trị tiên lượng cho Testing
prob=predict(mod_fit, newdata=testing, type="prob")
pred = data.frame(prob,testing$diabetes)
head(pred)
##        Negative  Positive testing.diabetes
## 3  0.8644325267 0.1355675         Positive
## 5  0.0005341836 0.9994658         Positive
## 9  0.0004183163 0.9995817         Positive
## 10 0.0179608525 0.9820391         Positive
## 12 0.0087101329 0.9912899         Positive
## 15 0.0011150518 0.9988849         Positive
roc(pred$testing.diabetes,pred$Positive)
## Setting levels: control = Negative, case = Positive
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = pred$testing.diabetes, predictor = pred$Positive)
## 
## Data: pred$Positive in 60 controls (pred$testing.diabetes Negative) < 96 cases (pred$testing.diabetes Positive).
## Area under the curve: 0.9733
auc = smooth(roc(pred$testing.diabetes,pred$Positive))
## Setting levels: control = Negative, case = Positive
## Setting direction: controls < cases
plot(auc, col = "red")

xvars=t[, c("Age","Gender","Polyuria","sudden.weight.loss","weakness","Polyphagia", "Genital.thrush","visual.blurring","Itching","Irritability","delayed.healing","partial.paresis","muscle.stiffness","Alopecia","Obesity")]
yvar=t[,c("diab")]
s=bic.glm(xvars,yvar, strict=F, OR=20, glm.family = "binomial")
summary(s)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial",     strict = F, OR = 20)
## 
## 
##   105  models were selected
##  Best  5  models (cumulative posterior probability =  0.2082 ): 
## 
##                     p!=0    EV        SD       model 1     model 2   
## Intercept           100     0.849456  0.66473      0.5925      0.6779
## Age                  15.9  -0.005758  0.01513       .           .    
## Gender              100.0  -3.082734  0.42672     -3.2060     -2.9827
## Polyuria            100.0   3.767369  0.44997      3.8029      3.6971
## sudden.weight.loss   78.9   0.904674  0.57823      0.9844      1.1608
## weakness             43.6   0.432996  0.55848      0.9498       .    
## Polyphagia            9.7   0.062986  0.22617       .           .    
## Genital.thrush       94.6   1.369749  0.55423      1.3886      1.4128
## visual.blurring      53.5   0.644774  0.69409       .          1.3127
## Itching              93.8  -1.297677  0.54867     -1.2417     -1.4224
## Irritability        100.0   2.199678  0.48063      2.1830      2.2412
## delayed.healing      55.7  -0.619196  0.63872     -1.2103       .    
## partial.paresis      69.7   0.767240  0.60550      1.2234       .    
## muscle.stiffness      0.0   0.000000  0.00000       .           .    
## Alopecia             38.2  -0.386852  0.55703       .         -1.0879
## Obesity               1.6  -0.007413  0.08033       .           .    
##                                                                      
## nVar                                                 9           8   
## BIC                                            -2927.4764  -2927.2613
## post prob                                          0.051       0.046 
##                     model 3     model 4     model 5   
## Intercept               0.5235      0.7202      0.7774
## Age                      .           .           .    
## Gender                 -3.0891     -3.1016     -3.2848
## Polyuria                3.6490      3.6851      3.9569
## sudden.weight.loss      1.2574      1.2243       .    
## weakness                 .           .          1.1910
## Polyphagia               .           .           .    
## Genital.thrush          1.6071      1.2772      1.5067
## visual.blurring         1.0452       .           .    
## Itching                -1.4100     -1.0008     -1.3217
## Irritability            2.3178      2.3240      2.1056
## delayed.healing        -1.0927     -0.9629     -1.2162
## partial.paresis         0.9861      1.2169      1.2353
## muscle.stiffness         .           .           .    
## Alopecia                 .           .           .    
## Obesity                  .           .           .    
##                                                       
## nVar                      9           8           8   
## BIC                 -2926.9100  -2926.9000  -2926.7735
## post prob               0.038       0.038       0.036
w=lrm(diabetes ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data=t)
w
## Logistic Regression Model
##  
##  lrm(formula = diabetes ~ Gender + Polyuria + sudden.weight.loss + 
##      weakness + Genital.thrush + Itching + Irritability + delayed.healing + 
##      partial.paresis, data = t)
##  
##                         Model Likelihood    Discrimination    Rank Discrim.    
##                               Ratio Test           Indexes          Indexes    
##  Obs           520    LR chi2     430.95    R2       0.765    C       0.957    
##   Negative     200    d.f.             9    g        4.311    Dxy     0.915    
##   Positive     320    Pr(> chi2) <0.0001    gr      74.533    gamma   0.918    
##  max |deriv| 8e-12                          gp       0.435    tau-a   0.434    
##                                             Brier    0.075                     
##  
##                         Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept               0.5925 0.3941  1.50  0.1327  
##  Gender=Male            -3.2060 0.4162 -7.70  <0.0001 
##  Polyuria=Yes            3.8029 0.4307  8.83  <0.0001 
##  sudden.weight.loss=Yes  0.9844 0.3752  2.62  0.0087  
##  weakness=Yes            0.9498 0.3672  2.59  0.0097  
##  Genital.thrush=Yes      1.3886 0.4019  3.45  0.0006  
##  Itching=Yes            -1.2417 0.3870 -3.21  0.0013  
##  Irritability=Yes        2.1830 0.4502  4.85  <0.0001 
##  delayed.healing=Yes    -1.2103 0.3947 -3.07  0.0022  
##  partial.paresis=Yes     1.2234 0.3733  3.28  0.0010  
## 
train = createDataPartition(t$diab, p=0.7, list=FALSE)
training = t[ train, ]
testing = t[ -train, ]
mod_fit = train(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, data = training, method = "glm", family="binomial")
summary(mod_fit)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.84417  -0.34899   0.03826   0.26687   2.90991  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             0.5541     0.4769   1.162 0.245287    
## GenderMale             -3.3220     0.5205  -6.382 1.74e-10 ***
## PolyuriaYes             4.0976     0.5350   7.660 1.87e-14 ***
## sudden.weight.lossYes   0.7400     0.4479   1.652 0.098555 .  
## weaknessYes             0.9517     0.4617   2.061 0.039271 *  
## Genital.thrushYes       1.1928     0.4843   2.463 0.013786 *  
## ItchingYes             -1.5116     0.4909  -3.079 0.002075 ** 
## IrritabilityYes         2.5211     0.6017   4.190 2.79e-05 ***
## delayed.healingYes     -0.8913     0.4874  -1.829 0.067408 .  
## partial.paresisYes      1.6771     0.4893   3.427 0.000609 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 485.05  on 363  degrees of freedom
## Residual deviance: 173.51  on 354  degrees of freedom
## AIC: 193.51
## 
## Number of Fisher Scoring iterations: 7
pred = predict(mod_fit,newdata = testing, type = "raw")
confusionMatrix(testing$diab,pred)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 48 12
##          1  9 87
##                                           
##                Accuracy : 0.8654          
##                  95% CI : (0.8016, 0.9147)
##     No Information Rate : 0.6346          
##     P-Value [Acc > NIR] : 1.058e-10       
##                                           
##                   Kappa : 0.7129          
##                                           
##  Mcnemar's Test P-Value : 0.6625          
##                                           
##             Sensitivity : 0.8421          
##             Specificity : 0.8788          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.9062          
##              Prevalence : 0.3654          
##          Detection Rate : 0.3077          
##    Detection Prevalence : 0.3846          
##       Balanced Accuracy : 0.8604          
##                                           
##        'Positive' Class : 0               
## 
#xvars=t[, c("Gender","Polyuria","sudden.weight.loss","weakness","Genital.thrush","Itching","Irritability","delayed.healing","partial.paresis")]
#yvar=t[,c("diab")]
#b=glm(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis, family=binomial,data=t)
#plot_ly(t, x = xvars, y = yvar)
#DynNom(b,t)
ddist = datadist(t)
options(datadist = "ddist")
m = lrm(diab ~ Gender + Polyuria + sudden.weight.loss + weakness + Genital.thrush + Itching + Irritability + delayed.healing + partial.paresis,data=t)
p = nomogram(m, fun = function(x) 1/(1+exp(-x)), fun.at = c(0.01, 0.05, seq(0.1, 0.9, by = 0.1)), funlabel = "Risk of diabetes",lp = F)
plot(p, cex.axis = 0.7, lmgp = 0.1)