Câu 1: Hồi quy tuyến tính với Boston

library(MASS)

# Load the Boston data set that ships with MASS into the workspace and
# preview its first six rows.
data("Boston", package = "MASS")
head(Boston, n = 6L)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Least-squares linear model of medv on the two predictors crim and indus,
# fitted on the Boston data; summary() prints the coefficient table and
# the overall fit statistics.
model_lm <- lm(
  formula = medv ~ crim + indus,
  data = Boston
)
summary(model_lm)
## 
## Call:
## lm(formula = medv ~ crim + indus, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.011  -4.876  -1.683   3.024  32.491 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.24829    0.67046  43.624  < 2e-16 ***
## crim        -0.24548    0.04434  -5.536 4.99e-08 ***
## indus       -0.52335    0.05559  -9.414  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.83 on 503 degrees of freedom
## Multiple R-squared:  0.278,  Adjusted R-squared:  0.2751 
## F-statistic: 96.83 on 2 and 503 DF,  p-value: < 2.2e-16

Câu 2: Logistic Regression với PimaIndiansDiabetes

library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Bring the PimaIndiansDiabetes data set from mlbench into the workspace
# and preview its first six rows.
data("PimaIndiansDiabetes", package = "mlbench")
head(PimaIndiansDiabetes, n = 6L)
##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg
# Fix the RNG seed so the random partition below is reproducible.
set.seed(123)

# Row indices for the training set: 70% of the rows, drawn by caret's
# createDataPartition() on the diabetes outcome. list = FALSE returns the
# indices as a matrix instead of a list.
trainIndex <- createDataPartition(
  y = PimaIndiansDiabetes$diabetes,
  p = 0.7,
  list = FALSE
)

# Selected rows form the training set; every remaining row is held out
# for testing.
train_data <- PimaIndiansDiabetes[trainIndex, ]
test_data <- PimaIndiansDiabetes[-trainIndex, ]
# Logistic regression (binomial GLM) of diabetes on every other column of
# the training set; summary() prints the coefficient table and deviances.
model_logit <- glm(
  formula = diabetes ~ .,
  family = binomial,
  data = train_data
)
summary(model_logit)
## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.3554422  0.8507536  -9.821  < 2e-16 ***
## pregnant     0.0854045  0.0383843   2.225  0.02608 *  
## glucose      0.0309183  0.0042536   7.269 3.63e-13 ***
## pressure    -0.0131469  0.0057847  -2.273  0.02304 *  
## triceps     -0.0080329  0.0082505  -0.974  0.33024    
## insulin      0.0004474  0.0010825   0.413  0.67941    
## mass         0.0955280  0.0180400   5.295 1.19e-07 ***
## pedigree     1.1678180  0.3634601   3.213  0.00131 ** 
## age          0.0252281  0.0112810   2.236  0.02533 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 696.28  on 537  degrees of freedom
## Residual deviance: 513.98  on 529  degrees of freedom
## AIC: 531.98
## 
## Number of Fisher Scoring iterations: 5
# Score the held-out rows with the fitted logistic model.
# type = "response" returns fitted probabilities rather than values on the
# link (log-odds) scale; for a factor outcome these are the probabilities
# of the non-first level (per the stats::family documentation).
prob_pred <- predict(
  object = model_logit,
  newdata = test_data,
  type = "response"
)

threshold = 0.3

# Classify as "pos" when the predicted probability exceeds 0.3, else "neg".
pred_03 <- ifelse(prob_pred > 0.3, "pos", "neg")

# Build the prediction factor with the reference's own levels so both
# factors always share the same levels in the same order, even if this
# cut-off happens to produce only one of the two labels.
# NOTE(review): `positive` is not supplied, so caret uses the first factor
# level ("neg") as the positive class; Sensitivity/Specificity are reported
# from that point of view. Pass positive = "pos" if the statistics should
# describe detection of diabetes.
confusionMatrix(
  data = factor(pred_03, levels = levels(test_data$diabetes)),
  reference = test_data$diabetes
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 111  19
##        pos  39  61
##                                           
##                Accuracy : 0.7478          
##                  95% CI : (0.6865, 0.8026)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.001163        
##                                           
##                   Kappa : 0.4748          
##                                           
##  Mcnemar's Test P-Value : 0.012602        
##                                           
##             Sensitivity : 0.7400          
##             Specificity : 0.7625          
##          Pos Pred Value : 0.8538          
##          Neg Pred Value : 0.6100          
##              Prevalence : 0.6522          
##          Detection Rate : 0.4826          
##    Detection Prevalence : 0.5652          
##       Balanced Accuracy : 0.7512          
##                                           
##        'Positive' Class : neg             
## 

threshold = 0.5

# Classify as "pos" when the predicted probability exceeds 0.5, else "neg".
pred_05 <- ifelse(prob_pred > 0.5, "pos", "neg")

# Build the prediction factor with the reference's own levels so both
# factors always share the same levels in the same order, even if this
# cut-off happens to produce only one of the two labels.
# NOTE(review): `positive` is not supplied, so caret uses the first factor
# level ("neg") as the positive class; Sensitivity/Specificity are reported
# from that point of view. Pass positive = "pos" if the statistics should
# describe detection of diabetes.
confusionMatrix(
  data = factor(pred_05, levels = levels(test_data$diabetes)),
  reference = test_data$diabetes
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 130  33
##        pos  20  47
##                                           
##                Accuracy : 0.7696          
##                  95% CI : (0.7097, 0.8224)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 7.748e-05       
##                                           
##                   Kappa : 0.4721          
##                                           
##  Mcnemar's Test P-Value : 0.09929         
##                                           
##             Sensitivity : 0.8667          
##             Specificity : 0.5875          
##          Pos Pred Value : 0.7975          
##          Neg Pred Value : 0.7015          
##              Prevalence : 0.6522          
##          Detection Rate : 0.5652          
##    Detection Prevalence : 0.7087          
##       Balanced Accuracy : 0.7271          
##                                           
##        'Positive' Class : neg             
## 

threshold = 0.7

# Classify as "pos" when the predicted probability exceeds 0.7, else "neg".
pred_07 <- ifelse(prob_pred > 0.7, "pos", "neg")

# Build the prediction factor with the reference's own levels so both
# factors always share the same levels in the same order, even if this
# cut-off happens to produce only one of the two labels.
# NOTE(review): `positive` is not supplied, so caret uses the first factor
# level ("neg") as the positive class; Sensitivity/Specificity are reported
# from that point of view. Pass positive = "pos" if the statistics should
# describe detection of diabetes.
confusionMatrix(
  data = factor(pred_07, levels = levels(test_data$diabetes)),
  reference = test_data$diabetes
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction neg pos
##        neg 143  50
##        pos   7  30
##                                           
##                Accuracy : 0.7522          
##                  95% CI : (0.6912, 0.8066)
##     No Information Rate : 0.6522          
##     P-Value [Acc > NIR] : 0.0007075       
##                                           
##                   Kappa : 0.3754          
##                                           
##  Mcnemar's Test P-Value : 2.651e-08       
##                                           
##             Sensitivity : 0.9533          
##             Specificity : 0.3750          
##          Pos Pred Value : 0.7409          
##          Neg Pred Value : 0.8108          
##              Prevalence : 0.6522          
##          Detection Rate : 0.6217          
##    Detection Prevalence : 0.8391          
##       Balanced Accuracy : 0.6642          
##                                           
##        'Positive' Class : neg             
##