#câu 1: hồi quy tuyến tính
 
library(MASS)

data(Boston)

str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(123)
n <- nrow(Boston)
train_index <- sample(1:n, size = 0.7*n)

train <- Boston[train_index, ]
test  <- Boston[-train_index, ]

model_lm <- lm(medv ~ lstat + rm, data = train)

summary(model_lm)
## 
## Call:
## lm(formula = medv ~ lstat + rm, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -12.468  -3.470  -1.154   1.753  27.410 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.98609    3.82193    0.52    0.604    
## lstat       -0.66666    0.05145  -12.96  < 2e-16 ***
## rm           4.58297    0.54042    8.48 6.33e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.603 on 351 degrees of freedom
## Multiple R-squared:  0.6224, Adjusted R-squared:  0.6202 
## F-statistic: 289.3 on 2 and 351 DF,  p-value: < 2.2e-16
pred <- predict(model_lm, newdata = test)

MAE <- mean(abs(pred - test$medv))

MSE <- mean((pred - test$medv)^2)

MAE
## [1] 3.950841
MSE
## [1] 29.45709
#câu 2: hồi logictic dự đoán


library(ISLR)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data(Default)

str(Default)
## 'data.frame':    10000 obs. of  4 variables:
##  $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
set.seed(123)
n <- nrow(Default)
train_index <- sample(1:n, size = 0.7*n)

train <- Default[train_index, ]
test  <- Default[-train_index, ]

model_log <- glm(default ~ balance + income,
                 data = train,
                 family = binomial)

summary(model_log)
## 
## Call:
## glm(formula = default ~ balance + income, family = binomial, 
##     data = train)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.164e+01  5.202e-01 -22.375  < 2e-16 ***
## balance      5.692e-03  2.729e-04  20.857  < 2e-16 ***
## income       2.190e-05  5.981e-06   3.662  0.00025 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2043.8  on 6999  degrees of freedom
## Residual deviance: 1098.2  on 6997  degrees of freedom
## AIC: 1104.2
## 
## Number of Fisher Scoring iterations: 8
prob <- predict(model_log, newdata = test, type = "response")

pred_class <- ifelse(prob > 0.5, "Yes", "No")

table(Predicted = pred_class, Actual = test$default)
##          Actual
## Predicted   No  Yes
##       No  2890   69
##       Yes   10   31
roc_obj <- roc(test$default, prob)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, col="red", main="ROC Curve")

auc_value <- auc(roc_obj)
auc_value
## Area under the curve: 0.9473