#câu 1: hồi quy tuyến tính
library(MASS)
data(Boston)
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
set.seed(123)
n <- nrow(Boston)
train_index <- sample(1:n, size = 0.7*n)
train <- Boston[train_index, ]
test <- Boston[-train_index, ]
model_lm <- lm(medv ~ lstat + rm, data = train)
summary(model_lm)
##
## Call:
## lm(formula = medv ~ lstat + rm, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.468 -3.470 -1.154 1.753 27.410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.98609 3.82193 0.52 0.604
## lstat -0.66666 0.05145 -12.96 < 2e-16 ***
## rm 4.58297 0.54042 8.48 6.33e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.603 on 351 degrees of freedom
## Multiple R-squared: 0.6224, Adjusted R-squared: 0.6202
## F-statistic: 289.3 on 2 and 351 DF, p-value: < 2.2e-16
pred <- predict(model_lm, newdata = test)
MAE <- mean(abs(pred - test$medv))
MSE <- mean((pred - test$medv)^2)
MAE
## [1] 3.950841
MSE
## [1] 29.45709
#câu 2: hồi logictic dự đoán
library(ISLR)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data(Default)
str(Default)
## 'data.frame': 10000 obs. of 4 variables:
## $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
set.seed(123)
n <- nrow(Default)
train_index <- sample(1:n, size = 0.7*n)
train <- Default[train_index, ]
test <- Default[-train_index, ]
model_log <- glm(default ~ balance + income,
data = train,
family = binomial)
summary(model_log)
##
## Call:
## glm(formula = default ~ balance + income, family = binomial,
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.164e+01 5.202e-01 -22.375 < 2e-16 ***
## balance 5.692e-03 2.729e-04 20.857 < 2e-16 ***
## income 2.190e-05 5.981e-06 3.662 0.00025 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2043.8 on 6999 degrees of freedom
## Residual deviance: 1098.2 on 6997 degrees of freedom
## AIC: 1104.2
##
## Number of Fisher Scoring iterations: 8
prob <- predict(model_log, newdata = test, type = "response")
pred_class <- ifelse(prob > 0.5, "Yes", "No")
table(Predicted = pred_class, Actual = test$default)
## Actual
## Predicted No Yes
## No 2890 69
## Yes 10 31
roc_obj <- roc(test$default, prob)
## Setting levels: control = No, case = Yes
## Setting direction: controls < cases
plot(roc_obj, col="red", main="ROC Curve")

auc_value <- auc(roc_obj)
auc_value
## Area under the curve: 0.9473