Câu 1: Hồi quy tuyến tính với Boston
library(MASS)
# Load the Boston data set (provided by the attached MASS package) and
# print its first six rows as a quick sanity check.
data("Boston")
head(Boston, n = 6L)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Fit a linear model of medv on crim and indus using the Boston data,
# then print the coefficient table and fit statistics.
model_lm <- lm(
  formula = medv ~ crim + indus,
  data = Boston
)
summary(model_lm)
##
## Call:
## lm(formula = medv ~ crim + indus, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.011 -4.876 -1.683 3.024 32.491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.24829 0.67046 43.624 < 2e-16 ***
## crim -0.24548 0.04434 -5.536 4.99e-08 ***
## indus -0.52335 0.05559 -9.414 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.83 on 503 degrees of freedom
## Multiple R-squared: 0.278, Adjusted R-squared: 0.2751
## F-statistic: 96.83 on 2 and 503 DF, p-value: < 2.2e-16
Câu 2: Logistic Regression với PimaIndiansDiabetes
library(mlbench)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Load the PimaIndiansDiabetes data set and print its first six rows.
data("PimaIndiansDiabetes")
head(PimaIndiansDiabetes, n = 6L)
## pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1 6 148 72 35 0 33.6 0.627 50 pos
## 2 1 85 66 29 0 26.6 0.351 31 neg
## 3 8 183 64 0 0 23.3 0.672 32 pos
## 4 1 89 66 23 94 28.1 0.167 21 neg
## 5 0 137 40 35 168 43.1 2.288 33 pos
## 6 5 116 74 0 0 25.6 0.201 30 neg
# Fix the RNG seed so the random train/test partition is reproducible.
set.seed(123)

# Draw row indices for a 70% training sample based on the diabetes outcome;
# list = FALSE returns the indices as a matrix instead of a list.
trainIndex <- createDataPartition(
  y = PimaIndiansDiabetes$diabetes,
  p = 0.7,
  list = FALSE
)

# Selected rows train the model; all remaining rows are held out for testing.
train_data <- PimaIndiansDiabetes[trainIndex, ]
test_data <- PimaIndiansDiabetes[-trainIndex, ]

# Logistic regression of diabetes on every other column of the training set.
model_logit <- glm(
  formula = diabetes ~ .,
  family = binomial,
  data = train_data
)
summary(model_logit)
##
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.3554422 0.8507536 -9.821 < 2e-16 ***
## pregnant 0.0854045 0.0383843 2.225 0.02608 *
## glucose 0.0309183 0.0042536 7.269 3.63e-13 ***
## pressure -0.0131469 0.0057847 -2.273 0.02304 *
## triceps -0.0080329 0.0082505 -0.974 0.33024
## insulin 0.0004474 0.0010825 0.413 0.67941
## mass 0.0955280 0.0180400 5.295 1.19e-07 ***
## pedigree 1.1678180 0.3634601 3.213 0.00131 **
## age 0.0252281 0.0112810 2.236 0.02533 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 696.28 on 537 degrees of freedom
## Residual deviance: 513.98 on 529 degrees of freedom
## AIC: 531.98
##
## Number of Fisher Scoring iterations: 5
# Score the held-out rows. type = "response" returns fitted probabilities on
# the response scale rather than log-odds; for a factor outcome, glm models
# the probability of not being the first level (here "pos").
prob_pred <- predict(model_logit, newdata = test_data, type = "response")
# Classify each test row as "pos" when its predicted probability exceeds the
# cutoff. The cutoff is stored once and reused in the rule so the stated
# threshold and the applied threshold cannot drift apart.
threshold <- 0.3
pred_03 <- ifelse(prob_pred > threshold, "pos", "neg")

# Build the prediction factor with the reference's own levels so both classes
# are always present, even if a cutoff predicts only one class.
# NOTE(review): with two levels caret uses the first one ("neg") as the
# positive class, so the reported Sensitivity refers to "neg"; pass
# positive = "pos" if detecting diabetes is the metric of interest.
confusionMatrix(
  factor(pred_03, levels = levels(test_data$diabetes)),
  test_data$diabetes
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 111 19
## pos 39 61
##
## Accuracy : 0.7478
## 95% CI : (0.6865, 0.8026)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.001163
##
## Kappa : 0.4748
##
## Mcnemar's Test P-Value : 0.012602
##
## Sensitivity : 0.7400
## Specificity : 0.7625
## Pos Pred Value : 0.8538
## Neg Pred Value : 0.6100
## Prevalence : 0.6522
## Detection Rate : 0.4826
## Detection Prevalence : 0.5652
## Balanced Accuracy : 0.7512
##
## 'Positive' Class : neg
##
# Classify each test row as "pos" when its predicted probability exceeds the
# cutoff. The cutoff is stored once and reused in the rule so the stated
# threshold and the applied threshold cannot drift apart.
threshold <- 0.5
pred_05 <- ifelse(prob_pred > threshold, "pos", "neg")

# Build the prediction factor with the reference's own levels so both classes
# are always present, even if a cutoff predicts only one class.
# NOTE(review): with two levels caret uses the first one ("neg") as the
# positive class, so the reported Sensitivity refers to "neg"; pass
# positive = "pos" if detecting diabetes is the metric of interest.
confusionMatrix(
  factor(pred_05, levels = levels(test_data$diabetes)),
  test_data$diabetes
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 130 33
## pos 20 47
##
## Accuracy : 0.7696
## 95% CI : (0.7097, 0.8224)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 7.748e-05
##
## Kappa : 0.4721
##
## Mcnemar's Test P-Value : 0.09929
##
## Sensitivity : 0.8667
## Specificity : 0.5875
## Pos Pred Value : 0.7975
## Neg Pred Value : 0.7015
## Prevalence : 0.6522
## Detection Rate : 0.5652
## Detection Prevalence : 0.7087
## Balanced Accuracy : 0.7271
##
## 'Positive' Class : neg
##
# Classify each test row as "pos" when its predicted probability exceeds the
# cutoff. The cutoff is stored once and reused in the rule so the stated
# threshold and the applied threshold cannot drift apart.
threshold <- 0.7
pred_07 <- ifelse(prob_pred > threshold, "pos", "neg")

# Build the prediction factor with the reference's own levels so both classes
# are always present, even if a cutoff predicts only one class.
# NOTE(review): with two levels caret uses the first one ("neg") as the
# positive class, so the reported Sensitivity refers to "neg"; pass
# positive = "pos" if detecting diabetes is the metric of interest.
confusionMatrix(
  factor(pred_07, levels = levels(test_data$diabetes)),
  test_data$diabetes
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction neg pos
## neg 143 50
## pos 7 30
##
## Accuracy : 0.7522
## 95% CI : (0.6912, 0.8066)
## No Information Rate : 0.6522
## P-Value [Acc > NIR] : 0.0007075
##
## Kappa : 0.3754
##
## Mcnemar's Test P-Value : 2.651e-08
##
## Sensitivity : 0.9533
## Specificity : 0.3750
## Pos Pred Value : 0.7409
## Neg Pred Value : 0.8108
## Prevalence : 0.6522
## Detection Rate : 0.6217
## Detection Prevalence : 0.8391
## Balanced Accuracy : 0.6642
##
## 'Positive' Class : neg
##