# baiKTKN3

# Cau_1 (Question 1): multiple linear regression on the iris data
library(ggplot2) 
library(car)

## Warning: package 'car' was built under R version 4.4.3

## Loading required package: carData

## Warning: package 'carData' was built under R version 4.4.3

# Load the built-in iris data set into the workspace.
data(iris)

# Ordinary least squares: sepal length explained by sepal width and petal
# length.
model_lm <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = iris
)

# Coefficient table, residual summary and goodness-of-fit statistics.
summary(model_lm)

## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96159 -0.23489  0.00077  0.21453  0.78557 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.24914    0.24797    9.07 7.04e-16 ***
## Sepal.Width   0.59552    0.06933    8.59 1.16e-14 ***
## Petal.Length  0.47192    0.01712   27.57  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3333 on 147 degrees of freedom
## Multiple R-squared:  0.8402, Adjusted R-squared:  0.838 
## F-statistic: 386.4 on 2 and 147 DF,  p-value: < 2.2e-16

# Residual diagnostics for model_lm: a histogram and a normal Q-Q plot.

# Install car only when it is missing. requireNamespace() checks availability
# without attaching the package, so library() below is the one place it is
# loaded, and it stops with an error if loading fails.
# NOTE: library(car) near the top of the script already needs car to be
# installed, so this guard only matters when this section is run on its own.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car", dependencies = TRUE)
}
library(car)

# Draw the two plots side by side. par() returns the settings it replaces;
# they are saved so the previous layout can be restored exactly afterwards
# instead of assuming it was a single panel.
old_par <- par(mfrow = c(1, 2))

hist(resid(model_lm), main = "Histogram of Residuals",
     xlab = "Residuals", col = "lightblue", breaks = 20)

# qqPlot() comes from car; the value it returned in the recorded run is
# printed below.
qqPlot(resid(model_lm), main = "Q-Q Plot of Residuals")

## [1] 107 136

# Put back the plotting layout that was active before this section.
par(old_par)

# Cau_2 (Question 2): logistic regression on the Titanic data.

# Install any missing package first. requireNamespace() only checks that a
# package is available; it does not attach it, so the library() calls below
# are the single place where each package is loaded, and they stop the script
# with an error if loading fails.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic", dependencies = TRUE)
}
if (!requireNamespace("caret", quietly = TRUE)) {
  install.packages("caret", dependencies = TRUE)
}

library(titanic)
library(caret)

# Console messages recorded while loading these packages in the original run:

## Loading required package: titanic

## Warning: package 'titanic' was built under R version 4.4.3

## Loading required package: caret

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: lattice

# Load the titanic_train data set into the workspace.
data(list = "titanic_train")

# Preview the first six rows.
head(titanic_train, n = 6L)

##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp Parch
## 1                             Braund, Mr. Owen Harris   male  22     1     0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 3                              Heikkinen, Miss. Laina female  26     0     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 5                            Allen, Mr. William Henry   male  35     0     0
## 6                                    Moran, Mr. James   male  NA     0     0
##             Ticket    Fare Cabin Embarked
## 1        A/5 21171  7.2500              S
## 2         PC 17599 71.2833   C85        C
## 3 STON/O2. 3101282  7.9250              S
## 4           113803 53.1000  C123        S
## 5           373450  8.0500              S
## 6           330877  8.4583              Q

# Keep the outcome plus the four predictors and drop every row that has a
# missing value in any of them.
df <- na.omit(titanic_train[c("Survived", "Pclass", "Sex", "Age", "Fare")])

# Encode sex numerically: 1 for "male", 0 otherwise.
df$Sex <- as.numeric(df$Sex == "male")

# The outcome must be a factor for classification.
df$Survived <- factor(df$Survived)

# Fixed seed so the partition below is reproducible.
set.seed(123)

# Partition on the outcome with p = 0.8: the selected rows form the training
# set and the remaining rows form the test set.
train_index <- createDataPartition(df$Survived, p = 0.8, list = FALSE)
train_data <- df[train_index, , drop = FALSE]
test_data <- df[-train_index, , drop = FALSE]

# Logistic regression (binomial family) of survival on passenger class, sex,
# age and fare, fitted on the training rows only.
model_logit <- glm(
  formula = Survived ~ Pclass + Sex + Age + Fare,
  family = binomial,
  data = train_data
)

# Coefficient estimates, significance tests and deviance summary.
summary(model_logit)

## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.261337   0.619074   6.883 5.84e-12 ***
## Pclass      -1.124291   0.173174  -6.492 8.46e-11 ***
## Sex         -2.422365   0.224278 -10.801  < 2e-16 ***
## Age         -0.025361   0.008367  -3.031  0.00244 ** 
## Fare         0.001491   0.002390   0.624  0.53278    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 772.45  on 571  degrees of freedom
## Residual deviance: 534.66  on 567  degrees of freedom
## AIC: 544.66
## 
## Number of Fisher Scoring iterations: 4

# Evaluate model_logit on the held-out test rows.

# Predicted probabilities on the response scale for each test row.
predicted_probs <- predict(model_logit, newdata = test_data, type = "response")

# Make sure the reference labels are a factor so their levels can be reused.
test_data$Survived <- as.factor(test_data$Survived)

# Classify at a 0.5 probability cut-off. The levels are taken from the
# reference factor so both factors always share the same level set; a plain
# as.factor() would yield a one-level factor if every prediction fell in a
# single class.
predicted_classes <- factor(
  ifelse(predicted_probs > 0.5, 1, 0),
  levels = levels(test_data$Survived)
)

# Cross-tabulate predictions against the true labels.
# NOTE: no `positive` argument is given, and the recorded output reports
# 'Positive' Class : 0, so Sensitivity, Precision, Recall etc. describe the
# "0" class. Pass positive = "1" if the metrics should describe class "1".
conf_matrix <- confusionMatrix(
  data = predicted_classes,
  reference = test_data$Survived
)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 73 16
##          1 11 42
##                                           
##                Accuracy : 0.8099          
##                  95% CI : (0.7355, 0.8708)
##     No Information Rate : 0.5915          
##     P-Value [Acc > NIR] : 2.405e-08       
##                                           
##                   Kappa : 0.6012          
##                                           
##  Mcnemar's Test P-Value : 0.4414          
##                                           
##             Sensitivity : 0.8690          
##             Specificity : 0.7241          
##          Pos Pred Value : 0.8202          
##          Neg Pred Value : 0.7925          
##              Prevalence : 0.5915          
##          Detection Rate : 0.5141          
##    Detection Prevalence : 0.6268          
##       Balanced Accuracy : 0.7966          
##                                           
##        'Positive' Class : 0               
##

# Pull precision and recall out of the per-class statistics stored in the
# confusion-matrix object (they refer to the positive class reported in the
# output above, which was "0" in the recorded run).
by_class <- conf_matrix$byClass
precision <- by_class["Precision"]
recall <- by_class["Recall"]

# F1 as 2PR / (P + R), computed from the two values above.
f1_score <- 2 * precision * recall / (precision + recall)

# Report each metric on its own line.
cat("Precision:", precision, "\n", sep = " ")

## Precision: 0.8202247

cat("Recall:", recall, "\n", sep = " ")

## Recall: 0.8690476

cat("F1-score:", f1_score, "\n", sep = " ")

## F1-score: 0.8439306

# baiKTKN3

# Hoàng Ngọc Hiệp

# 2025-03-13