# Load the built-in iris data set and preview its first six rows.
data("iris")
head(iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Fit a linear regression of sepal length on sepal width and petal length.
model <- lm(
  formula = Sepal.Length ~ Sepal.Width + Petal.Length,
  data = iris
)

# Report coefficients, residual quantiles and fit statistics.
summary(model)
## 
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.96159 -0.23489  0.00077  0.21453  0.78557 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.24914    0.24797    9.07 7.04e-16 ***
## Sepal.Width   0.59552    0.06933    8.59 1.16e-14 ***
## Petal.Length  0.47192    0.01712   27.57  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3333 on 147 degrees of freedom
## Multiple R-squared:  0.8402, Adjusted R-squared:  0.838 
## F-statistic: 386.4 on 2 and 147 DF,  p-value: < 2.2e-16
# Histogram of the model residuals, as a visual check of their distribution.
hist(
  resid(model),
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue",
  border = "black"
)

# Normal Q-Q plot of the residuals with a red reference line.
qqnorm(resid(model))
qqline(resid(model), col = "red")

# Bài 2 (Exercise 2)

# Install the titanic package only when it is missing. requireNamespace()
# tests availability without attaching the package; the library(titanic)
# call further down attaches it.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}
## Loading required package: titanic
## Warning: package 'titanic' was built under R version 4.4.3
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(titanic)  

# Start from the titanic_train data set shipped with the titanic package.
titanic_df <- titanic::titanic_train

# Keep the outcome and the four predictors, then drop every row that has a
# missing value in any of the kept columns.
titanic_data <- na.omit(
  select(titanic_df, Survived, Pclass, Sex, Age, Fare)
)

# Encode Sex numerically (male = 1, female = 2) and make the outcome a factor.
titanic_data[["Sex"]] <- as.numeric(
  factor(titanic_data[["Sex"]], levels = c("male", "female"))
)
titanic_data[["Survived"]] <- as.factor(titanic_data[["Survived"]])

# Preview the prepared data.
head(titanic_data)
##   Survived Pclass Sex Age    Fare
## 1        0      3   1  22  7.2500
## 2        1      1   2  38 71.2833
## 3        1      3   2  26  7.9250
## 4        1      1   2  35 53.1000
## 5        0      3   1  35  8.0500
## 7        0      1   1  54 51.8625
# Build the logistic regression model
# Predictors: passenger class, numeric sex code, age and fare; the binomial
# family makes this a logistic regression on the Survived factor.
logistic_model <- glm(
  formula = Survived ~ Pclass + Sex + Age + Fare,
  family = binomial,
  data = titanic_data
)

# Report the fitted coefficients and deviance statistics.
summary(logistic_model)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = binomial, 
##     data = titanic_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.0483534  0.5563719  -0.087    0.931    
## Pclass      -1.2697410  0.1586252  -8.005 1.20e-15 ***
## Sex          2.5181969  0.2078562  12.115  < 2e-16 ***
## Age         -0.0367073  0.0076795  -4.780 1.75e-06 ***
## Fare         0.0005373  0.0021821   0.246    0.805    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 647.23  on 709  degrees of freedom
## AIC: 657.23
## 
## Number of Fisher Scoring iterations: 5
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
# Predicted survival probabilities. The model is scored on the same rows it
# was fitted on, so the metric below is in-sample, not an out-of-sample
# estimate.
pred <- predict(logistic_model, newdata = titanic_data, type = "response")

# Classify at a 0.5 threshold. The levels are fixed to 0/1 so the factor always
# carries both classes, even when every prediction falls on one side of the
# threshold; otherwise its levels would not match the reference factor and
# confusionMatrix() would warn or fail, depending on the caret version.
pred_class <- factor(ifelse(pred > 0.5, 1, 0), levels = c(0, 1))

# Make sure the reference is a factor (a no-op when it already is one).
titanic_data$Survived <- as.factor(titanic_data$Survived)

# F1 score from the confusion matrix.
# NOTE(review): with two classes caret treats the first factor level as the
# positive class, so this is the F1 for class "0". It is spelled out here to
# make that visible; pass positive = "1" to score class "1" instead.
confusionMatrix(
  data = pred_class,
  reference = titanic_data$Survived,
  positive = "0",
  mode = "everything"
)$byClass["F1"]
##        F1 
## 0.8283063