# Exercise 1: multiple linear regression on the built-in iris data set.
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa

# Fit the linear model: sepal length explained by sepal width and petal length.
model <- lm(Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)

# Coefficient table, residual quantiles and goodness-of-fit statistics.
summary(model)
##
## Call:
## lm(formula = Sepal.Length ~ Sepal.Width + Petal.Length, data = iris)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.96159 -0.23489 0.00077 0.21453 0.78557
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.24914 0.24797 9.07 7.04e-16 ***
## Sepal.Width 0.59552 0.06933 8.59 1.16e-14 ***
## Petal.Length 0.47192 0.01712 27.57 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3333 on 147 degrees of freedom
## Multiple R-squared: 0.8402, Adjusted R-squared: 0.838
## F-statistic: 386.4 on 2 and 147 DF, p-value: < 2.2e-16

# Histogram of the model residuals.
hist(
  resid(model),
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue",
  border = "black"
)

# Normal Q-Q plot of the residuals, with a reference line, to check normality.
qqnorm(resid(model))
qqline(resid(model), col = "red")
# Exercise 2 (Bài 2)
# Install the data package only when it is missing. requireNamespace() checks
# availability without attaching anything; library() below does the loading
# and fails loudly if the package still cannot be found.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(titanic)
## Warning: package 'titanic' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice

# Keep the response and the four predictors, dropping rows with missing values.
titanic_df <- titanic::titanic_train
titanic_data <- titanic_df %>%
  select(Survived, Pclass, Sex, Age, Fare) %>%
  na.omit()

# Encode Sex numerically (male = 1, female = 2) and make the response a factor
# with levels "0" and "1".
titanic_data$Sex <- as.numeric(
  factor(titanic_data$Sex, levels = c("male", "female"))
)
titanic_data$Survived <- as.factor(titanic_data$Survived)
head(titanic_data)
## Survived Pclass Sex Age Fare
## 1 0 3 1 22 7.2500
## 2 1 1 2 38 71.2833
## 3 1 3 2 26 7.9250
## 4 1 1 2 35 53.1000
## 5 0 3 1 35 8.0500
## 7 0 1 1 54 51.8625

# Fit the logistic regression model.
logistic_model <- glm(
  Survived ~ Pclass + Sex + Age + Fare,
  data = titanic_data,
  family = binomial
)

# Model summary.
summary(logistic_model)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = binomial,
## data = titanic_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.0483534 0.5563719 -0.087 0.931
## Pclass -1.2697410 0.1586252 -8.005 1.20e-15 ***
## Sex 2.5181969 0.2078562 12.115 < 2e-16 ***
## Age -0.0367073 0.0076795 -4.780 1.75e-06 ***
## Fare 0.0005373 0.0021821 0.246 0.805
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 647.23 on 709 degrees of freedom
## AIC: 657.23
##
## Number of Fisher Scoring iterations: 5

# In-sample predicted survival probabilities, thresholded at 0.5.
pred <- predict(logistic_model, newdata = titanic_data, type = "response")

# Build the predicted classes with the same levels as the reference so that
# confusionMatrix() still works if every prediction lands in a single class.
pred_class <- factor(
  as.integer(pred > 0.5),
  levels = levels(titanic_data$Survived)
)

# F1-score for the survival event. caret treats the FIRST factor level ("0")
# as the positive class unless told otherwise, which previously reported the
# F1 of the non-survivor class (0.8283063); positive = "1" makes the metric
# refer to survivors.
conf_mat <- confusionMatrix(
  data = pred_class,
  reference = titanic_data$Survived,
  positive = "1",
  mode = "everything"
)
conf_mat$byClass["F1"]
## F1
## 0.7385159