## Question 1
# Multiple linear regression of temperature on wind speed and solar
# radiation, followed by a normality check of the model residuals.

# Load the built-in dataset and drop every row that has a missing value
# in any column.
data("airquality")
airquality <- na.omit(airquality)

# Fit Temp ~ Wind + Solar.R on the complete rows and print the summary.
model <- lm(formula = Temp ~ Wind + Solar.R, data = airquality)
summary(model)
##
## Call:
## lm(formula = Temp ~ Wind + Solar.R, data = airquality)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.2714 -5.0237 0.5837 5.2545 18.4608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.702275 2.925445 29.295 < 2e-16 ***
## Wind -1.251870 0.217207 -5.763 7.89e-08 ***
## Solar.R 0.024533 0.008478 2.894 0.00461 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.039 on 108 degrees of freedom
## Multiple R-squared: 0.3014, Adjusted R-squared: 0.2884
## F-statistic: 23.29 on 2 and 108 DF, p-value: 3.886e-09

# Shapiro-Wilk normality test on the residuals of the fitted model.
residuals <- resid(model)
shapiro.test(residuals)
##
## Shapiro-Wilk normality test
##
## data: residuals
## W = 0.979, p-value = 0.07734

# Visual check using a histogram of the residuals
hist(
  residuals,
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue",
  border = "black"
)
## Question 2
# Logistic regression of passenger survival on the Titanic training set,
# evaluated on a 30% hold-out sample.

# install.packages("titanic")
library(titanic)
## Warning: package 'titanic' was built under R version 4.4.3
# install.packages("caTools")
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3

# Keep only rows without NA values, then convert the categorical
# variables to factors.
data <- na.omit(titanic::titanic_train)
factor_cols <- c("Survived", "Pclass", "Sex")
data[factor_cols] <- lapply(data[factor_cols], as.factor)

# Train/test split (SplitRatio = 0.7); the seed is fixed immediately
# before the random split so the partition is reproducible.
set.seed(123)
split <- sample.split(data$Survived, SplitRatio = 0.7)
train_data <- data[split, ]
test_data <- data[!split, ]

# Fit the binomial GLM on the training rows and print the summary.
model <- glm(
  formula = Survived ~ Pclass + Sex + Age + Fare,
  family = binomial,
  data = train_data
)
summary(model)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = binomial,
## data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.712e+00 5.435e-01 6.829 8.54e-12 ***
## Pclass2 -1.113e+00 3.734e-01 -2.980 0.00288 **
## Pclass3 -2.343e+00 3.824e-01 -6.127 8.96e-10 ***
## Sexmale -2.521e+00 2.450e-01 -10.288 < 2e-16 ***
## Age -3.995e-02 9.154e-03 -4.364 1.28e-05 ***
## Fare -5.631e-05 2.484e-03 -0.023 0.98191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 675.37 on 499 degrees of freedom
## Residual deviance: 459.39 on 494 degrees of freedom
## AIC: 471.39
##
## Number of Fisher Scoring iterations: 4

# Predicted probabilities on the hold-out rows, thresholded at 0.5 to
# obtain 0/1 class predictions.
probabilities <- predict(model, newdata = test_data, type = "response")
predictions <- ifelse(probabilities > 0.5, 1, 0)

# Share of hold-out rows whose predicted class equals the observed one
# (the factor labels "0"/"1" are compared against the numeric 0/1).
accuracy <- mean(predictions == test_data$Survived)
accuracy_pct <- round(accuracy * 100, 2)
print(paste("Độ chính xác của mô hình:", accuracy_pct, "%"))
## [1] "Độ chính xác của mô hình: 80.84 %"