## Question 1

# Load the built-in `airquality` data set into the workspace.
data(airquality)

# Keep only the rows that have no missing value in any column.
# NOTE(review): this filters on every column, not just the three used by the
# model below; lm() already drops rows that are incomplete for its own
# variables by default. Confirm that discarding the extra rows is intended.
airquality <- stats::na.omit(airquality)

# Multiple linear regression of temperature on wind speed and solar radiation.
model <- lm(
  formula = Temp ~ Wind + Solar.R,
  data = airquality
)

# Coefficient table and goodness-of-fit statistics (transcript follows).
summary(model)
## 
## Call:
## lm(formula = Temp ~ Wind + Solar.R, data = airquality)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -17.2714  -5.0237   0.5837   5.2545  18.4608 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 85.702275   2.925445  29.295  < 2e-16 ***
## Wind        -1.251870   0.217207  -5.763 7.89e-08 ***
## Solar.R      0.024533   0.008478   2.894  0.00461 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.039 on 108 degrees of freedom
## Multiple R-squared:  0.3014, Adjusted R-squared:  0.2884 
## F-statistic: 23.29 on 2 and 108 DF,  p-value: 3.886e-09
# Pull the residuals out of the fitted linear model (`model`).
# The object keeps the name `residuals` so that the "data:" line printed by
# shapiro.test() matches the transcript below.
residuals <- resid(model)

# Shapiro-Wilk test; the null hypothesis is that the residuals are normal.
# With p = 0.077 (see transcript) normality is not rejected at the 5% level.
shapiro.test(residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals
## W = 0.979, p-value = 0.07734

# Visual check of the residual distribution with a histogram.
hist(
  residuals,
  main = "Histogram of Residuals",
  xlab = "Residuals",
  col = "lightblue",
  border = "black"
)

## Question 2

# Packages: `titanic` supplies the data set, `caTools` supplies sample.split().
#install.packages("titanic")
#install.packages("caTools")
library(titanic)
## Warning: package 'titanic' was built under R version 4.4.3
library(caTools)
## Warning: package 'caTools' was built under R version 4.4.3

# Start from the Titanic training table and discard every row with an NA.
data <- na.omit(titanic::titanic_train)

# Store the outcome and the categorical predictors as factors so that glm()
# treats them as classes rather than as numbers or plain strings.
factor_cols <- c("Survived", "Pclass", "Sex")
data[factor_cols] <- lapply(data[factor_cols], as.factor)

# Reproducible 70/30 train/test split driven by the outcome column.
# `split` is a logical vector: TRUE marks training rows, FALSE test rows.
set.seed(123)
split <- sample.split(data$Survived, SplitRatio = 0.7)
train_data <- data[split, ]
test_data <- data[!split, ]

# Logistic regression of survival on class, sex, age and fare,
# fitted on the training rows only.
model <- glm(
  formula = Survived ~ Pclass + Sex + Age + Fare,
  family = binomial,
  data = train_data
)

# Coefficient table and deviance statistics (transcript follows).
summary(model)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + Fare, family = binomial, 
##     data = train_data)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.712e+00  5.435e-01   6.829 8.54e-12 ***
## Pclass2     -1.113e+00  3.734e-01  -2.980  0.00288 ** 
## Pclass3     -2.343e+00  3.824e-01  -6.127 8.96e-10 ***
## Sexmale     -2.521e+00  2.450e-01 -10.288  < 2e-16 ***
## Age         -3.995e-02  9.154e-03  -4.364 1.28e-05 ***
## Fare        -5.631e-05  2.484e-03  -0.023  0.98191    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 675.37  on 499  degrees of freedom
## Residual deviance: 459.39  on 494  degrees of freedom
## AIC: 471.39
## 
## Number of Fisher Scoring iterations: 4
# Evaluate the fitted logistic regression (`model`) on the held-out rows.
# type = "response" returns predicted probabilities instead of log-odds.
probabilities <- predict(model, newdata = test_data, type = "response")

# Classify with a 0.5 cut-off: 1 = predicted to survive, 0 = not.
predictions <- ifelse(probabilities > 0.5, 1, 0)

# test_data$Survived is a factor with labels "0"/"1". Comparing it directly
# with a numeric vector only works through an implicit text comparison of the
# labels, so turn the labels back into integer codes and compare numbers.
actual <- as.integer(as.character(test_data$Survived))

# Share of test rows whose predicted class equals the observed class.
accuracy <- mean(predictions == actual)
print(paste("Độ chính xác của mô hình:", round(accuracy * 100, 2), "%"))
## [1] "Độ chính xác của mô hình: 80.84 %"