Untitled

# Load the built-in airquality dataset into the workspace
data("airquality")

# Preview the first six rows
head(airquality, n = 6L)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

# Check for missing values: summary() reports an NA count for each column
summary(object = airquality)

##      Ozone           Solar.R           Wind             Temp      
##  Min.   :  1.00   Min.   :  7.0   Min.   : 1.700   Min.   :56.00  
##  1st Qu.: 18.00   1st Qu.:115.8   1st Qu.: 7.400   1st Qu.:72.00  
##  Median : 31.50   Median :205.0   Median : 9.700   Median :79.00  
##  Mean   : 42.13   Mean   :185.9   Mean   : 9.958   Mean   :77.88  
##  3rd Qu.: 63.25   3rd Qu.:258.8   3rd Qu.:11.500   3rd Qu.:85.00  
##  Max.   :168.00   Max.   :334.0   Max.   :20.700   Max.   :97.00  
##  NA's   :37       NA's   :7                                       
##      Month            Day      
##  Min.   :5.000   Min.   : 1.0  
##  1st Qu.:6.000   1st Qu.: 8.0  
##  Median :7.000   Median :16.0  
##  Mean   :6.993   Mean   :15.8  
##  3rd Qu.:8.000   3rd Qu.:23.0  
##  Max.   :9.000   Max.   :31.0  
##

# Drop every row that contains an NA (imputing the NAs is the alternative)
airquality <- na.omit(object = airquality)

# Fit a linear regression of Ozone on Solar.R, Wind and Temp
model <- lm(formula = Ozone ~ Solar.R + Wind + Temp, data = airquality)

# Display the fitted model's summary
print(summary(model))

## 
## Call:
## lm(formula = Ozone ~ Solar.R + Wind + Temp, data = airquality)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.485 -14.219  -3.551  10.097  95.619 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -64.34208   23.05472  -2.791  0.00623 ** 
## Solar.R       0.05982    0.02319   2.580  0.01124 *  
## Wind         -3.33359    0.65441  -5.094 1.52e-06 ***
## Temp          1.65209    0.25353   6.516 2.42e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.18 on 107 degrees of freedom
## Multiple R-squared:  0.6059, Adjusted R-squared:  0.5948 
## F-statistic: 54.83 on 3 and 107 DF,  p-value: < 2.2e-16

# Predict Ozone for every row of airquality
predictions <- predict(model, newdata = airquality)

# Mean absolute error between predicted and observed Ozone.
# NOTE(review): airquality is also the data passed to lm(), so this is an
# in-sample (training) error, not an estimate of out-of-sample accuracy.
mae <- mean(abs(predictions - airquality$Ozone))

# Print the result. cat() does not append a newline by itself, so end the line
# explicitly; otherwise the next console output is glued onto this one.
cat("Mean Absolute Error (MAE): ", mae, "\n", sep = "")

## Mean Absolute Error (MAE): 15.46776

# Install mlbench when it cannot be loaded
if (!require(mlbench)) {
  install.packages("mlbench", dependencies = TRUE)
}

## Loading required package: mlbench

## Warning: package 'mlbench' was built under R version 4.4.3

# Attach the mlbench package
library("mlbench")

# Load the PimaIndiansDiabetes dataset into the workspace
data("PimaIndiansDiabetes")

# Preview the first six rows
head(PimaIndiansDiabetes, n = 6L)

##   pregnant glucose pressure triceps insulin mass pedigree age diabetes
## 1        6     148       72      35       0 33.6    0.627  50      pos
## 2        1      85       66      29       0 26.6    0.351  31      neg
## 3        8     183       64       0       0 23.3    0.672  32      pos
## 4        1      89       66      23      94 28.1    0.167  21      neg
## 5        0     137       40      35     168 43.1    2.288  33      pos
## 6        5     116       74       0       0 25.6    0.201  30      neg

# Summarise every column of the dataset.
# NOTE(review): glucose, pressure, triceps, insulin and mass all report a
# minimum of 0 -- presumably a placeholder for missing measurements; confirm
# before modelling.
summary(object = PimaIndiansDiabetes)

##     pregnant         glucose         pressure         triceps     
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     insulin           mass          pedigree           age        diabetes 
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780   Min.   :21.00   neg:500  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437   1st Qu.:24.00   pos:268  
##  Median : 30.5   Median :32.00   Median :0.3725   Median :29.00            
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719   Mean   :33.24            
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262   3rd Qu.:41.00            
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200   Max.   :81.00

# Split the data into a training set (70%) and a test set (30%)
set.seed(123) # fixed seed so the split is reproducible
n_rows <- nrow(PimaIndiansDiabetes)

# seq_len() stays correct for a zero-row frame (1:0 would give c(1, 0)), and
# floor() states the integer training-set size explicitly rather than handing
# sample() the fractional value 0.7 * n_rows.
sample_index <- sample(seq_len(n_rows), size = floor(0.7 * n_rows))

train_data <- PimaIndiansDiabetes[sample_index, ]
# Select the complement by position: negative indexing with an empty
# sample_index would return zero rows instead of every row.
test_data <- PimaIndiansDiabetes[setdiff(seq_len(n_rows), sample_index), ]

# Fit a logistic regression of diabetes on all other columns of train_data
model <- glm(formula = diabetes ~ ., family = binomial, data = train_data)

# Display the fitted model's summary
print(summary(model))

## 
## Call:
## glm(formula = diabetes ~ ., family = binomial, data = train_data)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.405409   0.841872  -9.984  < 2e-16 ***
## pregnant     0.103471   0.037973   2.725  0.00643 ** 
## glucose      0.035730   0.004563   7.830 4.89e-15 ***
## pressure    -0.012707   0.006057  -2.098  0.03590 *  
## triceps      0.003563   0.008088   0.440  0.65959    
## insulin     -0.001710   0.001060  -1.613  0.10671    
## mass         0.088735   0.017954   4.942 7.72e-07 ***
## pedigree     0.696250   0.334761   2.080  0.03754 *  
## age          0.017015   0.011066   1.538  0.12415    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 694.17  on 536  degrees of freedom
## Residual deviance: 509.76  on 528  degrees of freedom
## AIC: 527.76
## 
## Number of Fisher Scoring iterations: 5

# Install pROC when it cannot be loaded
if (!require(pROC)) {
  install.packages("pROC", dependencies = TRUE)
}

## Loading required package: pROC

## Warning: package 'pROC' was built under R version 4.4.3

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

# Attach the pROC package
library("pROC")

# Predicted probabilities (type = "response") for the test set
probabilities <- predict(
  object = model,
  type = "response",
  newdata = test_data
)

# Build the ROC curve from the observed classes and the predicted probabilities
roc_curve <- roc(response = test_data$diabetes, predictor = probabilities)

## Setting levels: control = neg, case = pos

## Setting direction: controls < cases

# Area under the ROC curve
auc_value <- auc(roc_curve)

# Plot the ROC curve in blue, with the AUC (rounded to 3 decimals) in the title
roc_title <- paste("ROC Curve - AUC:", round(auc_value, 3))
plot(roc_curve, col = "blue", main = roc_title)

Untitled

Ngô Việt Kiên

2025-03-13