1 Thư viện sử dụng

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.1

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.1

## Warning: package 'tidyr' was built under R version 4.3.1

## Warning: package 'readr' was built under R version 4.3.1

## Warning: package 'purrr' was built under R version 4.3.1

## Warning: package 'dplyr' was built under R version 4.3.1

## Warning: package 'forcats' was built under R version 4.3.1

## Warning: package 'lubridate' was built under R version 4.3.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library (DescTools)

## Warning: package 'DescTools' was built under R version 4.3.1

library(epitools)
library(caret)

## Warning: package 'caret' was built under R version 4.3.1

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## 
## The following object is masked from 'package:purrr':
## 
##     lift

2 Giới thiệu về dữ liệu

Tập dữ liệu này ban đầu được zameen.com thu thập dưới dạng dữ iệu giá nhà ở Pakistan và đã được sử dụng một số kỹ thuật làm sạch dữ liệu để cung cấp tập dữ liệu đặc biệt cho Thành phố Islamabad

Mô tả dữ liệu: Bộ dữ liệu ta lấy được gồm có 153432 quan sát và 6 biến:

Property type: là các loại tài sản. Trong phần này, chúng ta có 6 loại khác nhau: House, FarmHouse, Upper Portion, Lower Portion, Flat, Room

Price: là giá của các loại tài sản

Baths: số phòng tắm

Purpose: mục đích của căn hộ

Bedrooms: số phòng ngủ

Area in Marla: khu vực ở Marla

3 Dữ liệu

setwd("D:/HN")
ED <- read.csv("EDA.csv", header=TRUE)
str(ED)

## 'data.frame':    153430 obs. of  7 variables:
##  $ X            : int  0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr  "Flat" "Flat" "House" "House" ...
##  $ price        : int  10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ baths        : int  2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr  "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : int  2 3 5 4 3 8 8 2 7 5 ...
##  $ Area_in_Marla: chr  "4" "5.6" "8" "40" ...

4 Thống kê mô tả biến phụ thuộc (biến Purpose)

4.1 Bảng tần số

table(ED$purpose)

## 
## For Rent For Sale 
##    43183   110247

prop.table(table(ED$purpose))

## 
##  For Rent  For Sale 
## 0.2814508 0.7185492

4.2 Đồ thị

ED  |> ggplot(aes(x = purpose, y = after_stat(count))) + geom_bar(fill = 'pink') + geom_text(aes(label = scales::percent(after_stat(count/sum(count)))), stat = 'count', color = 'black', vjust = - .5) + theme_classic() + labs(x = 'Mục đích mua nhà', y = 'Số người')

5 Ước lượng tỷ lệ

Từ thống kê, ước lượng tỷ lệ căn hộ có giá bán lớn hơn 1 tỷ đồng và đồng thời kiểm định xem tỷ lệ số căn hộ có giá bán lớn hơn 1 tỷ đồng có phải là 25% hay không. Ta kiểm định giả thuyết:

H0: Tỷ lệ căn hộ có giá bán lớn hơn 1 tỷ đồng là 25%

H1: Tỷ lệ căn hộ có giá bán lớn hơn 1 tỷ đồng không phải là 25%

a<-ED[ED$purpose == "For Rent",]
prop.test(length(a$purpose),length(ED$purpose),p= 0.28)

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$purpose) out of length(ED$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Ta có p-value > 5%, chưa đủ cơ sở bác bỏ giả thuyết H0. Do đó tỷ lệ số căn hộ với mục đích cho thuê bằng 28% với mức ý nghĩa 5%.

Khoảng ước lượng tỷ lệ số căn hộ với mục đích cho thuê với độ tin cậy 95% là (0,2792029;0,2837097)

6 Mô hình hồi quy

6.1 Hồi quy với hàm logit

ED$purpose<-as.factor(ED$purpose)
ED$property_type<-as.factor(ED$property_type)

logit <- glm(factor (purpose) ~ price + baths + bedrooms + property_type  , family = binomial(link = "logit"), data = ED)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(logit)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "logit"), data = ED)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -3.294e+00  5.512e-01  -5.977 2.28e-09 ***
## price                       5.445e-06  1.180e-07  46.142  < 2e-16 ***
## baths                      -9.078e-02  5.575e-02  -1.628   0.1035    
## bedrooms                   -3.956e-01  6.054e-02  -6.535 6.36e-11 ***
## property_typeFlat          -1.257e-01  5.520e-01  -0.228   0.8199    
## property_typeHouse         -2.907e+00  5.586e-01  -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00  6.158e-01  -4.194 2.74e-05 ***
## property_typePenthouse     -1.377e+00  1.050e+00  -1.311   0.1898    
## property_typeRoom          -1.881e+00  7.841e-01  -2.399   0.0164 *  
## property_typeUpper Portion -2.489e+00  5.946e-01  -4.186 2.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3259.2  on 153420  degrees of freedom
## AIC: 3279.2
## 
## Number of Fisher Scoring iterations: 16

Hàm hồi quy logistic sau:

logit =

6.2 Hồi quy với hàm probit

ED$purpose<-as.factor(ED$purpose)
ED$property_type<-as.factor(ED$property_type)

probit <- glm(factor (purpose) ~ price + baths + bedrooms + property_type  , family = binomial(link = "probit"), data = ED)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(probit)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "probit"), data = ED)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

6.3 Mô hình cloglog

 cloglog <- glm(factor (purpose) ~ price + baths + bedrooms + property_type  , family = binomial(link = "cloglog"), data = ED)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(cloglog)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "cloglog"), data = ED)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## property_typeLower Portion -6.547e-01  5.970e-01  -1.096 0.272863    
## property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006599 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

7 Lựa chọn mô hình

7.1 Chỉ số AIC

AIC(logit) = 3279,2

AIC(probit) = 3605,2

AIC(cloglog) = 4525,8

Mô hình logit có AIC nhỏ nhất nên ta chọn MH logit

7.2 Deviance

Deviance(logit) = 3259,2

Deviance(probit) = 3585,2

Deviance(cloglog) = 4505,8

Mô hình logit có deviance nhỏ nhất nên ta chọn MH này

7.3 Brier Score

library(DescTools)
BrierScore(logit)

## [1] 0.00189995

library(DescTools)
BrierScore(probit)

## [1] 0.002125317

library(DescTools)
BrierScore(cloglog)

## [1] 0.002385188

Dựa vào tiêu chí Brier score ta thấy MH logit có giá trị nhỏ nhất nên ta chọn MH logit

7.4 Ma trận nhầm lẫn

7.4.1 Mô hình logit

library(caret)
predictions <- predict(logit, newdata = ED, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0")  
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(ED$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43116    313
##            1     67 109934
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9973, 0.9978)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9939          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9984          
##             Specificity : 0.9972          
##          Pos Pred Value : 0.9928          
##          Neg Pred Value : 0.9994          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9978          
##                                           
##        'Positive' Class : 0               
##

7.4.2 Mô hình probit

library(caret)
predictions <- predict(probit, newdata = ED, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0")  
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(ED$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43111    322
##            1     72 109925
##                                           
##                Accuracy : 0.9974          
##                  95% CI : (0.9972, 0.9977)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9937          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9983          
##             Specificity : 0.9971          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9993          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9977          
##                                           
##        'Positive' Class : 0               
##

7.4.3 Mô hình cloglog

library(caret)
predictions <- predict(cloglog, newdata = ED, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0")  
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(ED$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43089    365
##            1     94 109882
##                                           
##                Accuracy : 0.997           
##                  95% CI : (0.9967, 0.9973)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9926          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9978          
##             Specificity : 0.9967          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9991          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2808          
##    Detection Prevalence : 0.2832          
##       Balanced Accuracy : 0.9973          
##                                           
##        'Positive' Class : 0               
##

MH cloglog có độ chính xác toàn thể là 99,73%, độ nhạy là 99,78% và độ hiệu quả là 99,67%

MH logit có độ chính xác toàn thể, độ nhạy, độ hiệu quả cao nhất trong 3 mô hình nên chọn mô hình logit.

8 Kết luận:

Dựa vào 4 tiêu chuẩn trên ta thấy MH logit là mô hình được chọn. Do đó mh logit là mô hình phù hợp nhất trong 3 mô hình.

BÀI TẬP

Hoài Ngọc

2023-07-26