library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.1

## Warning: package 'ggplot2' was built under R version 4.3.1

## Warning: package 'readr' was built under R version 4.3.1

## Warning: package 'forcats' was built under R version 4.3.1

## Warning: package 'lubridate' was built under R version 4.3.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

TH<-read.csv("C:/Users/LU UYEN/Downloads/data267.csv")
library(ggplot2)
library(dplyr)
library(DescTools)

## Warning: package 'DescTools' was built under R version 4.3.1

library(caret)

## Warning: package 'caret' was built under R version 4.3.1

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## 
## The following object is masked from 'package:purrr':
## 
##     lift

1 Thống kê mô tả

table(TH$purpose)

## 
## For Rent For Sale 
##    43183   110247

table(TH$purpose)/sum(table(TH$purpose))

## 
##  For Rent  For Sale 
## 0.2814508 0.7185492

Dựa vào kết quả của bảng tần số và bảng tần suất purpose thì ta thấy trong 153430 căn hộ thì có 43183 căn hộ cho thuê chiếm khoảng 28.1%, có 110247 căn hộ rao bán chiếm khoảng 71.9%.

Đồ thị cột của biến purpose

TH |> ggplot(aes(TH$purpose)) +
  geom_bar(color = 'blue', fill = 'blue') + theme_classic() + labs(x = '', y = 'Tần số')

## Warning: Use of `TH$purpose` is discouraged.
## ℹ Use `purpose` instead.

2 Ước lượng tỷ lệ

S<- TH[TH$purpose == 'For Sale',]
prop.test( length(S$purpose), length(TH$purpose))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(S$purpose) out of length(TH$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.7162903 0.7207971
## sample estimates:
##         p 
## 0.7185492

Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích bán nhà nằm trong khoảng từ 71,63% đến 72,08%.

S <- TH[TH$purpose == 'For Rent',]
prop.test( length(S$purpose), length(TH$purpose))

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(S$purpose) out of length(TH$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích cho thuê nhà nằm trong khoảng từ 27,92% đến 28,37%.

3 Mô hình hồi quy

3.1 Mô hình logit

logit <- glm(factor (purpose) ~ TH$property_type + TH$price  + TH$bedrooms++ TH$baths , family = binomial(link = "logit"), data = TH)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(logit)

## 
## Call:
## glm(formula = factor(purpose) ~ TH$property_type + TH$price + 
##     TH$bedrooms + +TH$baths, family = binomial(link = "logit"), 
##     data = TH)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -3.294e+00  5.512e-01  -5.977 2.28e-09 ***
## TH$property_typeFlat          -1.257e-01  5.520e-01  -0.228   0.8199    
## TH$property_typeHouse         -2.907e+00  5.586e-01  -5.205 1.94e-07 ***
## TH$property_typeLower Portion -2.582e+00  6.158e-01  -4.194 2.74e-05 ***
## TH$property_typePenthouse     -1.377e+00  1.050e+00  -1.311   0.1898    
## TH$property_typeRoom          -1.881e+00  7.841e-01  -2.399   0.0164 *  
## TH$property_typeUpper Portion -2.489e+00  5.946e-01  -4.186 2.84e-05 ***
## TH$price                       5.445e-06  1.180e-07  46.142  < 2e-16 ***
## TH$bedrooms                   -3.956e-01  6.054e-02  -6.535 6.36e-11 ***
## TH$baths                      -9.078e-02  5.575e-02  -1.628   0.1035    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3259.2  on 153420  degrees of freedom
## AIC: 3279.2
## 
## Number of Fisher Scoring iterations: 16

3.2 Mô hình probit

probit <- glm(factor (purpose) ~ TH$property_type + TH$price + TH$baths + TH$bedrooms , family = binomial(link = "probit"), data = TH)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(probit)

## 
## Call:
## glm(formula = factor(purpose) ~ TH$property_type + TH$price + 
##     TH$baths + TH$bedrooms, family = binomial(link = "probit"), 
##     data = TH)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## TH$property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## TH$property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## TH$property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## TH$property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## TH$property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## TH$property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## TH$price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## TH$baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## TH$bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

3.3 Mô hình clolog

cloglog <- glm(factor (purpose) ~ TH$property_type + TH$price + TH$baths + TH$bedrooms , family = binomial(link = "cloglog"), data = TH)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(cloglog)

## 
## Call:
## glm(formula = factor(purpose) ~ TH$property_type + TH$price + 
##     TH$baths + TH$bedrooms, family = binomial(link = "cloglog"), 
##     data = TH)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## TH$property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## TH$property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## TH$property_typeLower Portion -6.546e-01  5.970e-01  -1.096 0.272889    
## TH$property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## TH$property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## TH$property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006598 ** 
## TH$price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## TH$baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## TH$bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

4 Lựa chọn mô hình

4.1 Tiêu chí AIC

AIC(logit) = 3279.2

AIC(probit) = 3605.2

AIC(cloglog) = 4525.8

->Mô hình logit có AIC nhỏ nhất nên ta chọn mô hình logit

4.2 Deviance

Deviance(logit) = 3259.2

Deviance(probit) = 3585.2

Deviance(cloglog) = 4505.8

->Mô hình logit có deviance nhỏ nhất nên ta chọn mô hình logit.

4.3 Brier Score

BrierScore(logit)

## [1] 0.00189995

BrierScore(probit)

## [1] 0.002125317

BrierScore(cloglog)

## [1] 0.002385184

->Dựa vào tiêu chí Brier score ta thấy mô hình logit có giá trị nhỏ nhất nên ta chọn mô hình logit.

4.4 Ma trận nhầm lần

4.4.1 Mô hình logit

predictions <- predict(logit, newdata = TH, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0")  
predictions1 <- factor(predicted_classes, levels = c("0","1"))
actual<- factor(TH$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43116    313
##            1     67 109934
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9973, 0.9978)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9939          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9984          
##             Specificity : 0.9972          
##          Pos Pred Value : 0.9928          
##          Neg Pred Value : 0.9994          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9978          
##                                           
##        'Positive' Class : 0               
##

Mô hình logit có độ chính xác toàn thể là 99,75%, độ nhạy là 99,84% và độ hiệu quả là 99,72%.

4.4.2 Mô hình probit

predictions <- predict(probit, newdata = TH, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0") 
predictions1 <- factor(predicted_classes, levels = c("0","1"))
actual<- factor(TH$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43111    322
##            1     72 109925
##                                           
##                Accuracy : 0.9974          
##                  95% CI : (0.9972, 0.9977)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9937          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9983          
##             Specificity : 0.9971          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9993          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9977          
##                                           
##        'Positive' Class : 0               
##

Mô hình probit có độ chính xác toàn thể là 99,74%, độ nhạy là 99,83% và độ hiệu quả là 99,71%.

4.4.3 Mô hình cloglog

predictions <- predict(cloglog, newdata = TH, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0") 
predictions1 <- factor(predicted_classes, levels = c("0","1"))
actual<- factor(TH$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43089    365
##            1     94 109882
##                                           
##                Accuracy : 0.997           
##                  95% CI : (0.9967, 0.9973)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9926          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9978          
##             Specificity : 0.9967          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9991          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2808          
##    Detection Prevalence : 0.2832          
##       Balanced Accuracy : 0.9973          
##                                           
##        'Positive' Class : 0               
##

Mô hình cloglog có độ chính xác toàn thể là 99,70%, độ nhạy là 99,78% và độ hiệu quả là 99,67%.

->Mô hình logit có độ chính xác toàn thể, độ nhạy, độ hiệu quả cao nhất trong 3 mô hình nên chọn mô hình logit.

-Kết luận: Dựa vào 4 tiêu chuẩn trên ta thấy mô hình logit là mô hình được chọn. Do đó mô hình logit là mô hình phù hợp nhất trong 3 mô hình.

Bài tập TH

Lữ Thị Bảo Uyên

2023-07-26