library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.1

library(scales)

## Warning: package 'scales' was built under R version 4.3.1

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.1

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.1

## Warning: package 'tidyr' was built under R version 4.3.1

## Warning: package 'readr' was built under R version 4.3.1

## Warning: package 'purrr' was built under R version 4.3.1

## Warning: package 'forcats' was built under R version 4.3.1

## Warning: package 'lubridate' was built under R version 4.3.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ✔ readr     2.1.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ readr::col_factor() masks scales::col_factor()
## ✖ purrr::discard()    masks scales::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidyselect)

## Warning: package 'tidyselect' was built under R version 4.3.1

library(epitools)
library(DescTools)

## Warning: package 'DescTools' was built under R version 4.3.1

library(caTools)

## Warning: package 'caTools' was built under R version 4.3.1

library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.1

library(readxl)
# Load the analysis dataset from a local Excel workbook (absolute, machine-specific path).
# NOTE(review): the import reports a date where a numeric value was expected in
# cell G3081, and an unnamed first column is auto-named `...1` — confirm that
# neither affects the variables used in the analysis.
For_EDA_dataset_1 <- read_excel("C:/Users/Abc/Downloads/For_EDA_dataset 1.xlsx")

## Warning: Expecting numeric in G3081 / R3081C7: got a date

## New names:
## • `` -> `...1`

View(For_EDA_dataset_1)

# Pull the columns of interest out of the dataset into standalone vectors.
price <- For_EDA_dataset_1[["price"]]
property_type <- For_EDA_dataset_1[["property_type"]]
baths <- For_EDA_dataset_1[["baths"]]
bedrooms <- For_EDA_dataset_1[["bedrooms"]]
Area_in_Marla <- For_EDA_dataset_1[["Area_in_Marla"]]
purpose <- For_EDA_dataset_1[["purpose"]]

1. Thống kê mô tả cho biến purpose

Purpose: mục đích của căn hộ là biến định tính gồm 2 biểu hiện

table(For_EDA_dataset_1$purpose)

## 
## For Rent For Sale 
##    43183   110247

# Relative frequency of each purpose category (counts divided by their total).
prop.table(table(For_EDA_dataset_1$purpose))

## 
##  For Rent  For Sale 
## 0.2814508 0.7185492

# Bar chart of the number of observations per purpose category; each bar is
# labelled with its share of all observations, formatted as a percentage.
# The data frame is passed explicitly so the plot does not depend on a
# free-floating global `purpose` vector, and the title is supplied through the
# correctly spelled `title` argument of labs() so that it is actually drawn.
ggplot(For_EDA_dataset_1, aes(x = purpose, y = after_stat(count))) +
  geom_bar(fill = "brown") +
  geom_text(
    aes(label = percent(after_stat(count / sum(count)), accuracy = .01)),
    stat = "count",
    color = "white",
    vjust = 4
  ) +
  labs(
    title = "Đồ thị thể hiện số lượng người chia theo mục đích của căn hộ ",
    x = "mục đích của căn hộ",
    y = "số lượng"
  )

Nhận xét: Trong 153430 người được khảo sát có:

43183 người có mục đích căn hộ là For Rent chiếm tỷ lệ 28,15%

110247 người có mục đích căn hộ là For Sale chiếm tỷ lệ 71,85%

Như vậy, trong tổng số người được khảo sát, số người có mục đích căn hộ là For Sale nhiều hơn số người có mục đích căn hộ là For Rent 67064 người (khoảng 43,71% so với tổng số người được khảo sát).

2. Ước lượng tỷ lệ

Kiểm định xem tỷ lệ căn hộ với mục đích cho thuê có phải là 28% hay không (nghĩa là chúng ta kiểm định giả thuyết “H0: p = 0.28”).

# Keep only the rows whose purpose is "For Rent"; their count is the number of
# "successes" passed to the test below.
# NOTE(review): logical row-subsetting may retain NA rows if `purpose` contains
# missing values, which would inflate the count — confirm `purpose` has no NA.
a<-For_EDA_dataset_1[For_EDA_dataset_1$purpose == "For Rent",]
# One-sample test of H0: p = 0.28 (two-sided, with continuity correction),
# using the number of "For Rent" rows out of the total number of rows.
prop.test(length(a$purpose),length(For_EDA_dataset_1$purpose),p= 0.28)

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$purpose) out of length(For_EDA_dataset_1$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Với p-value = 0,2066 > 5%, chưa đủ cơ sở bác bỏ giả thuyết H0. Do đó, ở mức ý nghĩa 5%, có thể chấp nhận tỷ lệ số căn hộ với mục đích cho thuê là 28% (dữ liệu không mâu thuẫn với giả thuyết này).

Khoảng ước lượng tỷ lệ số căn hộ với mục đích cho thuê với độ tin cậy 95% là (0,2792029;0,2837097)

3. Mô hình hồi quy

3.1 Mô hình hồi quy logit

# Convert the response and the property type to factors before modelling.
For_EDA_dataset_1[["purpose"]] <- as.factor(For_EDA_dataset_1[["purpose"]])
For_EDA_dataset_1[["property_type"]] <- as.factor(For_EDA_dataset_1[["property_type"]])

# Binomial GLM with a logit link. With a factor response, glm() treats the
# first level as "failure" and the remaining level as "success", so the
# coefficients describe the probability of the second level of `purpose`.
# NOTE(review): this fit warns that fitted probabilities numerically 0 or 1
# occurred — check for (quasi-)separation before interpreting the estimates.
logit <- glm(
  factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "logit"),
  data = For_EDA_dataset_1
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(logit)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "logit"), data = For_EDA_dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -3.294e+00  5.512e-01  -5.977 2.28e-09 ***
## price                       5.445e-06  1.180e-07  46.142  < 2e-16 ***
## baths                      -9.078e-02  5.575e-02  -1.628   0.1035    
## bedrooms                   -3.956e-01  6.054e-02  -6.535 6.36e-11 ***
## property_typeFlat          -1.257e-01  5.520e-01  -0.228   0.8199    
## property_typeHouse         -2.907e+00  5.586e-01  -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00  6.158e-01  -4.194 2.74e-05 ***
## property_typePenthouse     -1.377e+00  1.050e+00  -1.311   0.1898    
## property_typeRoom          -1.881e+00  7.841e-01  -2.399   0.0164 *  
## property_typeUpper Portion -2.489e+00  5.946e-01  -4.186 2.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3259.2  on 153420  degrees of freedom
## AIC: 3279.2
## 
## Number of Fisher Scoring iterations: 16

3.2 Mô hình probit

# Binomial GLM with a probit link, using the same response and predictors as
# the logit model so the two fits can be compared directly.
# NOTE(review): this fit warns that fitted probabilities numerically 0 or 1
# occurred — check for (quasi-)separation before interpreting the estimates.
probit <- glm(
  factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "probit"),
  data = For_EDA_dataset_1
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(probit)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "probit"), data = For_EDA_dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

3.3 Mô hình cloglog

# Binomial GLM with a complementary log-log link, using the same response and
# predictors as the logit and probit models.
# NOTE(review): this fit warns "algorithm did not converge" and stops at 25
# Fisher scoring iterations, so its estimates, deviance and AIC should be
# treated with caution; consider raising the iteration limit via
# `control = glm.control(maxit = ...)` and re-checking the comparison.
cloglog <- glm(
  factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "cloglog"),
  data = For_EDA_dataset_1
)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(cloglog)

## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "cloglog"), data = For_EDA_dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## property_typeLower Portion -6.547e-01  5.970e-01  -1.096 0.272863    
## property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006599 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

4. Lựa chọn mô hình phù hợp

4.1 AIC

AIC(logit)

## [1] 3279.215

AIC(probit)

## [1] 3605.169

AIC(cloglog)

## [1] 4525.815

Từ chỉ số AIC của 3 mô hình trên ta thấy mô hình logit có chỉ số AIC thấp nhất (3279,215). Vì thế đối với tiêu chí đánh giá AIC thì mô hình logit là phù hợp để xem xét sự tác động của các yếu tố đến biến phụ thuộc Purpose hơn mô hình cloglog và probit.

4.2 Deviance

deviance(logit)

## [1] 3259.215

deviance(probit)

## [1] 3585.169

deviance(cloglog)

## [1] 4505.815

Từ chỉ số deviance của 3 mô hình trên ta thấy mô hình logit có chỉ số deviance thấp nhất (3259.215). Vì thế đối với tiêu chí đánh giá deviance thì mô hình logit là phù hợp để xem xét sự tác động của các yếu tố đến biến phụ thuộc Purpose hơn mô hình cloglog và probit.

4.3 Brier Score

BrierScore(logit)

## [1] 0.00189995

BrierScore(probit)

## [1] 0.002125317

BrierScore(cloglog)

## [1] 0.002385188

Từ chỉ số BrierScore của 3 mô hình trên ta thấy mô hình logit có chỉ số BrierScore thấp nhất (0.00189995). Vì thế đối với tiêu chí đánh giá BrierScore thì mô hình logit là phù hợp để xem xét sự tác động của các yếu tố đến biến phụ thuộc Purpose hơn mô hình cloglog và probit.

Thông qua các tiêu chí đánh giá mô hình AIC, Deviance và BrierScore thì mô hình phù hợp nhất để xem xét tác động của các yếu tố price, baths, bedrooms, property_type tới biến phụ thuộc purpose là mô hình hồi quy logit.

PTDLDT

Nguyễn Thị Lan Anh

2023-07-26