https://drive.google.com/file/d/1oHM9X9j0zg_bgaM4qS7oGVhWaPYJf6_O/view?usp=drive_link

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(epitools)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
library(DT)
## Warning: package 'DT' was built under R version 4.3.1
library(energy)
## Warning: package 'energy' was built under R version 4.3.1
options(digits = 4)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
## 
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE

Dữ liệu

# Load the property-listing workbook into `EDA` and inspect its structure.
# The reader warns that cell G3081 (row 3081, column 7) holds a date where a
# number was expected, and the unnamed first column is auto-named `...1`.
library(readxl)
EDA <- read_excel(path = "C:/For_EDA_dataset 1.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Column types and first values of every variable.
str(object = EDA)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

Thống kê mô tả biến

  • Purpose: mục đích của căn hộ.

Bảng tần số

# Frequency table of the listing purpose, computed once and reused below.
z <- table(EDA$purpose)
z
## 
## For Rent For Sale 
##    43183   110247
# Relative frequencies: each count divided by the total number of listings.
prop.table(z)
## 
## For Rent For Sale 
##   0.2815   0.7185

Đồ thị

# Bar chart of the number of listings per purpose.
# Plot straight from EDA and map the column by its bare name: referencing
# `EDA$purpose` inside aes() bypasses the data given to the plot, and storing a
# helper data frame in `F` would shadow R's shorthand for FALSE.
ggplot(EDA, aes(x = purpose)) +
  geom_bar() +
  ylab("Số nhà") +
  xlab("Mục đích")

# Bar chart of listing purpose with each bar labelled by its share of all rows.
ggplot(data = EDA, mapping = aes(x = purpose)) +
  geom_bar(fill = "white", color = "black") +
  # after_stat(count) is the per-bar count computed by the "count" stat;
  # dividing by the sum turns it into a proportion, printed as a percentage.
  geom_text(
    mapping = aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    vjust = -0.5,
    color = "black"
  ) +
  ylab("Number") +
  xlab("purpose")

Ước lượng tỷ lệ

Ước lượng tỷ lệ số nhà có mục đích để bán với p=0.72

# One-sample proportion test: share of "For Sale" listings against p = 0.72.
# Matches are counted directly instead of subsetting the whole table, and
# na.rm = TRUE keeps a missing purpose from being counted as a match.
n_sale <- sum(EDA$purpose == "For Sale", na.rm = TRUE)
n_total <- nrow(EDA)
prop.test(n_sale, n_total, p = 0.72)
## 
##  1-sample proportions test with continuity correction
## 
## data:  n_sale out of n_total, null probability 0.72
## X-squared = 1.6, df = 1, p-value = 0.2
## alternative hypothesis: true p is not equal to 0.72
## 95 percent confidence interval:
##  0.7163 0.7208
## sample estimates:
##      p 
## 0.7185
  • Khoảng ước lượng với mức ý nghĩa 5% (độ tin cậy 95%) là từ 0.7163 đến 0.7208

Ước lượng tỷ lệ số nhà có mục đích để cho thuê với p=0.28

# One-sample proportion test: share of "For Rent" listings against p = 0.28.
# Matches are counted directly instead of subsetting the whole table, and
# na.rm = TRUE keeps a missing purpose from being counted as a match.
n_rent <- sum(EDA$purpose == "For Rent", na.rm = TRUE)
n_total <- nrow(EDA)
prop.test(n_rent, n_total, p = 0.28)
## 
##  1-sample proportions test with continuity correction
## 
## data:  n_rent out of n_total, null probability 0.28
## X-squared = 1.6, df = 1, p-value = 0.2
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792 0.2837
## sample estimates:
##      p 
## 0.2815
  • Khoảng ước lượng với mức ý nghĩa 5% (độ tin cậy 95%) là từ 0.2792 đến 0.2837

Mô hình hồi quy

Ước lượng hàm hồi qui logit

# Binary regression of listing purpose on price, size and property type,
# using the logit link.
# NOTE(review): the warnings below show the fit hit the iteration cap without
# converging, and fitted probabilities reached 0 or 1. The estimates printed by
# summary() should not be interpreted as they stand. Presumably the raw scale
# of `price` and/or separation in the data is the cause — TODO confirm, e.g. by
# rescaling the numeric predictors and refitting.
mh1 <- glm(
  formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla +
    property_type,
  family = binomial(link = "logit"),
  data = EDA
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance statistics of the logit fit.
summary(object = mh1)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla + 
##     property_type, family = binomial(link = "logit"), data = EDA)
## 
## Coefficients:
##                             Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)                 4.80e+13   2.73e+06  1.76e+07   <2e-16 ***
## price                       6.58e+07   5.08e-03  1.29e+10   <2e-16 ***
## baths                      -4.01e+13   9.16e+04 -4.38e+08   <2e-16 ***
## bedrooms                    6.75e+13   1.22e+05  5.54e+08   <2e-16 ***
## Area_in_Marla              -2.98e+12   1.84e+03 -1.62e+09   <2e-16 ***
## property_typeFlat           1.56e+15   2.75e+06  5.65e+08   <2e-16 ***
## property_typeHouse         -5.18e+14   2.74e+06 -1.89e+08   <2e-16 ***
## property_typeLower Portion -3.93e+14   2.83e+06 -1.39e+08   <2e-16 ***
## property_typePenthouse     -4.68e+14   4.39e+06 -1.07e+08   <2e-16 ***
## property_typeRoom          -1.28e+15   3.77e+06 -3.38e+08   <2e-16 ***
## property_typeUpper Portion -2.95e+14   2.80e+06 -1.06e+08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25
# Brier score of the logit fit.
BrierScore(mh1)
## [1] 0.1253

Ước lượng hàm hồi qui probit

# Same specification as the other binary models, with the probit link.
# NOTE(review): the warnings below show the fit hit the iteration cap without
# converging, and fitted probabilities reached 0 or 1. The estimates printed by
# summary() should not be interpreted as they stand. Presumably the raw scale
# of `price` and/or separation in the data is the cause — TODO confirm, e.g. by
# rescaling the numeric predictors and refitting.
mh2 <- glm(
  formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla +
    property_type,
  family = binomial(link = "probit"),
  data = EDA
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance statistics of the probit fit.
summary(object = mh2)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla + 
##     property_type, family = binomial(link = "probit"), data = EDA)
## 
## Coefficients:
##                             Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)                 7.01e+14   2.73e+06  2.57e+08   <2e-16 ***
## price                       5.40e+07   5.08e-03  1.06e+10   <2e-16 ***
## baths                      -5.22e+13   9.16e+04 -5.70e+08   <2e-16 ***
## bedrooms                   -1.22e+14   1.22e+05 -1.00e+09   <2e-16 ***
## Area_in_Marla              -2.86e+12   1.84e+03 -1.55e+09   <2e-16 ***
## property_typeFlat           1.82e+14   2.75e+06  6.61e+07   <2e-16 ***
## property_typeHouse          3.53e+14   2.74e+06  1.29e+08   <2e-16 ***
## property_typeLower Portion -4.69e+14   2.83e+06 -1.66e+08   <2e-16 ***
## property_typePenthouse     -9.52e+14   4.39e+06 -2.17e+08   <2e-16 ***
## property_typeRoom          -3.87e+15   3.77e+06 -1.03e+09   <2e-16 ***
## property_typeUpper Portion -3.31e+14   2.80e+06 -1.18e+08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1961424  on 153419  degrees of freedom
## AIC: 1961446
## 
## Number of Fisher Scoring iterations: 25
# Brier score of the probit fit.
BrierScore(mh2)
## [1] 0.1773

Ước lượng hàm hồi qui cloglog

# Same specification as the other binary models, with the complementary
# log-log (cloglog) link.
# NOTE(review): the warnings below show the fit hit the iteration cap without
# converging, and fitted probabilities reached 0 or 1. The estimates printed by
# summary() should not be interpreted as they stand. Presumably the raw scale
# of `price` and/or separation in the data is the cause — TODO confirm, e.g. by
# rescaling the numeric predictors and refitting.
mh3 <- glm(
  formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla +
    property_type,
  family = binomial(link = "cloglog"),
  data = EDA
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Level order of the response: the first level is the reference outcome, so
# the models describe the probability of the second level.
levels(factor(EDA$purpose))
## [1] "For Rent" "For Sale"
# Coefficient table and deviance statistics of the cloglog fit.
summary(object = mh3)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla + 
##     property_type, family = binomial(link = "cloglog"), data = EDA)
## 
## Coefficients:
##                             Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)                 1.01e+14   2.73e+06  3.71e+07   <2e-16 ***
## price                       4.23e+07   5.08e-03  8.31e+09   <2e-16 ***
## baths                      -2.89e+13   9.16e+04 -3.16e+08   <2e-16 ***
## bedrooms                   -4.27e+13   1.22e+05 -3.50e+08   <2e-16 ***
## Area_in_Marla              -1.87e+12   1.84e+03 -1.02e+09   <2e-16 ***
## property_typeFlat           3.19e+14   2.75e+06  1.16e+08   <2e-16 ***
## property_typeHouse         -9.16e+13   2.74e+06 -3.34e+07   <2e-16 ***
## property_typeLower Portion -6.76e+14   2.83e+06 -2.39e+08   <2e-16 ***
## property_typePenthouse      4.04e+14   4.39e+06  9.21e+07   <2e-16 ***
## property_typeRoom          -2.25e+14   3.77e+06 -5.97e+07   <2e-16 ***
## property_typeUpper Portion -1.77e+15   2.80e+06 -6.35e+08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25

Đánh giá mô hình

Tiêu chí AIC

# AIC of the logit (mh1), probit (mh2) and cloglog (mh3) fits, side by side.
aic1 <- AIC(mh1)
aic2 <- AIC(mh2)
aic3 <- AIC(mh3)
# Store the comparison under its own name so the object does not mask the
# AIC() function in the workspace.
aic_tbl <- cbind(aic1, aic2, aic3)
aic_tbl
##         aic1    aic2    aic3
## [1,] 1385973 1961446 1345964

Tiêu chí Deviance

# Residual deviance of the logit (mh1), probit (mh2) and cloglog (mh3) fits.
de1 <- deviance(mh1)
de2 <- deviance(mh2)
de3 <- deviance(mh3)
# Store the comparison under its own name so the object does not mask the
# deviance() function in the workspace.
deviance_tbl <- cbind(de1, de2, de3)
deviance_tbl
##          de1     de2     de3
## [1,] 1385951 1961424 1345942

Tiêu chí Brier Score

# Brier score of the logit (mh1), probit (mh2) and cloglog (mh3) fits.
bs1 <- BrierScore(mh1)
bs2 <- BrierScore(mh2)
bs3 <- BrierScore(mh3)
# Store the comparison under its own name so the object does not mask the
# BrierScore() function in the workspace.
brier_tbl <- cbind(bs1, bs2, bs3)
brier_tbl
##         bs1    bs2    bs3
## [1,] 0.1253 0.1773 0.1217

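-> Giá trị của 3 tiêu chí trên càng nhỏ nghĩa là mô hình càng tốt. Mô hình cloglog (mh3) có AIC, deviance và Brier Score nhỏ nhất, còn mô hình probit (mh2) có cả ba giá trị lớn nhất; vì vậy theo các tiêu chí này mô hình nên được lựa chọn là cloglog chứ không phải probit. Bảng kết quả của mô hình cloglog đã được trình bày ở phần trên; bảng dưới đây là của mô hình probit.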

# Coefficient summary of the probit fit (mh2), repeated from the estimation
# section.
# NOTE(review): in the AIC, deviance and Brier score tables above, mh2 has the
# largest value on all three criteria and mh3 the smallest — confirm which
# model is meant to be reported here.
summary(mh2)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + Area_in_Marla + 
##     property_type, family = binomial(link = "probit"), data = EDA)
## 
## Coefficients:
##                             Estimate Std. Error   z value Pr(>|z|)    
## (Intercept)                 7.01e+14   2.73e+06  2.57e+08   <2e-16 ***
## price                       5.40e+07   5.08e-03  1.06e+10   <2e-16 ***
## baths                      -5.22e+13   9.16e+04 -5.70e+08   <2e-16 ***
## bedrooms                   -1.22e+14   1.22e+05 -1.00e+09   <2e-16 ***
## Area_in_Marla              -2.86e+12   1.84e+03 -1.55e+09   <2e-16 ***
## property_typeFlat           1.82e+14   2.75e+06  6.61e+07   <2e-16 ***
## property_typeHouse          3.53e+14   2.74e+06  1.29e+08   <2e-16 ***
## property_typeLower Portion -4.69e+14   2.83e+06 -1.66e+08   <2e-16 ***
## property_typePenthouse     -9.52e+14   4.39e+06 -2.17e+08   <2e-16 ***
## property_typeRoom          -3.87e+15   3.77e+06 -1.03e+09   <2e-16 ***
## property_typeUpper Portion -3.31e+14   2.80e+06 -1.18e+08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1961424  on 153419  degrees of freedom
## AIC: 1961446
## 
## Number of Fisher Scoring iterations: 25

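Kết quả cho thấy tất cả các biến độc lập đều có ý nghĩa thống kê với p-value<0.05. Như vậy các biến độc lập đều ảnh hưởng đến biến phụ thuộc. Tuy nhiên, cần lưu ý rằng thuật toán ước lượng không hội tụ (cảnh báo "glm.fit: algorithm did not converge") và residual deviance (1961424) lớn hơn null deviance (182373), nên các hệ số và p-value này chưa đáng tin cậy; kết luận trên cần được kiểm tra lại sau khi mô hình được ước lượng hội tụ.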