library(readxl)
## Warning: package 'readxl' was built under R version 4.3.1
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
library(gtsummary)
## Warning: package 'gtsummary' was built under R version 4.3.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(epitools)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'tibble' was built under R version 4.3.1
## Warning: package 'tidyr' was built under R version 4.3.1
## Warning: package 'readr' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## Warning: package 'dplyr' was built under R version 4.3.1
## Warning: package 'stringr' was built under R version 4.3.1
## Warning: package 'forcats' was built under R version 4.3.1
## Warning: package 'lubridate' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Dữ liệu

# Let the user pick the Excel workbook interactively, then read it into `data`.
# NOTE(review): the reader warning below reports a date in a numeric cell
# (G3081), and the latitude/longitude magnitudes printed by str() vary by many
# orders of magnitude -- the coordinate columns look corrupted; confirm against
# the workbook before using them.
a <- file.choose()
data <- read_xlsx(path = a)
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Compact overview of every column: type, length and leading values.
str(data)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

Thống kê mô tả các biến

# Number of listings for each purpose (rent vs. sale).
pur1 <- table(data[["purpose"]])
print(pur1)
## 
## For Rent For Sale 
##    43183   110247
# The same counts expressed as shares of all listings.
pur1a <- prop.table(pur1)
print(pur1a)
## 
##  For Rent  For Sale 
## 0.2814508 0.7185492
# Counts once more, with the grand total appended as a `Sum` entry.
addmargins(pur1)
## 
## For Rent For Sale      Sum 
##    43183   110247   153430
library(ggplot2)
# Bar chart of listing counts by purpose; each bar is labelled with its share
# of all listings. Columns are referenced by bare name inside aes() so ggplot2
# resolves them from the piped data frame rather than from the global `data`
# object (which is what triggered the "Use of `data$purpose` is discouraged"
# warning).
data |>
  ggplot(aes(x = purpose, y = after_stat(count))) +
  geom_bar(fill = "blue") +
  geom_text(
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    color = "black",
    vjust = -0.5
  ) +
  theme_classic() +
  labs(x = "Purpose", y = "Frequency")

Ước lượng tỷ lệ

# One-sample test of whether the share of rental listings equals 28%.
# Rows are picked with which() so that a missing `purpose` cannot come back as
# an all-NA row and be counted as a rental (logical indexing with NA does that).
m <- data[which(data$purpose == "For Rent"), ]
n_rent <- nrow(m)
# Listings whose purpose is unknown are left out of the denominator as well.
n_total <- sum(!is.na(data$purpose))
prop.test(n_rent, n_total, p = 0.28)
## 
##  1-sample proportions test with continuity correction
## 
## data:  n_rent out of n_total, null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Mô hình hồi quy

Hồi quy logit

# Logit model for listing purpose. With a factor response, glm() treats the
# first level ("For Rent") as failure, so the fitted probability is that of
# "For Sale". Predictors are written as bare column names so that `data = data`
# supplies them and predict(mh1, newdata = ...) actually uses the new data.
# NOTE(review): the warnings below (no convergence, fitted probabilities of
# 0 or 1), together with a residual deviance far above the null deviance in the
# summary, mean these estimates are not interpretable as they stand. Rescale or
# transform `price` and check for separation before drawing conclusions.
mh1 <- glm(
  formula = factor(purpose) ~
    property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "logit"),
  data = data
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh1)
## 
## Call:
## glm(formula = factor(data$purpose) ~ data$property_type + data$price + 
##     data$baths + data$bedrooms + data$Area_in_Marla, family = binomial(link = "logit"), 
##     data = data)
## 
## Coefficients:
##                                   Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                      4.798e+13  2.733e+06  1.756e+07   <2e-16 ***
## data$property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16 ***
## data$property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16 ***
## data$property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16 ***
## data$property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16 ***
## data$property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16 ***
## data$property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16 ***
## data$price                       6.576e+07  5.084e-03  1.294e+10   <2e-16 ***
## data$baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16 ***
## data$bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16 ***
## data$Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25

Hồi quy probit

# Probit model for listing purpose. With a factor response, glm() treats the
# first level ("For Rent") as failure, so the fitted probability is that of
# "For Sale". Predictors are written as bare column names so that `data = data`
# supplies them and predict(mh2, newdata = ...) actually uses the new data.
# NOTE(review): the warnings below (no convergence, fitted probabilities of
# 0 or 1), together with a residual deviance far above the null deviance in the
# summary, mean these estimates are not interpretable as they stand. Rescale or
# transform `price` and check for separation before drawing conclusions.
mh2 <- glm(
  formula = factor(purpose) ~
    property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "probit"),
  data = data
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh2)
## 
## Call:
## glm(formula = factor(data$purpose) ~ data$property_type + data$price + 
##     data$baths + data$bedrooms + data$Area_in_Marla, family = binomial(link = "probit"), 
##     data = data)
## 
## Coefficients:
##                                   Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                      6.968e+14  2.733e+06  2.550e+08   <2e-16 ***
## data$property_typeFlat           1.917e+14  2.752e+06  6.964e+07   <2e-16 ***
## data$property_typeHouse          3.660e+14  2.739e+06  1.336e+08   <2e-16 ***
## data$property_typeLower Portion -4.624e+14  2.827e+06 -1.636e+08   <2e-16 ***
## data$property_typePenthouse     -9.449e+14  4.388e+06 -2.153e+08   <2e-16 ***
## data$property_typeRoom          -1.099e+15  3.774e+06 -2.911e+08   <2e-16 ***
## data$property_typeUpper Portion -3.249e+14  2.795e+06 -1.162e+08   <2e-16 ***
## data$price                       5.401e+07  5.084e-03  1.062e+10   <2e-16 ***
## data$baths                      -5.222e+13  9.157e+04 -5.702e+08   <2e-16 ***
## data$bedrooms                   -1.233e+14  1.220e+05 -1.011e+09   <2e-16 ***
## data$Area_in_Marla              -2.857e+12  1.843e+03 -1.551e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1959910  on 153419  degrees of freedom
## AIC: 1959932
## 
## Number of Fisher Scoring iterations: 25

Hồi quy cloglog

# Complementary log-log model for listing purpose. With a factor response,
# glm() treats the first level ("For Rent") as failure, so the fitted
# probability is that of "For Sale". Predictors are written as bare column
# names so that `data = data` supplies them and predict(mh3, newdata = ...)
# actually uses the new data.
# NOTE(review): the warnings below (no convergence, fitted probabilities of
# 0 or 1), together with a residual deviance far above the null deviance in the
# summary, mean these estimates are not interpretable as they stand. Rescale or
# transform `price` and check for separation before drawing conclusions.
mh3 <- glm(
  formula = factor(purpose) ~
    property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "cloglog"),
  data = data
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh3)
## 
## Call:
## glm(formula = factor(data$purpose) ~ data$property_type + data$price + 
##     data$baths + data$bedrooms + data$Area_in_Marla, family = binomial(link = "cloglog"), 
##     data = data)
## 
## Coefficients:
##                                   Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                      1.014e+14  2.733e+06  3.711e+07   <2e-16 ***
## data$property_typeFlat           3.185e+14  2.752e+06  1.157e+08   <2e-16 ***
## data$property_typeHouse         -9.156e+13  2.739e+06 -3.342e+07   <2e-16 ***
## data$property_typeLower Portion -6.761e+14  2.827e+06 -2.392e+08   <2e-16 ***
## data$property_typePenthouse      4.042e+14  4.388e+06  9.213e+07   <2e-16 ***
## data$property_typeRoom          -2.253e+14  3.774e+06 -5.971e+07   <2e-16 ***
## data$property_typeUpper Portion -1.774e+15  2.795e+06 -6.346e+08   <2e-16 ***
## data$price                       4.226e+07  5.084e-03  8.311e+09   <2e-16 ***
## data$baths                      -2.894e+13  9.157e+04 -3.160e+08   <2e-16 ***
## data$bedrooms                   -4.268e+13  1.220e+05 -3.498e+08   <2e-16 ***
## data$Area_in_Marla              -1.871e+12  1.843e+03 -1.015e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25