library(readxl)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(DT)
## Warning: package 'DT' was built under R version 4.3.1
# Load the property-listings workbook into `EDA`.
# NOTE(review): the path is absolute and machine-specific (D: drive); the
# script cannot run elsewhere without editing it.
EDA <- read_excel("D:/data/EDA.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# The warning above refers to column 7 of the sheet (latitude), where one cell
# was read as a date; the unnamed first column was renamed to `...1`.
# NOTE(review): latitude/longitude show values such as 3.37e+06 and 3.36e+16
# in the structure dump below, which do not look like degrees -- confirm how
# these columns were exported before using them.
# Print the structure of the loaded table: column names, types, first values.
str(EDA)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

1 Thống kê mô tả biến Purpose

# Count how many listings fall under each value of `purpose` and show the
# resulting frequency table.
kq <- table(EDA[["purpose"]])
print(kq)
## 
## For Rent For Sale 
##    43183   110247

Có 43183 bất động sản cho thuê và 110247 bất động sản để bán.

# Relative frequency of each `purpose` category (count divided by the total
# number of listings). The table is built from `purpose`; the data has no
# column named `CC`, so referencing it only produced numeric(0).
prop.table(table(EDA$purpose))
## 
##  For Rent  For Sale 
## 0.2814508 0.7185492 

Tỷ lệ bất động sản cho thuê chiếm 28,15%, để bán chiếm 71,85%.

# Bar chart of the number of listings per purpose; each bar is labelled with
# its share of all listings, placed just above the bar.
ggplot(data = EDA, mapping = aes(x = purpose)) +
  geom_bar(fill = "pink", color = "black") +
  geom_text(
    mapping = aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    vjust = -0.5,
    color = "black"
  ) +
  labs(x = "purpose", y = "soluong")

2 Ước lượng tỷ lệ

# Keep only the rows whose purpose is 'For Rent'.
ul1 <- EDA[EDA$purpose == 'For Rent',]
# One-sample proportion test: number of 'For Rent' rows out of all rows.
# No `p` is supplied, so the recorded output tests against a null probability
# of 0.5; the 95 percent confidence interval is what the text below reports.
prop.test( length(ul1$purpose), length(EDA$purpose))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(ul1$purpose) out of length(EDA$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Với độ tin cậy 95%, tỷ lệ bất động sản được đăng với mục đích cho thuê nằm trong khoảng từ 27,92029% đến 28,37097%.

# Keep only the rows whose purpose is 'For Sale'.
ul2 <- EDA[EDA$purpose == 'For Sale',]
# One-sample proportion test: number of 'For Sale' rows out of all rows.
# No `p` is supplied, so the recorded output tests against a null probability
# of 0.5; the 95 percent confidence interval is what the text below reports.
prop.test( length(ul2$purpose), length(EDA$purpose))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(ul2$purpose) out of length(EDA$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.7162903 0.7207971
## sample estimates:
##         p 
## 0.7185492

Với độ tin cậy 95%, tỷ lệ bất động sản được đăng với mục đích bán nằm trong khoảng từ 71,62903% đến 72,07971%.

3 Mô hình hồi quy

3.1 Mô hình Logit

# Logit model for the listing purpose.
# `purpose` is converted to a factor; with a factor response, binomial glm
# treats the first level as failure, so with the levels "For Rent" and
# "For Sale" the model describes the probability of "For Sale".
EDA$purpose <- as.factor(EDA$purpose)
# The earlier fit on raw `price` stopped at the default 25 iterations with
# "glm.fit: algorithm did not converge" and "fitted probabilities numerically
# 0 or 1 occurred", giving coefficients around 1e15 and a residual deviance
# above the null deviance. Two changes address that:
#   * price is expressed in millions via I(price / 1e6) -- the same model with
#     a rescaled coefficient, but a better-conditioned design matrix;
#   * the iteration cap is raised through glm.control(maxit = 100).
# NOTE(review): Area_in_Marla is used only in this model, not in the probit
# and cloglog fits -- confirm whether the three specifications should match.
# NOTE(review): if price almost perfectly separates rent from sale listings,
# the 0/1 fitted-probability warning can persist even after these changes.
mhlogit <- glm(
  purpose ~ property_type + I(price / 1e6) + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "logit"),
  data = EDA,
  control = glm.control(maxit = 100)
)
# The summary transcript recorded below comes from the earlier non-converged
# fit; re-run to refresh it.
summary(mhlogit)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "logit"), data = EDA)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 4.798e+13  2.733e+06  1.756e+07   <2e-16 ***
## property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16 ***
## property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16 ***
## property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16 ***
## property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16 ***
## property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16 ***
## property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16 ***
## price                       6.576e+07  5.084e-03  1.294e+10   <2e-16 ***
## baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16 ***
## bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16 ***
## Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25

3.2 Mô hình Probit

# Probit regression of the listing purpose on price, number of baths, number
# of bedrooms and property type, followed by its coefficient summary.
mhprobit <- glm(
  formula = factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "probit"),
  data = EDA
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mhprobit)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "probit"), data = EDA)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

3.3 Mô hình cloglog

# Complementary log-log regression of the listing purpose on price, number of
# baths, number of bedrooms and property type.
# The earlier run stopped at the default cap of 25 Fisher scoring iterations
# with "glm.fit: algorithm did not converge" (and also warned that fitted
# probabilities numerically 0 or 1 occurred), so the iteration limit is raised
# to let the fit finish.
mhcloglog <- glm(
  formula = factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "cloglog"),
  data = EDA,
  control = glm.control(maxit = 100)
)
# The summary transcript recorded below comes from the earlier 25-iteration
# run; re-run to refresh it.
summary(mhcloglog)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "cloglog"), data = EDA)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## property_typeLower Portion -6.547e-01  5.970e-01  -1.096 0.272863    
## property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006599 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

4 Lựa chọn mô hình