options(repos = c(CRAN = "https://cran.r-project.org"))
install.packages("readxl")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.1
install.packages("tidyselect")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyselect' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(tidyselect)
## Warning: package 'tidyselect' was built under R version 4.3.1
install.packages("epitools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'epitools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(epitools)
install.packages("DescTools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'DescTools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
install.packages("dplyr")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
install.packages("ggplot2")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
install.packages("caTools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caTools' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.1
install.packages("caret")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
install.packages("fBasics")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'fBasics' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(fBasics)
## Warning: package 'fBasics' was built under R version 4.3.1
install.packages("lmtest")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'lmtest' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

1 Giới thiệu

Dữ liệu bao gồm 153430 quan sát và 15 biến:

  • Property type: là các loại tài sản. Trong phần này, chúng ta có 6 loại khác nhau: House, FarmHouse, Upper Portion, Lower Portion, Flat, Room

  • Price: là giá của các loại tài sản

  • Location: về các loại vị trí khác nhau trong mỗi thành phố.

  • City: thành phố. Trong bộ dữ liệu này có 5 thành phố:Lahore, Karachi, Faisalabad, Rawalpindi, Islamabad

  • Province_name: tên tỉnh

  • Latitude: chiều rộng của căn nhà

  • Longitde: Chiều dài của căn nhà

  • Baths: số phòng tắm

  • Purpose: mục đích của căn hộ

  • Bedrooms: số phòng ngủ

  • Date_added: Ngày được thêm vào

  • Agency: hãng

  • Agent: đại lý

  • Area in Marla: khu vực ở Marla

  • Biến số thứ tự

d <- read_excel("D:/EDA.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
View(d)
str(d)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

2 Thống kê mô tả biến phụ thuộc

table(d$purpose)
## 
## For Rent For Sale 
##    43183   110247
a<-d[d$purpose == "For Rent",]
length(d$purpose)
## [1] 153430
ggplot(d,aes(purpose))+
  geom_bar(color = "lightblue", fill = "lavender")+
   geom_text(aes(label = scales :: percent(after_stat(count/sum(count)))), stat=  'count', color = 'black', vjust = -.5)+
  ylab("Number of property")+ xlab("purpose")

3 Ước luợng tỷ lệ

Ước lượng tỷ lệ căn hộ với mục đích cho thuê có phải là 28% hay không

Giả thuyết” \(H_0: p = 0.28\)

prop.test(length(a$purpose),length(d$purpose),p= 0.28)
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$purpose) out of length(d$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

p-value > 5%, chưa đủ cơ sở bác bỏ giả thuyết \(H_0\). DO đó tỷ lệ số căn hộ với mục đích cho thuê bằng 28% với mức ý nghĩa 5%.

Khoảng ước lượng tỷ lệ số căn hộ với mục đích bán với độ tin cậy 95% là (0,2792029;0,2837097)

4 Hồi quy logistic

4.1 Mô hình logit

d$purpose<-as.factor(d$purpose)
d$property_type<-as.factor(d$property_type)
logit <- glm(factor (purpose) ~ price + baths + bedrooms + property_type  , family = binomial(link = "logit"), data = d)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "logit"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -3.294e+00  5.512e-01  -5.977 2.28e-09 ***
## price                       5.445e-06  1.180e-07  46.142  < 2e-16 ***
## baths                      -9.078e-02  5.575e-02  -1.628   0.1035    
## bedrooms                   -3.956e-01  6.054e-02  -6.535 6.36e-11 ***
## property_typeFlat          -1.257e-01  5.520e-01  -0.228   0.8199    
## property_typeHouse         -2.907e+00  5.586e-01  -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00  6.158e-01  -4.194 2.74e-05 ***
## property_typePenthouse     -1.377e+00  1.050e+00  -1.311   0.1898    
## property_typeRoom          -1.881e+00  7.841e-01  -2.399   0.0164 *  
## property_typeUpper Portion -2.489e+00  5.946e-01  -4.186 2.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3259.2  on 153420  degrees of freedom
## AIC: 3279.2
## 
## Number of Fisher Scoring iterations: 16

4.2 Mô hình probit

probit<-glm(formula = purpose~ price + baths +bedrooms  + property_type, family = binomial(link = "probit"), data = d)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(probit)
## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "probit"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

4.3 Mô hình cloglog

cloglog<-glm(formula = purpose~ price + baths +bedrooms  + property_type, family = binomial(link = "cloglog"), data = d)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(cloglog)
## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "cloglog"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## property_typeLower Portion -6.547e-01  5.970e-01  -1.096 0.272863    
## property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006599 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

5 Lựa chọn mô hình

5.1 Tiêu chỉ AIC

AIC(logit) = 3279,2

AIC(probit) = 3605,2

AIC(cloglog) = 4525,8

Mô hình logit có AIC nhỏ nhất nên ta chọn mô hình logit

5.2 Deviance

Deviance(logit) = 3259,2

Deviance(probit) = 3585,2

Deviance(cloglog) = 4505,8

Mô hình logit có deviance nhỏ nhất nên ta chọn mô hình logit

5.3 Brier Score

BrierScore(logit)
## [1] 0.00189995
BrierScore(probit)
## [1] 0.002125317
BrierScore(cloglog)
## [1] 0.002385188

Dựa vào tiêu chí Brier score ta thấy mô hình logit có giá trị nhỏ nhất nên ta chọn logit

5.4 Ma trận nhầm lẫn

5.4.1 Mô hình logit

predictions <- predict(logit, newdata = d, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0") 
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(d$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))
## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43116    313
##            1     67 109934
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9973, 0.9978)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9939          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9984          
##             Specificity : 0.9972          
##          Pos Pred Value : 0.9928          
##          Neg Pred Value : 0.9994          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9978          
##                                           
##        'Positive' Class : 0               
## 

Mô hình logit có độ chính xác toàn thể là 99,84%, độ nhạy là 99,84% và độ hiệu quả là 99,72%

5.4.2 Mô hình probit

predictions <- predict(probit, newdata = d, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0")  
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(d$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))
## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43111    322
##            1     72 109925
##                                           
##                Accuracy : 0.9974          
##                  95% CI : (0.9972, 0.9977)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9937          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9983          
##             Specificity : 0.9971          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9993          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9977          
##                                           
##        'Positive' Class : 0               
## 

Mô hình probit có độ chính xác toàn thể là 99,74%, độ nhạy là 99,83% và độ hiệu quả là 99,71%

5.4.3 Mô hình cloglog

predictions <- predict(cloglog, newdata = d, type = "response")
predicted_classes <- ifelse(predictions > 0.5, "1", "0") 
predictions1<-factor(predicted_classes, levels = c("0","1"))
actual<- factor(d$purpose, labels = c("0","1"))
confusionMatrix(table(predictions1, actual))
## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43089    365
##            1     94 109882
##                                           
##                Accuracy : 0.997           
##                  95% CI : (0.9967, 0.9973)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9926          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9978          
##             Specificity : 0.9967          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9991          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2808          
##    Detection Prevalence : 0.2832          
##       Balanced Accuracy : 0.9973          
##                                           
##        'Positive' Class : 0               
## 

Mô hình cloglog có độ chính xác toàn thể là 99,7%, độ nhạy là 99,78% và độ hiệu quả là 99,67%

Dựa vào 4 tiêu chuẩn trên ta thấy mô hình logit là mô hình được chọn. Do đó Mô hình logit là mô hình phù hợp nhất trong 3 mô hình.