# ---- Package setup ----
# Pin the CRAN mirror so installation never prompts for one.
options(repos = c(CRAN = "https://cran.r-project.org"))

# Packages attached by this analysis, listed in attach order.
required_pkgs <- c(
  "readxl", "tidyselect", "epitools", "DescTools", "dplyr",
  "ggplot2", "caTools", "caret", "fBasics", "lmtest"
)

# Install only the packages that are missing. Reinstalling all of them on
# every run is slow, needs network access, and can fail on Windows when a
# package is already loaded in the session.
is_installed <- vapply(
  required_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
missing_pkgs <- required_pkgs[!is_installed]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}

# Attach in the original order; the order determines which functions mask
# which:
#   * dplyr masks stats::filter/lag and base::intersect/setdiff/setequal/union
#   * caret (which also loads lattice) masks DescTools::MAE and DescTools::RMSE
#   * zoo (loaded by lmtest) masks base::as.Date and base::as.Date.numeric
library(readxl)
library(tidyselect)
library(epitools)
library(DescTools)
library(dplyr)
library(ggplot2)
library(caTools)
library(caret)
library(fBasics)
library(lmtest)

1 Giới thiệu

Dữ liệu bao gồm 153430 quan sát và 15 biến:

Property type: là các loại tài sản. Trong phần này, chúng ta có 7 loại khác nhau: House, FarmHouse, Upper Portion, Lower Portion, Flat, Room, Penthouse
Price: là giá của các loại tài sản
Location: về các loại vị trí khác nhau trong mỗi thành phố.
City: thành phố. Trong bộ dữ liệu này có 5 thành phố: Lahore, Karachi, Faisalabad, Rawalpindi, Islamabad
Province_name: tên tỉnh
Latitude: vĩ độ của vị trí tài sản
Longitude: kinh độ của vị trí tài sản
Baths: số phòng tắm
Purpose: mục đích của căn hộ
Bedrooms: số phòng ngủ
Date_added: Ngày được thêm vào
Agency: hãng
Agent: đại lý
Area in Marla: diện tích của tài sản tính theo đơn vị Marla
...1: biến số thứ tự của quan sát (bắt đầu từ 0)

# Read the property listings workbook into a tibble.
# NOTE(review): the path is specific to the author's machine; adjust it when
# running elsewhere.
d <- read_excel("D:/EDA.xlsx")

## Warning: Expecting numeric in G3081 / R3081C7: got a date

## New names:
## • `` -> `...1`

# The warning above points at column 7 (latitude, per str() output) in
# spreadsheet row 3081: that cell is date-formatted, so its numeric value should
# not be trusted. The unnamed first column is auto-named `...1`.

# View() opens a data viewer window, which only makes sense in an interactive
# session; skip it when the script is knitted or run via Rscript.
if (interactive()) {
  View(d)
}
str(d)

## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

2 Thống kê mô tả biến phụ thuộc

# Frequency of each listing purpose.
table(d$purpose)

## 
## For Rent For Sale 
##    43183   110247

# Keep the rental listings; their count is the numerator of the proportion
# test that follows. Then report the total number of listings.
a <- d[d$purpose == "For Rent", ]
nrow(d)

## [1] 153430

# Bar chart of listing counts per purpose, each bar labelled with its share of
# all listings.
ggplot(data = d, mapping = aes(x = purpose)) +
  geom_bar(colour = "lightblue", fill = "lavender") +
  geom_text(
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    colour = "black",
    vjust = -0.5
  ) +
  labs(x = "purpose", y = "Number of property")

3 Ước lượng tỷ lệ

Kiểm định xem tỷ lệ căn hộ với mục đích cho thuê có bằng 28% hay không.

Giả thuyết: \(H_0: p = 0.28\); đối thuyết: \(H_1: p \neq 0.28\)

# One-sample proportion test of H0: p = 0.28, where p is the share of
# "For Rent" listings (rows of `a`) among all listings (rows of `d`).
prop.test(
  x = length(a$purpose),
  n = length(d$purpose),
  p = 0.28
)

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(a$purpose) out of length(d$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

p-value = 0,2066 > 5% nên chưa đủ cơ sở bác bỏ giả thuyết \(H_0\). Do đó, với mức ý nghĩa 5%, có thể xem tỷ lệ số căn hộ với mục đích cho thuê là 28%.

Khoảng ước lượng tỷ lệ số căn hộ với mục đích cho thuê với độ tin cậy 95% là (0,2792029; 0,2837097)

4 Hồi quy logistic

4.1 Mô hình logit

# Encode the response and the property type as factors. The response levels
# are set explicitly so the modelled event does not depend on default
# (alphabetical) level ordering: with a two-level factor response, glm()
# treats the first level ("For Rent") as failure and models
# P(purpose == "For Sale").
d$purpose <- factor(d$purpose, levels = c("For Rent", "For Sale"))
d$property_type <- as.factor(d$property_type)

# Logistic regression of listing purpose on price, room counts and property
# type. `purpose` is already a factor, so it is used directly in the formula,
# matching the probit and cloglog models.
logit <- glm(
  formula = purpose ~ price + baths + bedrooms + property_type,
  family = binomial(link = "logit"),
  data = d
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# NOTE(review): the warning above means some observations are fitted with
# probability ~0 or ~1 (near-complete separation, presumably driven by price),
# so coefficient standard errors should be read with caution.
summary(logit)

## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "logit"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -3.294e+00  5.512e-01  -5.977 2.28e-09 ***
## price                       5.445e-06  1.180e-07  46.142  < 2e-16 ***
## baths                      -9.078e-02  5.575e-02  -1.628   0.1035    
## bedrooms                   -3.956e-01  6.054e-02  -6.535 6.36e-11 ***
## property_typeFlat          -1.257e-01  5.520e-01  -0.228   0.8199    
## property_typeHouse         -2.907e+00  5.586e-01  -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00  6.158e-01  -4.194 2.74e-05 ***
## property_typePenthouse     -1.377e+00  1.050e+00  -1.311   0.1898    
## property_typeRoom          -1.881e+00  7.841e-01  -2.399   0.0164 *  
## property_typeUpper Portion -2.489e+00  5.946e-01  -4.186 2.84e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3259.2  on 153420  degrees of freedom
## AIC: 3279.2
## 
## Number of Fisher Scoring iterations: 16

4.2 Mô hình probit

# Probit regression of listing purpose on price, room counts and property type.
probit <- glm(
  formula = purpose ~ price + baths + bedrooms + property_type,
  family = binomial(link = "probit"),
  data = d
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Coefficient table, deviances and AIC for the probit fit.
summary(probit)

## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "probit"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -1.741e+00  2.655e-01  -6.560 5.39e-11 ***
## price                       2.429e-06  3.865e-08  62.841  < 2e-16 ***
## baths                      -1.252e-02  2.169e-02  -0.577 0.563782    
## bedrooms                   -1.534e-01  2.336e-02  -6.569 5.08e-11 ***
## property_typeFlat          -2.094e-01  2.656e-01  -0.788 0.430532    
## property_typeHouse         -1.146e+00  2.673e-01  -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00  2.784e-01  -3.653 0.000259 ***
## property_typePenthouse     -6.143e-01  4.314e-01  -1.424 0.154458    
## property_typeRoom          -8.047e-01  3.317e-01  -2.426 0.015266 *  
## property_typeUpper Portion -9.890e-01  2.738e-01  -3.612 0.000304 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   3585.2  on 153420  degrees of freedom
## AIC: 3605.2
## 
## Number of Fisher Scoring iterations: 19

4.3 Mô hình cloglog

# Complementary log-log regression of listing purpose on price, room counts
# and property type.
#
# With glm()'s default limit of 25 Fisher scoring iterations this fit stopped
# with "glm.fit: algorithm did not converge", so its coefficients, deviance and
# AIC were not final estimates and should not be compared against the other
# links. Raise the iteration limit so the fit is allowed to converge.
cloglog <- glm(
  formula = purpose ~ price + baths + bedrooms + property_type,
  family = binomial(link = "cloglog"),
  data = d,
  control = glm.control(maxit = 100)
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# NOTE(review): the printed summary after this chunk, and the cloglog AIC,
# deviance, Brier score and confusion matrix quoted later, came from the
# 25-iteration run; re-knit the report to refresh them.
summary(cloglog)

## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type, 
##     family = binomial(link = "cloglog"), data = d)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -6.844e+00  5.164e-01 -13.254  < 2e-16 ***
## price                       4.097e-06  7.441e-08  55.061  < 2e-16 ***
## baths                       6.728e-01  4.637e-02  14.510  < 2e-16 ***
## bedrooms                   -1.023e+00  4.850e-02 -21.091  < 2e-16 ***
## property_typeFlat           3.642e+00  5.184e-01   7.024 2.15e-12 ***
## property_typeHouse          1.516e+00  5.203e-01   2.914 0.003573 ** 
## property_typeLower Portion -6.547e-01  5.970e-01  -1.096 0.272863    
## property_typePenthouse      2.849e+00  7.596e-01   3.750 0.000177 ***
## property_typeRoom           2.127e+00  6.693e-01   3.178 0.001483 ** 
## property_typeUpper Portion  1.471e+00  5.416e-01   2.716 0.006599 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 182372.5  on 153429  degrees of freedom
## Residual deviance:   4505.8  on 153420  degrees of freedom
## AIC: 4525.8
## 
## Number of Fisher Scoring iterations: 25

5 Lựa chọn mô hình

5.1 Tiêu chí AIC

AIC(logit) = 3279,2

AIC(probit) = 3605,2

AIC(cloglog) = 4525,8

Mô hình logit có AIC nhỏ nhất nên ta chọn mô hình logit

5.2 Deviance

Deviance(logit) = 3259,2

Deviance(probit) = 3585,2

Deviance(cloglog) = 4505,8

Mô hình logit có deviance nhỏ nhất nên ta chọn mô hình logit

5.3 Brier Score

# Brier score of each candidate model; the comparison that follows prefers the
# smallest value. Calls are namespace-qualified to make their origin explicit.
DescTools::BrierScore(logit)

## [1] 0.00189995

DescTools::BrierScore(probit)

## [1] 0.002125317

DescTools::BrierScore(cloglog)

## [1] 0.002385188

Dựa vào tiêu chí Brier score ta thấy mô hình logit có giá trị nhỏ nhất nên ta chọn logit

5.4 Ma trận nhầm lẫn

5.4.1 Mô hình logit

# In-sample confusion matrix for the logit model.
# Fitted probabilities above 0.5 are classified as "1", otherwise "0".
prob_hat <- predict(logit, newdata = d, type = "response")
predictions1 <- factor(ifelse(prob_hat > 0.5, "1", "0"), levels = c("0", "1"))
# Recode the observed purpose with the same labels: first factor level -> "0",
# second -> "1".
actual <- factor(d$purpose, labels = c("0", "1"))
conf_tab <- table(predictions1, actual)
confusionMatrix(conf_tab)

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43116    313
##            1     67 109934
##                                           
##                Accuracy : 0.9975          
##                  95% CI : (0.9973, 0.9978)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9939          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9984          
##             Specificity : 0.9972          
##          Pos Pred Value : 0.9928          
##          Neg Pred Value : 0.9994          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9978          
##                                           
##        'Positive' Class : 0               
##

Mô hình logit có độ chính xác toàn thể là 99,75%, độ nhạy là 99,84% và độ hiệu quả là 99,72%

5.4.2 Mô hình probit

# In-sample confusion matrix for the probit model.
# Fitted probabilities above 0.5 are classified as "1", otherwise "0".
prob_hat <- predict(probit, newdata = d, type = "response")
predictions1 <- factor(ifelse(prob_hat > 0.5, "1", "0"), levels = c("0", "1"))
# Recode the observed purpose with the same labels: first factor level -> "0",
# second -> "1".
actual <- factor(d$purpose, labels = c("0", "1"))
conf_tab <- table(predictions1, actual)
confusionMatrix(conf_tab)

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43111    322
##            1     72 109925
##                                           
##                Accuracy : 0.9974          
##                  95% CI : (0.9972, 0.9977)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9937          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9983          
##             Specificity : 0.9971          
##          Pos Pred Value : 0.9926          
##          Neg Pred Value : 0.9993          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2810          
##    Detection Prevalence : 0.2831          
##       Balanced Accuracy : 0.9977          
##                                           
##        'Positive' Class : 0               
##

Mô hình probit có độ chính xác toàn thể là 99,74%, độ nhạy là 99,83% và độ hiệu quả là 99,71%

5.4.3 Mô hình cloglog

# In-sample confusion matrix for the cloglog model.
# Fitted probabilities above 0.5 are classified as "1", otherwise "0".
prob_hat <- predict(cloglog, newdata = d, type = "response")
predictions1 <- factor(ifelse(prob_hat > 0.5, "1", "0"), levels = c("0", "1"))
# Recode the observed purpose with the same labels: first factor level -> "0",
# second -> "1".
actual <- factor(d$purpose, labels = c("0", "1"))
conf_tab <- table(predictions1, actual)
confusionMatrix(conf_tab)

## Confusion Matrix and Statistics
## 
##             actual
## predictions1      0      1
##            0  43089    365
##            1     94 109882
##                                           
##                Accuracy : 0.997           
##                  95% CI : (0.9967, 0.9973)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9926          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9978          
##             Specificity : 0.9967          
##          Pos Pred Value : 0.9916          
##          Neg Pred Value : 0.9991          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2808          
##    Detection Prevalence : 0.2832          
##       Balanced Accuracy : 0.9973          
##                                           
##        'Positive' Class : 0               
##

Mô hình cloglog có độ chính xác toàn thể là 99,7%, độ nhạy là 99,78% và độ hiệu quả là 99,67%

Dựa vào 4 tiêu chuẩn trên ta thấy mô hình logit là mô hình được chọn. Do đó Mô hình logit là mô hình phù hợp nhất trong 3 mô hình.

PTDL

Nguyễn Minh Dạ Mẫn

2023-07-26