# Package setup for the analysis.
# A CRAN mirror is fixed so that installs also work in non-interactive runs.
options(repos = c(CRAN = "https://cran.r-project.org"))

# Install `pkg` from CRAN only when it is not already available, so that
# re-running the script does not download and reinstall every package.
# The `##` lines kept below are the console output recorded when the packages
# were first installed and attached.
install_if_missing <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  invisible(pkg)
}

install_if_missing("readxl")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'readxl' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.1
install_if_missing("tidyselect")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'tidyselect' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(tidyselect)
## Warning: package 'tidyselect' was built under R version 4.3.1
install_if_missing("epitools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'epitools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(epitools)
install_if_missing("DescTools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'DescTools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.3.1
install_if_missing("dplyr")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
install_if_missing("ggplot2")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
install_if_missing("caTools")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caTools' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.1
install_if_missing("caret")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'caret' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(caret)
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
install_if_missing("fBasics")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'fBasics' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(fBasics)
## Warning: package 'fBasics' was built under R version 4.3.1
install_if_missing("lmtest")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.3'
## (as 'lib' is unspecified)
## package 'lmtest' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmpkp9bsP\downloaded_packages
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.1
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
Dữ liệu bao gồm 153430 quan sát và 15 biến:
Property type: là các loại tài sản. Trong bộ dữ liệu này, chúng ta có 7 loại khác nhau: House, FarmHouse, Upper Portion, Lower Portion, Flat, Room, Penthouse
Price: là giá của các loại tài sản
Location: về các loại vị trí khác nhau trong mỗi thành phố.
City: thành phố. Trong bộ dữ liệu này có 5 thành phố:Lahore, Karachi, Faisalabad, Rawalpindi, Islamabad
Province_name: tên tỉnh
Latitude: vĩ độ (tọa độ địa lý) của tài sản
Longitude: kinh độ (tọa độ địa lý) của tài sản
Baths: số phòng tắm
Purpose: mục đích của căn hộ
Bedrooms: số phòng ngủ
Date_added: Ngày được thêm vào
Agency: hãng
Agent: đại lý
Area in Marla: diện tích của tài sản, tính theo đơn vị Marla
Biến số thứ tự (cột chỉ số dòng, không có tên trong tệp dữ liệu)
# Read the listings workbook into `d` (readxl returns a tibble).
# The recorded warning shows that cell G3081 holds a date where a number was
# expected, and that the unnamed first column was auto-named `...1`.
d <- read_excel("D:/EDA.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Open the spreadsheet-style viewer only when a user is at the console;
# in a batch or knit run there is nobody to look at it and the call can fail.
if (interactive()) {
  View(d)
}
# Print the column names, types and first values of the data.
str(d)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
## $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
## $ price : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
## $ longitude : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
## $ baths : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
## $ purpose : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
## $ agency : chr [1:153430] "Self" "Self" "Self" "Self" ...
## $ agent : chr [1:153430] "Self" "Self" "Self" "Self" ...
## $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...
# Number of listings for each purpose.
table(d[["purpose"]])
##
## For Rent For Sale
## 43183 110247
# Keep only the "For Rent" rows in `a`; the proportion test uses its size.
a <- d[d[["purpose"]] == "For Rent", ]
# Total number of listings in the data.
length(d[["purpose"]])
## [1] 153430
# Bar chart of the number of listings per purpose.
# Each bar is labelled with its share of all listings, placed just above it.
ggplot(data = d, mapping = aes(x = purpose)) +
  geom_bar(fill = "lavender", color = "lightblue") +
  geom_text(
    mapping = aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    vjust = -0.5,
    color = "black"
  ) +
  labs(x = "purpose", y = "Number of property")
Kiểm định giả thuyết: tỷ lệ căn hộ với mục đích cho thuê có phải là 28% hay không
Giả thuyết: \(H_0: p = 0.28\), đối thuyết: \(H_1: p \neq 0.28\)
# One-sample proportion test of H0: p = 0.28 for the share of "For Rent"
# listings: successes are the rows of `a`, trials are the rows of `d`.
prop.test(
  x = length(a$purpose),
  n = length(d$purpose),
  p = 0.28
)
##
## 1-sample proportions test with continuity correction
##
## data: length(a$purpose) out of length(d$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
## 0.2792029 0.2837097
## sample estimates:
## p
## 0.2814508
p-value = 0,2066 > 5%, chưa đủ cơ sở bác bỏ giả thuyết \(H_0\). Do đó, với mức ý nghĩa 5%, có thể xem tỷ lệ số căn hộ với mục đích cho thuê bằng 28%.
Khoảng ước lượng tỷ lệ số căn hộ với mục đích cho thuê với độ tin cậy 95% là (0,2792029; 0,2837097)
# Store the response and the property category as factors before modelling.
d[["purpose"]] <- as.factor(d[["purpose"]])
d[["property_type"]] <- as.factor(d[["property_type"]])

# Binomial GLM with a logit link for the listing purpose.
# `purpose` is already a factor at this point, so the factor() wrapper in the
# formula does not change the fit.
logit <- glm(
  factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "logit"),
  data = d
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type,
## family = binomial(link = "logit"), data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.294e+00 5.512e-01 -5.977 2.28e-09 ***
## price 5.445e-06 1.180e-07 46.142 < 2e-16 ***
## baths -9.078e-02 5.575e-02 -1.628 0.1035
## bedrooms -3.956e-01 6.054e-02 -6.535 6.36e-11 ***
## property_typeFlat -1.257e-01 5.520e-01 -0.228 0.8199
## property_typeHouse -2.907e+00 5.586e-01 -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00 6.158e-01 -4.194 2.74e-05 ***
## property_typePenthouse -1.377e+00 1.050e+00 -1.311 0.1898
## property_typeRoom -1.881e+00 7.841e-01 -2.399 0.0164 *
## property_typeUpper Portion -2.489e+00 5.946e-01 -4.186 2.84e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 3259.2 on 153420 degrees of freedom
## AIC: 3279.2
##
## Number of Fisher Scoring iterations: 16
# Same predictors as the logit model, fitted with a probit link.
probit <- glm(
  purpose ~ price + baths + bedrooms + property_type,
  family = binomial(link = "probit"),
  data = d
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(probit)
##
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type,
## family = binomial(link = "probit"), data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.741e+00 2.655e-01 -6.560 5.39e-11 ***
## price 2.429e-06 3.865e-08 62.841 < 2e-16 ***
## baths -1.252e-02 2.169e-02 -0.577 0.563782
## bedrooms -1.534e-01 2.336e-02 -6.569 5.08e-11 ***
## property_typeFlat -2.094e-01 2.656e-01 -0.788 0.430532
## property_typeHouse -1.146e+00 2.673e-01 -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00 2.784e-01 -3.653 0.000259 ***
## property_typePenthouse -6.143e-01 4.314e-01 -1.424 0.154458
## property_typeRoom -8.047e-01 3.317e-01 -2.426 0.015266 *
## property_typeUpper Portion -9.890e-01 2.738e-01 -3.612 0.000304 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 3585.2 on 153420 degrees of freedom
## AIC: 3605.2
##
## Number of Fisher Scoring iterations: 19
# Same predictors as the other two models, fitted with a complementary
# log-log link.
# With the default limit of 25 Fisher-scoring iterations this fit stopped with
# "algorithm did not converge", so the limit is raised to give the optimiser
# room to finish before the model is compared with the others.
# NOTE: the output recorded below comes from the default 25-iteration run;
# re-run the report to refresh it.
cloglog <- glm(
  formula = purpose ~ price + baths + bedrooms + property_type,
  family = binomial(link = "cloglog"),
  data = d,
  control = glm.control(maxit = 100)
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(cloglog)
##
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + property_type,
## family = binomial(link = "cloglog"), data = d)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.844e+00 5.164e-01 -13.254 < 2e-16 ***
## price 4.097e-06 7.441e-08 55.061 < 2e-16 ***
## baths 6.728e-01 4.637e-02 14.510 < 2e-16 ***
## bedrooms -1.023e+00 4.850e-02 -21.091 < 2e-16 ***
## property_typeFlat 3.642e+00 5.184e-01 7.024 2.15e-12 ***
## property_typeHouse 1.516e+00 5.203e-01 2.914 0.003573 **
## property_typeLower Portion -6.547e-01 5.970e-01 -1.096 0.272863
## property_typePenthouse 2.849e+00 7.596e-01 3.750 0.000177 ***
## property_typeRoom 2.127e+00 6.693e-01 3.178 0.001483 **
## property_typeUpper Portion 1.471e+00 5.416e-01 2.716 0.006599 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 4505.8 on 153420 degrees of freedom
## AIC: 4525.8
##
## Number of Fisher Scoring iterations: 25
AIC(logit) = 3279,2
AIC(probit) = 3605,2
AIC(cloglog) = 4525,8
Mô hình logit có AIC nhỏ nhất nên ta chọn mô hình logit
Deviance(logit) = 3259,2
Deviance(probit) = 3585,2
Deviance(cloglog) = 4505,8
Mô hình logit có deviance nhỏ nhất nên ta chọn mô hình logit
# Brier score of each fitted model (DescTools); lower is better.
DescTools::BrierScore(logit)
## [1] 0.00189995
DescTools::BrierScore(probit)
## [1] 0.002125317
DescTools::BrierScore(cloglog)
## [1] 0.002385188
Dựa vào tiêu chí Brier score ta thấy mô hình logit có giá trị nhỏ nhất nên ta chọn logit
# In-sample confusion matrix for the logit model.
# Fitted probabilities refer to the second level of `purpose`; they are cut at
# 0.5 and coded "1", otherwise "0".
predictions <- predict(object = logit, newdata = d, type = "response")
predicted_classes <- ifelse(test = predictions > 0.5, yes = "1", no = "0")
predictions1 <- factor(x = predicted_classes, levels = c("0", "1"))
# Recode the observed purpose with the same "0"/"1" labels, in level order.
actual <- factor(x = d$purpose, labels = c("0", "1"))
confusionMatrix(data = table(predictions1, actual))
## Confusion Matrix and Statistics
##
## actual
## predictions1 0 1
## 0 43116 313
## 1 67 109934
##
## Accuracy : 0.9975
## 95% CI : (0.9973, 0.9978)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9939
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9984
## Specificity : 0.9972
## Pos Pred Value : 0.9928
## Neg Pred Value : 0.9994
## Prevalence : 0.2815
## Detection Rate : 0.2810
## Detection Prevalence : 0.2831
## Balanced Accuracy : 0.9978
##
## 'Positive' Class : 0
##
Mô hình logit có độ chính xác toàn thể là 99,75%, độ nhạy là 99,84% và độ hiệu quả là 99,72%
# In-sample confusion matrix for the probit model.
# Fitted probabilities refer to the second level of `purpose`; they are cut at
# 0.5 and coded "1", otherwise "0".
predictions <- predict(object = probit, newdata = d, type = "response")
predicted_classes <- ifelse(test = predictions > 0.5, yes = "1", no = "0")
predictions1 <- factor(x = predicted_classes, levels = c("0", "1"))
# Recode the observed purpose with the same "0"/"1" labels, in level order.
actual <- factor(x = d$purpose, labels = c("0", "1"))
confusionMatrix(data = table(predictions1, actual))
## Confusion Matrix and Statistics
##
## actual
## predictions1 0 1
## 0 43111 322
## 1 72 109925
##
## Accuracy : 0.9974
## 95% CI : (0.9972, 0.9977)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9937
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9983
## Specificity : 0.9971
## Pos Pred Value : 0.9926
## Neg Pred Value : 0.9993
## Prevalence : 0.2815
## Detection Rate : 0.2810
## Detection Prevalence : 0.2831
## Balanced Accuracy : 0.9977
##
## 'Positive' Class : 0
##
Mô hình probit có độ chính xác toàn thể là 99,74%, độ nhạy là 99,83% và độ hiệu quả là 99,71%
# In-sample confusion matrix for the cloglog model.
# Fitted probabilities refer to the second level of `purpose`; they are cut at
# 0.5 and coded "1", otherwise "0".
predictions <- predict(object = cloglog, newdata = d, type = "response")
predicted_classes <- ifelse(test = predictions > 0.5, yes = "1", no = "0")
predictions1 <- factor(x = predicted_classes, levels = c("0", "1"))
# Recode the observed purpose with the same "0"/"1" labels, in level order.
actual <- factor(x = d$purpose, labels = c("0", "1"))
confusionMatrix(data = table(predictions1, actual))
## Confusion Matrix and Statistics
##
## actual
## predictions1 0 1
## 0 43089 365
## 1 94 109882
##
## Accuracy : 0.997
## 95% CI : (0.9967, 0.9973)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9926
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9978
## Specificity : 0.9967
## Pos Pred Value : 0.9916
## Neg Pred Value : 0.9991
## Prevalence : 0.2815
## Detection Rate : 0.2808
## Detection Prevalence : 0.2832
## Balanced Accuracy : 0.9973
##
## 'Positive' Class : 0
##
Mô hình cloglog có độ chính xác toàn thể là 99,7%, độ nhạy là 99,78% và độ hiệu quả là 99,67%
Dựa vào 4 tiêu chuẩn trên ta thấy mô hình logit là mô hình được chọn. Do đó Mô hình logit là mô hình phù hợp nhất trong 3 mô hình.