library(readxl)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
library(DT)
## Warning: package 'DT' was built under R version 4.3.1
# Load the property listings workbook. readxl reports one cell (G3081) that
# holds a date where a number was expected, and auto-names the unnamed first
# column `...1`.
EDA <- read_excel(path = "D:/data/EDA.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Show the structure (column names, types, leading values) of the loaded data.
str(object = EDA)
## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
## $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
## $ price : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
## $ longitude : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
## $ baths : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
## $ purpose : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
## $ agency : chr [1:153430] "Self" "Self" "Self" "Self" ...
## $ agent : chr [1:153430] "Self" "Self" "Self" "Self" ...
## $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...
# Count the listings for each purpose ("For Rent" / "For Sale").
kq <- table(EDA[["purpose"]])
print(kq)
##
## For Rent For Sale
## 43183 110247
Có 43183 bất động sản được rao cho thuê và 110247 bất động sản được rao bán.
# Share of listings for each purpose: the per-level counts of `purpose`
# divided by the total number of listings.
prop.table(table(EDA$purpose))
##
## For Rent For Sale
## 0.2814508 0.7185492
Tỷ lệ bất động sản cho thuê chiếm 28,15%, để bán chiếm 71,85%.
# Bar chart of the number of listings per purpose; each bar is annotated with
# its share of all listings, formatted as a percentage.
ggplot(EDA, aes(x = purpose)) +
  geom_bar(colour = "black", fill = "pink") +
  geom_text(
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    colour = "black",
    vjust = -0.5
  ) +
  labs(x = "purpose", y = "soluong")
# Keep only the rows whose purpose is "For Rent".
ul1 <- EDA[EDA$purpose == 'For Rent',]
# One-sample proportion test for the rental share: number of rental rows
# (successes) out of all rows (trials). No `p` is supplied, so the null
# proportion is 0.5; the printed 95% confidence interval is what the text
# below interprets.
prop.test( length(ul1$purpose), length(EDA$purpose))
##
## 1-sample proportions test with continuity correction
##
## data: length(ul1$purpose) out of length(EDA$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.2792029 0.2837097
## sample estimates:
## p
## 0.2814508
Với độ tin cậy 95%, tỷ lệ bất động sản được rao với mục đích cho thuê nằm trong khoảng từ 27,92029% đến 28,37097%.
# Keep only the rows whose purpose is "For Sale".
ul2 <- EDA[EDA$purpose == 'For Sale',]
# One-sample proportion test for the sale share: number of sale rows
# (successes) out of all rows (trials). No `p` is supplied, so the null
# proportion is 0.5; the printed 95% confidence interval is what the text
# below interprets.
prop.test( length(ul2$purpose), length(EDA$purpose))
##
## 1-sample proportions test with continuity correction
##
## data: length(ul2$purpose) out of length(EDA$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.7162903 0.7207971
## sample estimates:
## p
## 0.7185492
Với độ tin cậy 95%, tỷ lệ bất động sản được rao với mục đích bán nằm trong khoảng từ 71,62903% đến 72,07971%.
# Encode the response as a factor. "For Rent" is the first level, so the
# binomial fits below model the probability of "For Sale".
EDA$purpose <- as.factor(EDA$purpose)
# Logistic regression (logit link) of purpose on property type, price, baths,
# bedrooms and area.
# NOTE(review): this fit did not converge. The residual deviance (1385951)
# exceeds the null deviance (182373) and the coefficients are of order
# 1e13-1e15, so the estimates should not be interpreted. Presumably caused by
# (quasi-)complete separation and the unscaled `price` -- confirm.
mhlogit <- glm( purpose ~ property_type + price + baths + bedrooms + Area_in_Marla, family= binomial( link = 'logit'), data = EDA)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance summary of the logit fit.
summary(mhlogit)
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "logit"), data = EDA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.798e+13 2.733e+06 1.756e+07 <2e-16 ***
## property_typeFlat 1.556e+15 2.752e+06 5.652e+08 <2e-16 ***
## property_typeHouse -5.182e+14 2.739e+06 -1.892e+08 <2e-16 ***
## property_typeLower Portion -3.930e+14 2.827e+06 -1.390e+08 <2e-16 ***
## property_typePenthouse -4.679e+14 4.388e+06 -1.066e+08 <2e-16 ***
## property_typeRoom -1.277e+15 3.774e+06 -3.384e+08 <2e-16 ***
## property_typeUpper Portion -2.953e+14 2.795e+06 -1.056e+08 <2e-16 ***
## price 6.576e+07 5.084e-03 1.294e+10 <2e-16 ***
## baths -4.013e+13 9.157e+04 -4.382e+08 <2e-16 ***
## bedrooms 6.753e+13 1.220e+05 5.535e+08 <2e-16 ***
## Area_in_Marla -2.979e+12 1.843e+03 -1.617e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1385951 on 153419 degrees of freedom
## AIC: 1385973
##
## Number of Fisher Scoring iterations: 25
# Probit regression of purpose on price, baths, bedrooms and property type.
# NOTE(review): `purpose` is already a factor at this point, so `factor()` is
# redundant. Unlike the logit model, this formula omits `Area_in_Marla`, so the
# two fits are not directly comparable. The warning about fitted probabilities
# of 0 or 1 suggests near-separation -- confirm before interpreting.
mhprobit <- glm(factor (purpose) ~ price + baths + bedrooms + property_type , family = binomial(link = "probit"), data = EDA)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance summary of the probit fit.
summary(mhprobit)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type,
## family = binomial(link = "probit"), data = EDA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.741e+00 2.655e-01 -6.560 5.39e-11 ***
## price 2.429e-06 3.865e-08 62.841 < 2e-16 ***
## baths -1.252e-02 2.169e-02 -0.577 0.563782
## bedrooms -1.534e-01 2.336e-02 -6.569 5.08e-11 ***
## property_typeFlat -2.094e-01 2.656e-01 -0.788 0.430532
## property_typeHouse -1.146e+00 2.673e-01 -4.286 1.82e-05 ***
## property_typeLower Portion -1.017e+00 2.784e-01 -3.653 0.000259 ***
## property_typePenthouse -6.143e-01 4.314e-01 -1.424 0.154458
## property_typeRoom -8.047e-01 3.317e-01 -2.426 0.015266 *
## property_typeUpper Portion -9.890e-01 2.738e-01 -3.612 0.000304 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 3585.2 on 153420 degrees of freedom
## AIC: 3605.2
##
## Number of Fisher Scoring iterations: 19
# Complementary log-log regression of purpose on price, baths, bedrooms and
# property type (same predictors as the probit model).
# NOTE(review): the algorithm did not converge; it stopped after 25 Fisher
# scoring iterations, so the reported estimates may be unreliable. `factor()`
# is redundant because `purpose` is already a factor.
mhcloglog <- glm(formula = factor(purpose)~ price + baths +bedrooms + property_type , family = binomial(link = "cloglog"), data = EDA)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and deviance summary of the cloglog fit.
summary(mhcloglog)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type,
## family = binomial(link = "cloglog"), data = EDA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.844e+00 5.164e-01 -13.254 < 2e-16 ***
## price 4.097e-06 7.441e-08 55.061 < 2e-16 ***
## baths 6.728e-01 4.637e-02 14.510 < 2e-16 ***
## bedrooms -1.023e+00 4.850e-02 -21.091 < 2e-16 ***
## property_typeFlat 3.642e+00 5.184e-01 7.024 2.15e-12 ***
## property_typeHouse 1.516e+00 5.203e-01 2.914 0.003573 **
## property_typeLower Portion -6.547e-01 5.970e-01 -1.096 0.272863
## property_typePenthouse 2.849e+00 7.596e-01 3.750 0.000177 ***
## property_typeRoom 2.127e+00 6.693e-01 3.178 0.001483 **
## property_typeUpper Portion 1.471e+00 5.416e-01 2.716 0.006599 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 4505.8 on 153420 degrees of freedom
## AIC: 4525.8
##
## Number of Fisher Scoring iterations: 25