library(readxl)
# Load the property-listings workbook into a tibble.
# The first column of the sheet has no header, so readxl assigns it the
# placeholder name `...1` (see the "New names" message recorded below).
data_path <- "C:/Users/ASUS/Downloads/For_EDA_dataset 1.xlsx"
dataset_1 <- read_excel(data_path)
## New names:
## • `` -> `...1`
# Open the spreadsheet-style viewer only when a user is present; View() has
# nothing useful to do in a non-interactive run (Rscript, knitting).
if (interactive()) {
  View(dataset_1)
}
# Print the tibble preview (dimensions, column types, first rows).
dataset_1
## # A tibble: 153,430 × 15
## ...1 property_type price location city province_name latitude longitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 0 Flat 10000000 G-10 Isla… Islamabad Ca… 3.367.9… 7.301.264
## 2 1 Flat 6900000 E-11 Isla… Islamabad Ca… 33.700.… 72.971.4…
## 3 2 House 16500000 G-15 Isla… Islamabad Ca… 33.631.… 72.926.5…
## 4 3 House 43500000 Bani Gala Isla… Islamabad Ca… 33.707.… 7.315.11…
## 5 4 House 7000000 DHA Defe… Isla… Islamabad Ca… 33.492.… 73.301.3…
## 6 5 House 34500000 Ghauri T… Isla… Islamabad Ca… 33.623.… 73.126.5…
## 7 6 House 27000000 Korang T… Isla… Islamabad Ca… 33.579.… 7.313.95…
## 8 7 Flat 7800000 E-11 Isla… Islamabad Ca… 33.698.… 72.984.2…
## 9 8 House 50000000 DHA Defe… Isla… Islamabad Ca… 33.540.… 73.095.7…
## 10 9 Penthouse 40000000 F-11 Isla… Islamabad Ca… 33.679.… 72.988.7…
## # ℹ 153,420 more rows
## # ℹ 7 more variables: baths <dbl>, purpose <chr>, bedrooms <dbl>,
## # date_added <chr>, agency <chr>, agent <chr>, Area_in_Marla <dbl>
# Absolute number of listings for each purpose.
purpose_counts <- table(dataset_1$purpose)
purpose_counts
##
## For Rent For Sale
## 43183 110247
# The same counts expressed as a percentage of all listings.
prop.table(purpose_counts) * 100
##
## For Rent For Sale
## 28.14508 71.85492
Tỷ lệ bất động sản được sử dụng với mục đích cho thuê là 28.15% và tỷ lệ bất động sản được sử dụng với mục đích bán là 71.85%.
library(ggplot2)
# Bar chart of the number of listings per purpose.
# geom_bar() has no `binwidth` parameter (that belongs to histograms) and
# silently ignored it with a warning; the bar width is set with `width`.
ggplot(dataset_1) +
  geom_bar(aes(x = purpose), width = 0.5, color = "green", fill = "yellow") +
  labs(x = "Purpose", y = "Tần số") +
  theme_minimal()
# One-sample proportion test of H0: the share of "For Rent" listings is 0.28.
# Counts are taken with which()/is.na() instead of length() on a logically
# indexed subset: logical indexing returns an all-NA row for every NA in
# `purpose`, which would be counted as a "For Rent" listing and would also
# inflate the denominator.
is_rent <- dataset_1$purpose == "For Rent"
# Rows of "For Rent" listings only (kept under the original name `p1`).
p1 <- dataset_1[which(is_rent), ]
n_rent <- nrow(p1)
n_total <- sum(!is.na(is_rent))
prop.test(n_rent, n_total, p = 0.28)
##
## 1-sample proportions test with continuity correction
##
## data: n_rent out of n_total, null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
## 0.2792029 0.2837097
## sample estimates:
## p
## 0.2814508
# Encode the response and the property type as factors for modelling.
dataset_1$purpose<-as.factor(dataset_1$purpose)
dataset_1$property_type<-as.factor(dataset_1$property_type)
# Binomial GLM with a logit link: purpose regressed on price, baths, bedrooms,
# property type and area (in marla).
# For a factor response glm() treats the first level as "failure"; with the
# two levels shown in the frequency table this presumably models
# P(purpose == "For Sale") -- confirm with levels(dataset_1$purpose).
# The factor() wrapper is redundant because purpose is already a factor.
# NOTE(review): the warnings recorded below show that the fit did not
# converge. The summary reports 25 Fisher scoring iterations (glm's default
# cap) and a residual deviance larger than the null deviance, so the
# coefficients and p-values should not be interpreted. Possible causes to
# check: the scale of price (values around 1e7 in the data preview) relative
# to the other predictors, and/or (quasi-)complete separation -- TODO confirm.
logit <-glm(factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "logit"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics (deviance, AIC) of the logit model.
summary(logit)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type +
## Area_in_Marla, family = binomial(link = "logit"), data = dataset_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.798e+13 2.733e+06 1.756e+07 <2e-16 ***
## price 6.576e+07 5.084e-03 1.294e+10 <2e-16 ***
## baths -4.013e+13 9.157e+04 -4.382e+08 <2e-16 ***
## bedrooms 6.753e+13 1.220e+05 5.535e+08 <2e-16 ***
## property_typeFlat 1.556e+15 2.752e+06 5.652e+08 <2e-16 ***
## property_typeHouse -5.182e+14 2.739e+06 -1.892e+08 <2e-16 ***
## property_typeLower Portion -3.930e+14 2.827e+06 -1.390e+08 <2e-16 ***
## property_typePenthouse -4.679e+14 4.388e+06 -1.066e+08 <2e-16 ***
## property_typeRoom -1.277e+15 3.774e+06 -3.384e+08 <2e-16 ***
## property_typeUpper Portion -2.953e+14 2.795e+06 -1.056e+08 <2e-16 ***
## Area_in_Marla -2.979e+12 1.843e+03 -1.617e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1385951 on 153419 degrees of freedom
## AIC: 1385973
##
## Number of Fisher Scoring iterations: 25
# Same response and predictors as the logit model, fitted with a probit link
# so the link functions can be compared.
# NOTE(review): the warnings recorded below show that this fit did not
# converge either (25 Fisher scoring iterations, residual deviance above the
# null deviance in the summary), so its estimates are not reliable -- the
# cause still needs to be diagnosed.
probit <-glm(factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "probit"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics (deviance, AIC) of the probit model.
summary(probit)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type +
## Area_in_Marla, family = binomial(link = "probit"), data = dataset_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.011e+14 2.733e+06 2.566e+08 <2e-16 ***
## price 5.405e+07 5.084e-03 1.063e+10 <2e-16 ***
## baths -5.222e+13 9.157e+04 -5.702e+08 <2e-16 ***
## bedrooms -1.223e+14 1.220e+05 -1.003e+09 <2e-16 ***
## property_typeFlat 1.820e+14 2.752e+06 6.612e+07 <2e-16 ***
## property_typeHouse 3.535e+14 2.739e+06 1.290e+08 <2e-16 ***
## property_typeLower Portion -4.695e+14 2.827e+06 -1.661e+08 <2e-16 ***
## property_typePenthouse -9.524e+14 4.388e+06 -2.171e+08 <2e-16 ***
## property_typeRoom -3.869e+15 3.774e+06 -1.025e+09 <2e-16 ***
## property_typeUpper Portion -3.309e+14 2.795e+06 -1.184e+08 <2e-16 ***
## Area_in_Marla -2.860e+12 1.843e+03 -1.552e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1961424 on 153419 degrees of freedom
## AIC: 1961446
##
## Number of Fisher Scoring iterations: 25
# Same response and predictors again, fitted with a complementary log-log
# link as the third candidate model.
# NOTE(review): the warnings recorded below show that this fit did not
# converge either (25 Fisher scoring iterations, residual deviance above the
# null deviance in the summary), so its estimates are not reliable -- the
# cause still needs to be diagnosed.
cloglog <-glm(formula = factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "cloglog"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table and fit statistics (deviance, AIC) of the cloglog model.
summary(cloglog)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type +
## Area_in_Marla, family = binomial(link = "cloglog"), data = dataset_1)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.014e+14 2.733e+06 3.711e+07 <2e-16 ***
## price 4.226e+07 5.084e-03 8.311e+09 <2e-16 ***
## baths -2.894e+13 9.157e+04 -3.160e+08 <2e-16 ***
## bedrooms -4.268e+13 1.220e+05 -3.498e+08 <2e-16 ***
## property_typeFlat 3.185e+14 2.752e+06 1.157e+08 <2e-16 ***
## property_typeHouse -9.156e+13 2.739e+06 -3.342e+07 <2e-16 ***
## property_typeLower Portion -6.761e+14 2.827e+06 -2.392e+08 <2e-16 ***
## property_typePenthouse 4.042e+14 4.388e+06 9.213e+07 <2e-16 ***
## property_typeRoom -2.253e+14 3.774e+06 -5.971e+07 <2e-16 ***
## property_typeUpper Portion -1.774e+15 2.795e+06 -6.346e+08 <2e-16 ***
## Area_in_Marla -1.871e+12 1.843e+03 -1.015e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1345942 on 153419 degrees of freedom
## AIC: 1345964
##
## Number of Fisher Scoring iterations: 25
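AIC (logit) = 1385973
AIC (probit) = 1961446
AIC (cloglog) = 1345964
Deviance (logit) = 1385951
Deviance (probit) = 1961424
Deviance (cloglog) = 1345942
Mô hình cloglog có AIC và deviance nhỏ nhất trong ba mô hình; tuy nhiên cả ba mô hình đều không hội tụ (xem cảnh báo của glm.fit ở trên), nên các so sánh này cần được diễn giải thận trọng.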
library(DescTools)
# Brier score of each fitted model, computed with DescTools::BrierScore() on
# the glm objects. Under the usual definition (mean squared difference between
# predicted probability and observed outcome) a lower score is better.
# NOTE(review): none of the three fits converged (see the glm.fit warnings
# recorded with each model), so these scores are indicative at best.
BrierScore(logit)
## [1] 0.125308
BrierScore(probit)
## [1] 0.1773382
BrierScore(cloglog)
## [1] 0.1216907