1 Dữ liệu

library(readxl)
dataset_1 <- read_excel("C:/Users/ASUS/Downloads/For_EDA_dataset 1.xlsx")
## New names:
## • `` -> `...1`
View(dataset_1)
dataset_1
## # A tibble: 153,430 × 15
##     ...1 property_type    price location  city  province_name latitude longitude
##    <dbl> <chr>            <dbl> <chr>     <chr> <chr>         <chr>    <chr>    
##  1     0 Flat          10000000 G-10      Isla… Islamabad Ca… 3.367.9… 7.301.264
##  2     1 Flat           6900000 E-11      Isla… Islamabad Ca… 33.700.… 72.971.4…
##  3     2 House         16500000 G-15      Isla… Islamabad Ca… 33.631.… 72.926.5…
##  4     3 House         43500000 Bani Gala Isla… Islamabad Ca… 33.707.… 7.315.11…
##  5     4 House          7000000 DHA Defe… Isla… Islamabad Ca… 33.492.… 73.301.3…
##  6     5 House         34500000 Ghauri T… Isla… Islamabad Ca… 33.623.… 73.126.5…
##  7     6 House         27000000 Korang T… Isla… Islamabad Ca… 33.579.… 7.313.95…
##  8     7 Flat           7800000 E-11      Isla… Islamabad Ca… 33.698.… 72.984.2…
##  9     8 House         50000000 DHA Defe… Isla… Islamabad Ca… 33.540.… 73.095.7…
## 10     9 Penthouse     40000000 F-11      Isla… Islamabad Ca… 33.679.… 72.988.7…
## # ℹ 153,420 more rows
## # ℹ 7 more variables: baths <dbl>, purpose <chr>, bedrooms <dbl>,
## #   date_added <chr>, agency <chr>, agent <chr>, Area_in_Marla <dbl>

2 Xử lí dữ liệu

2.1 Bảng tần số biến purpose

table(dataset_1$purpose)
## 
## For Rent For Sale 
##    43183   110247

2.2 Bảng tần suất

table(dataset_1$purpose)/sum(table(dataset_1$purpose))*100
## 
## For Rent For Sale 
## 28.14508 71.85492

Tỷ lệ căn hộ được sử dụng với mục đích cho thuê là 28.15% và tỷ lệ căn hộ được sử dụng với mục đích bán là 71.85%

library(ggplot2)
ggplot(dataset_1)+geom_bar(aes(x = purpose), binwidth = 0.5, color = "green", fill = "yellow") + labs(x= "Purpose", y = "Tần số") + theme_minimal()
## Warning in geom_bar(aes(x = purpose), binwidth = 0.5, color = "green", fill =
## "yellow"): Ignoring unknown parameters: `binwidth`

3 Ước lượng

p1<-dataset_1[dataset_1$purpose == "For Rent",]
prop.test(length(p1$purpose),length(dataset_1$purpose), p= 0.28)
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(p1$purpose) out of length(dataset_1$purpose), null probability 0.28
## X-squared = 1.5948, df = 1, p-value = 0.2066
## alternative hypothesis: true p is not equal to 0.28
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

4 Hồi quy

4.1 Mô hình logit

dataset_1$purpose<-as.factor(dataset_1$purpose)
dataset_1$property_type<-as.factor(dataset_1$property_type)
logit <-glm(factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "logit"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(logit)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type + 
##     Area_in_Marla, family = binomial(link = "logit"), data = dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 4.798e+13  2.733e+06  1.756e+07   <2e-16 ***
## price                       6.576e+07  5.084e-03  1.294e+10   <2e-16 ***
## baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16 ***
## bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16 ***
## property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16 ***
## property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16 ***
## property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16 ***
## property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16 ***
## property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16 ***
## property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16 ***
## Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25

4.2 Mô hình probit

probit <-glm(factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "probit"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(probit)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type + 
##     Area_in_Marla, family = binomial(link = "probit"), data = dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 7.011e+14  2.733e+06  2.566e+08   <2e-16 ***
## price                       5.405e+07  5.084e-03  1.063e+10   <2e-16 ***
## baths                      -5.222e+13  9.157e+04 -5.702e+08   <2e-16 ***
## bedrooms                   -1.223e+14  1.220e+05 -1.003e+09   <2e-16 ***
## property_typeFlat           1.820e+14  2.752e+06  6.612e+07   <2e-16 ***
## property_typeHouse          3.535e+14  2.739e+06  1.290e+08   <2e-16 ***
## property_typeLower Portion -4.695e+14  2.827e+06 -1.661e+08   <2e-16 ***
## property_typePenthouse     -9.524e+14  4.388e+06 -2.171e+08   <2e-16 ***
## property_typeRoom          -3.869e+15  3.774e+06 -1.025e+09   <2e-16 ***
## property_typeUpper Portion -3.309e+14  2.795e+06 -1.184e+08   <2e-16 ***
## Area_in_Marla              -2.860e+12  1.843e+03 -1.552e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1961424  on 153419  degrees of freedom
## AIC: 1961446
## 
## Number of Fisher Scoring iterations: 25

4.3 Mô hình cloglog

cloglog <-glm(formula = factor (purpose) ~ price + baths + bedrooms + property_type + Area_in_Marla, family = binomial(link= "cloglog"), data = dataset_1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(cloglog)
## 
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type + 
##     Area_in_Marla, family = binomial(link = "cloglog"), data = dataset_1)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 1.014e+14  2.733e+06  3.711e+07   <2e-16 ***
## price                       4.226e+07  5.084e-03  8.311e+09   <2e-16 ***
## baths                      -2.894e+13  9.157e+04 -3.160e+08   <2e-16 ***
## bedrooms                   -4.268e+13  1.220e+05 -3.498e+08   <2e-16 ***
## property_typeFlat           3.185e+14  2.752e+06  1.157e+08   <2e-16 ***
## property_typeHouse         -9.156e+13  2.739e+06 -3.342e+07   <2e-16 ***
## property_typeLower Portion -6.761e+14  2.827e+06 -2.392e+08   <2e-16 ***
## property_typePenthouse      4.042e+14  4.388e+06  9.213e+07   <2e-16 ***
## property_typeRoom          -2.253e+14  3.774e+06 -5.971e+07   <2e-16 ***
## property_typeUpper Portion -1.774e+15  2.795e+06 -6.346e+08   <2e-16 ***
## Area_in_Marla              -1.871e+12  1.843e+03 -1.015e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25

5 Lựa chọn mô hình

5.1 Tiêu chí AIC

AIC (logit) = 1385973

AIC (probit) = 1961446

AIC (cloglog) = 1345964

5.2 Deviance

Deviance (logit) = 1385951

Deviance (probit) = 1961424

Deviance (cloglog) = 1345942

5.3 Brier Score

5.3.1 Logit

library(DescTools)
BrierScore(logit)
## [1] 0.125308

5.3.2 Probit

BrierScore(probit)
## [1] 0.1773382

5.3.3 Cloglog

BrierScore(cloglog)
## [1] 0.1216907