1 Thống kê mô tả biến purpose

1.1 Bảng tần số

table(tt$purpose)
## 
## For Rent For Sale 
##    43183   110247
table(tt$purpose)/sum(table(tt$purpose))*100
## 
## For Rent For Sale 
## 28.14508 71.85492

Có 28,15% số người trong cuộc khảo sát có ý định cho thuê nhà và 71,85% có ý định bán nhà.

1.2 Đồ thị

tt |> ggplot(aes( x = purpose , y = after_stat(count))) +
  geom_bar(fill = 'moccasin') +
  geom_text(aes(label = scales::percent( after_stat(count/sum(count)), accuracy = 0.01)) ,stat = 'count', color = 'black', vjust = 1.5) +
  theme_classic() + 
  labs(title = 'Đồ thị phân bổ của biến purpose' ,x = 'purpose', y = 'Số người')

tt %>% group_by(purpose) %>% summarise( n=n()) %>% mutate( percent =n/sum(n))  |> ggplot(aes(x='', y=percent, fill= purpose)) + geom_bar(stat='identity', width = 1) + geom_text(aes(label = paste0(round(percent*100), "%")), position = position_stack(vjust = 0.5))+
  coord_polar("y", start = 0) +
  scale_fill_manual(values = c("lightpink", "moccasin"), name = "purpose") +
    labs(title = "BIỂU ĐỒ PHÂN BỐ BIẾN PURPOSE") +
  theme_minimal()

1.3 Tính odd

rent_count <- table(tt$purpose)
rent_prop <- prop.table(rent_count)
odd_rent <- rent_prop['For Rent'] / rent_prop['For Sale']
odd_rent
##  For Rent 
## 0.3916932
odd_sale <- rent_prop['For Sale'] / rent_prop['For Rent']
odd_sale
## For Sale 
## 2.553019

Trong cuộc khảo sát ta có tỷ lệ người có mục đích cho thuê nhà bằng 39,2% tỷ lệ người có mục đích bán nhà. Hay nói cách khác tỷ lệ người có mục đích bán nhà gấp 2,55 lần tỷ lệ người có mục đích cho thuê nhà.

2 Ước lượng tỷ lệ

c <- tt[tt$purpose == 'For Rent',]
prop.test( length(c$purpose), length(tt$purpose))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(c$purpose) out of length(tt$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2792029 0.2837097
## sample estimates:
##         p 
## 0.2814508

Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích cho thuê nhà nằm trong khoảng từ 27,92% đến 28,37%.

c1 <- tt[tt$purpose == 'For Sale',]
prop.test( length(c1$purpose), length(tt$purpose))
## 
##  1-sample proportions test with continuity correction
## 
## data:  length(c1$purpose) out of length(tt$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.7162903 0.7207971
## sample estimates:
##         p 
## 0.7185492

Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích bán nhà nằm trong khoảng từ 71,63% đến 72,08%.

3 Hồi quy

tt$purpose <- as.factor(tt$purpose)
mh1 <- glm( purpose ~ property_type + price + baths + bedrooms + Area_in_Marla, family= binomial( link = 'logit'), data = tt)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh1)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "logit"), data = tt)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 4.798e+13  2.733e+06  1.756e+07   <2e-16 ***
## property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16 ***
## property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16 ***
## property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16 ***
## property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16 ***
## property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16 ***
## property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16 ***
## price                       6.576e+07  5.084e-03  1.294e+10   <2e-16 ***
## baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16 ***
## bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16 ***
## Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25
BrierScore(mh1)
## [1] 0.125308
confusionMatrix(table(predict(mh1, type = "response")>=0.5, mh1$data$purpose == 'For Sale'))
## Confusion Matrix and Statistics
## 
##        
##          FALSE   TRUE
##   FALSE  33707   9750
##   TRUE    9476 100497
##                                          
##                Accuracy : 0.8747         
##                  95% CI : (0.873, 0.8763)
##     No Information Rate : 0.7185         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.6908         
##                                          
##  Mcnemar's Test P-Value : 0.04897        
##                                          
##             Sensitivity : 0.7806         
##             Specificity : 0.9116         
##          Pos Pred Value : 0.7756         
##          Neg Pred Value : 0.9138         
##              Prevalence : 0.2815         
##          Detection Rate : 0.2197         
##    Detection Prevalence : 0.2832         
##       Balanced Accuracy : 0.8461         
##                                          
##        'Positive' Class : FALSE          
## 
mh2 <- glm(purpose ~ price + baths + bedrooms + Area_in_Marla + property_type, family = binomial(link = 'cloglog'), data = tt)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh2)
## 
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + Area_in_Marla + 
##     property_type, family = binomial(link = "cloglog"), data = tt)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 1.014e+14  2.733e+06  3.711e+07   <2e-16 ***
## price                       4.226e+07  5.084e-03  8.311e+09   <2e-16 ***
## baths                      -2.894e+13  9.157e+04 -3.160e+08   <2e-16 ***
## bedrooms                   -4.268e+13  1.220e+05 -3.498e+08   <2e-16 ***
## Area_in_Marla              -1.871e+12  1.843e+03 -1.015e+09   <2e-16 ***
## property_typeFlat           3.185e+14  2.752e+06  1.157e+08   <2e-16 ***
## property_typeHouse         -9.156e+13  2.739e+06 -3.342e+07   <2e-16 ***
## property_typeLower Portion -6.761e+14  2.827e+06 -2.392e+08   <2e-16 ***
## property_typePenthouse      4.042e+14  4.388e+06  9.213e+07   <2e-16 ***
## property_typeRoom          -2.253e+14  3.774e+06 -5.971e+07   <2e-16 ***
## property_typeUpper Portion -1.774e+15  2.795e+06 -6.346e+08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25
BrierScore(mh2)
## [1] 0.1216907
confusionMatrix(table(predict(mh2, type = "response") >= 0.5, mh2$data$purpose == 'For Sale'))
## Confusion Matrix and Statistics
## 
##        
##          FALSE   TRUE
##   FALSE  32529   8017
##   TRUE   10654 102230
##                                           
##                Accuracy : 0.8783          
##                  95% CI : (0.8767, 0.8799)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6934          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7533          
##             Specificity : 0.9273          
##          Pos Pred Value : 0.8023          
##          Neg Pred Value : 0.9056          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2120          
##    Detection Prevalence : 0.2643          
##       Balanced Accuracy : 0.8403          
##                                           
##        'Positive' Class : FALSE           
## 
mh3 <- glm( purpose ~ property_type + price + baths + bedrooms + Area_in_Marla, family= binomial( link = 'probit'), data = tt)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(mh3)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "probit"), data = tt)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 6.968e+14  2.733e+06  2.550e+08   <2e-16 ***
## property_typeFlat           1.917e+14  2.752e+06  6.964e+07   <2e-16 ***
## property_typeHouse          3.660e+14  2.739e+06  1.336e+08   <2e-16 ***
## property_typeLower Portion -4.624e+14  2.827e+06 -1.636e+08   <2e-16 ***
## property_typePenthouse     -9.449e+14  4.388e+06 -2.153e+08   <2e-16 ***
## property_typeRoom          -1.099e+15  3.774e+06 -2.911e+08   <2e-16 ***
## property_typeUpper Portion -3.249e+14  2.795e+06 -1.162e+08   <2e-16 ***
## price                       5.401e+07  5.084e-03  1.062e+10   <2e-16 ***
## baths                      -5.222e+13  9.157e+04 -5.702e+08   <2e-16 ***
## bedrooms                   -1.233e+14  1.220e+05 -1.011e+09   <2e-16 ***
## Area_in_Marla              -2.857e+12  1.843e+03 -1.551e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1959910  on 153419  degrees of freedom
## AIC: 1959932
## 
## Number of Fisher Scoring iterations: 25
BrierScore(mh3)
## [1] 0.1772013
confusionMatrix(table(predict(mh3, type = "response") >=0.5, mh3$data$purpose == 'For Sale'))
## Confusion Matrix and Statistics
## 
##        
##          FALSE   TRUE
##   FALSE  16352    357
##   TRUE   26831 109890
##                                           
##                Accuracy : 0.8228          
##                  95% CI : (0.8209, 0.8247)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4615          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3787          
##             Specificity : 0.9968          
##          Pos Pred Value : 0.9786          
##          Neg Pred Value : 0.8038          
##              Prevalence : 0.2815          
##          Detection Rate : 0.1066          
##    Detection Prevalence : 0.1089          
##       Balanced Accuracy : 0.6877          
##                                           
##        'Positive' Class : FALSE           
## 
AIC  <- c(1385973,1345964,1959932)
Brierscore <- c(0.125308,0.1216907,0.1772013)
Deviance <- c( 1385951,1345942,1959910)
confusionMatrix <- c(0.8747,0.8783,0.8228)  
MH <- c('logit','cloglog','probit')
Bang <- data.frame(MH, confusionMatrix, Deviance, Brierscore, AIC)
Bang
##        MH confusionMatrix Deviance Brierscore     AIC
## 1   logit          0.8747  1385951  0.1253080 1385973
## 2 cloglog          0.8783  1345942  0.1216907 1345964
## 3  probit          0.8228  1959910  0.1772013 1959932

Phương án tốt nhất là cloglog vì có độ chính xác cao nhất (87,83%), Deviance, AIC, Brierscore nhỏ nhất.