##
## For Rent For Sale
## 43183 110247
##
## For Rent For Sale
## 28.14508 71.85492
Có 28,15% số người trong cuộc khảo sát có ý định cho thuê nhà và 71,85% có ý định bán nhà.
tt |> ggplot(aes( x = purpose , y = after_stat(count))) +
geom_bar(fill = 'moccasin') +
geom_text(aes(label = scales::percent( after_stat(count/sum(count)), accuracy = 0.01)) ,stat = 'count', color = 'black', vjust = 1.5) +
theme_classic() +
labs(title = 'Đồ thị phân bổ của biến purpose' ,x = 'purpose', y = 'Số người')tt %>% group_by(purpose) %>% summarise( n=n()) %>% mutate( percent =n/sum(n)) |> ggplot(aes(x='', y=percent, fill= purpose)) + geom_bar(stat='identity', width = 1) + geom_text(aes(label = paste0(round(percent*100), "%")), position = position_stack(vjust = 0.5))+
coord_polar("y", start = 0) +
scale_fill_manual(values = c("lightpink", "moccasin"), name = "purpose") +
labs(title = "BIỂU ĐỒ PHÂN BỐ BIẾN PURPOSE") +
theme_minimal()rent_count <- table(tt$purpose)
rent_prop <- prop.table(rent_count)
odd_rent <- rent_prop['For Rent'] / rent_prop['For Sale']
odd_rent## For Rent
## 0.3916932
## For Sale
## 2.553019
Trong cuộc khảo sát ta có tỷ lệ người có mục đích cho thuê nhà bằng 39,2% tỷ lệ người có mục đích bán nhà. Hay nói cách khác tỷ lệ người có mục đích bán nhà gấp 2,55 lần tỷ lệ người có mục đích cho thuê nhà.
##
## 1-sample proportions test with continuity correction
##
## data: length(c$purpose) out of length(tt$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.2792029 0.2837097
## sample estimates:
## p
## 0.2814508
Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích cho thuê nhà nằm trong khoảng từ 27,92% đến 28,37%.
##
## 1-sample proportions test with continuity correction
##
## data: length(c1$purpose) out of length(tt$purpose), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.7162903 0.7207971
## sample estimates:
## p
## 0.7185492
Với độ tin cậy 95% ta có tỷ lệ những người trong cuộc khảo sát có mục đích bán nhà nằm trong khoảng từ 71,63% đến 72,08%.
mh1 <- glm( purpose ~ property_type + price + baths + bedrooms + Area_in_Marla, family= binomial( link = 'logit'), data = tt)## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "logit"), data = tt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.798e+13 2.733e+06 1.756e+07 <2e-16 ***
## property_typeFlat 1.556e+15 2.752e+06 5.652e+08 <2e-16 ***
## property_typeHouse -5.182e+14 2.739e+06 -1.892e+08 <2e-16 ***
## property_typeLower Portion -3.930e+14 2.827e+06 -1.390e+08 <2e-16 ***
## property_typePenthouse -4.679e+14 4.388e+06 -1.066e+08 <2e-16 ***
## property_typeRoom -1.277e+15 3.774e+06 -3.384e+08 <2e-16 ***
## property_typeUpper Portion -2.953e+14 2.795e+06 -1.056e+08 <2e-16 ***
## price 6.576e+07 5.084e-03 1.294e+10 <2e-16 ***
## baths -4.013e+13 9.157e+04 -4.382e+08 <2e-16 ***
## bedrooms 6.753e+13 1.220e+05 5.535e+08 <2e-16 ***
## Area_in_Marla -2.979e+12 1.843e+03 -1.617e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1385951 on 153419 degrees of freedom
## AIC: 1385973
##
## Number of Fisher Scoring iterations: 25
## [1] 0.125308
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 33707 9750
## TRUE 9476 100497
##
## Accuracy : 0.8747
## 95% CI : (0.873, 0.8763)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6908
##
## Mcnemar's Test P-Value : 0.04897
##
## Sensitivity : 0.7806
## Specificity : 0.9116
## Pos Pred Value : 0.7756
## Neg Pred Value : 0.9138
## Prevalence : 0.2815
## Detection Rate : 0.2197
## Detection Prevalence : 0.2832
## Balanced Accuracy : 0.8461
##
## 'Positive' Class : FALSE
##
mh2 <- glm(purpose ~ price + baths + bedrooms + Area_in_Marla + property_type, family = binomial(link = 'cloglog'), data = tt)## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Call:
## glm(formula = purpose ~ price + baths + bedrooms + Area_in_Marla +
## property_type, family = binomial(link = "cloglog"), data = tt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.014e+14 2.733e+06 3.711e+07 <2e-16 ***
## price 4.226e+07 5.084e-03 8.311e+09 <2e-16 ***
## baths -2.894e+13 9.157e+04 -3.160e+08 <2e-16 ***
## bedrooms -4.268e+13 1.220e+05 -3.498e+08 <2e-16 ***
## Area_in_Marla -1.871e+12 1.843e+03 -1.015e+09 <2e-16 ***
## property_typeFlat 3.185e+14 2.752e+06 1.157e+08 <2e-16 ***
## property_typeHouse -9.156e+13 2.739e+06 -3.342e+07 <2e-16 ***
## property_typeLower Portion -6.761e+14 2.827e+06 -2.392e+08 <2e-16 ***
## property_typePenthouse 4.042e+14 4.388e+06 9.213e+07 <2e-16 ***
## property_typeRoom -2.253e+14 3.774e+06 -5.971e+07 <2e-16 ***
## property_typeUpper Portion -1.774e+15 2.795e+06 -6.346e+08 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1345942 on 153419 degrees of freedom
## AIC: 1345964
##
## Number of Fisher Scoring iterations: 25
## [1] 0.1216907
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 32529 8017
## TRUE 10654 102230
##
## Accuracy : 0.8783
## 95% CI : (0.8767, 0.8799)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6934
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7533
## Specificity : 0.9273
## Pos Pred Value : 0.8023
## Neg Pred Value : 0.9056
## Prevalence : 0.2815
## Detection Rate : 0.2120
## Detection Prevalence : 0.2643
## Balanced Accuracy : 0.8403
##
## 'Positive' Class : FALSE
##
mh3 <- glm( purpose ~ property_type + price + baths + bedrooms + Area_in_Marla, family= binomial( link = 'probit'), data = tt)## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "probit"), data = tt)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.968e+14 2.733e+06 2.550e+08 <2e-16 ***
## property_typeFlat 1.917e+14 2.752e+06 6.964e+07 <2e-16 ***
## property_typeHouse 3.660e+14 2.739e+06 1.336e+08 <2e-16 ***
## property_typeLower Portion -4.624e+14 2.827e+06 -1.636e+08 <2e-16 ***
## property_typePenthouse -9.449e+14 4.388e+06 -2.153e+08 <2e-16 ***
## property_typeRoom -1.099e+15 3.774e+06 -2.911e+08 <2e-16 ***
## property_typeUpper Portion -3.249e+14 2.795e+06 -1.162e+08 <2e-16 ***
## price 5.401e+07 5.084e-03 1.062e+10 <2e-16 ***
## baths -5.222e+13 9.157e+04 -5.702e+08 <2e-16 ***
## bedrooms -1.233e+14 1.220e+05 -1.011e+09 <2e-16 ***
## Area_in_Marla -2.857e+12 1.843e+03 -1.551e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1959910 on 153419 degrees of freedom
## AIC: 1959932
##
## Number of Fisher Scoring iterations: 25
## [1] 0.1772013
## Confusion Matrix and Statistics
##
##
## FALSE TRUE
## FALSE 16352 357
## TRUE 26831 109890
##
## Accuracy : 0.8228
## 95% CI : (0.8209, 0.8247)
## No Information Rate : 0.7185
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4615
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.3787
## Specificity : 0.9968
## Pos Pred Value : 0.9786
## Neg Pred Value : 0.8038
## Prevalence : 0.2815
## Detection Rate : 0.1066
## Detection Prevalence : 0.1089
## Balanced Accuracy : 0.6877
##
## 'Positive' Class : FALSE
##
AIC <- c(1385973,1345964,1959932)
Brierscore <- c(0.125308,0.1216907,0.1772013)
Deviance <- c( 1385951,1345942,1959910)
confusionMatrix <- c(0.8747,0.8783,0.8228)
MH <- c('logit','cloglog','probit')
Bang <- data.frame(MH, confusionMatrix, Deviance, Brierscore, AIC)
Bang## MH confusionMatrix Deviance Brierscore AIC
## 1 logit 0.8747 1385951 0.1253080 1385973
## 2 cloglog 0.8783 1345942 0.1216907 1345964
## 3 probit 0.8228 1959910 0.1772013 1959932
Phương án tốt nhất là cloglog vì có độ chính xác cao nhất (87,83%), Deviance, AIC, Brierscore nhỏ nhất.