1 Nhập dữ liệu

library(readxl)
# Load the listings workbook into `datass`.
# The import messages below show that cell G3081 holds a date where a number
# was expected, and that the unnamed first column was renamed to `...1`.
# NOTE(review): the affected value in column G should be checked before that
# column is used in any analysis.
datass <- read_excel("D:/datass.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Open the spreadsheet-style viewer only in an interactive session; View()
# has no data viewer to open when the script is knitted or run in batch mode.
if (interactive()) {
  View(datass)
}

2 Thống kê mô tả

2.1 Thống kê mô tả biến purpose

# Basic summary of the `purpose` column (a character vector at this point).
summary(datass$purpose)
##    Length     Class      Mode 
##    153430 character character
# Absolute frequency of each purpose category.
table(datass$purpose)
## 
## For Rent For Sale 
##    43183   110247
# Relative frequency of each category (counts divided by the total count).
prop.table(table(datass$purpose))
## 
##  For Rent  For Sale 
## 0.2814508 0.7185492

Dựa vào kết quả, ta thấy có 153430 căn nhà được khảo sát, trong đó 43183 căn thuộc loại For Rent, chiếm 28.15% tổng số căn nhà khảo sát; 110247 căn còn lại thuộc loại For Sale, chiếm 71.85%.

library('ggplot2')
## Warning: package 'ggplot2' was built under R version 4.3.1
# Bar chart of the number of listings per purpose category.
# `color` sets the bar outline. It was previously misspelled as `olor`, which
# ggplot2 ignored with an "unknown parameters" warning.
datass |>
  ggplot(aes(purpose)) +
  geom_bar(color = "yellow", fill = "yellow") +
  theme_classic() +
  labs(x = "purpose", y = "số lượng")

3 Ước lượng tỷ lệ

Ước lượng tỷ lệ loại tài sản dùng để cho thuê và đồng thời kiểm định xem tỉ lệ(%) loại tài sản dùng để cho thuê có phải là 25% hay không?

Kiểm định giả thuyết: H0: p = 0.25 (đối thuyết H1: p ≠ 0.25), trong đó p là tỉ lệ loại tài sản dùng để cho thuê (For Rent).

# One-sample proportion test of H0: p = 0.25 against H1: p != 0.25, where p is
# the share of "For Rent" listings.
# x must be the number of "For Rent" rows and n the number of observed rows.
# The earlier call passed length(datass$purpose) for both, which forced the
# estimate to 1, and it tested p = 0.3 instead of the stated 0.25.
purpose_obs <- datass$purpose[!is.na(datass$purpose)]  # drop missing values
n_rent <- sum(purpose_obs == "For Rent")
prop.test(x = n_rent, n = length(purpose_obs), p = 0.25)
# NOTE: re-knit the document to regenerate the printed test output. The sample
# estimate should equal the "For Rent" share in the frequency table above
# (43183 / 153430, about 0.2815).

Ta có p-value < 2.2e-16 < 0.05, vì vậy bác bỏ giả thuyết H0: tỉ lệ loại tài sản dùng để cho thuê khác 25% ở mức ý nghĩa 5%.

4 Hồi Quy

4.1 Mô hình hồi quy logit

Chọn purpose làm biến phụ thuộc; các biến độc lập gồm property_type, price, baths, bedrooms và Area_in_Marla.

# Binomial GLM with a logit link for `purpose`.
# The response is converted to a factor first. For a binomial family the first
# factor level is treated as failure, so with levels "For Rent" / "For Sale"
# the model describes the probability of "For Sale".
datass$purpose <- as.factor(datass$purpose)
mh1 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "logit"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit did not converge, so the estimates printed by
# summary() should not be interpreted as they stand. Possible causes to check:
# the scale of `price`, or (quasi-)complete separation of the two classes.
summary(mh1)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "logit"), data = datass)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 4.798e+13  2.733e+06  1.756e+07   <2e-16 ***
## property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16 ***
## property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16 ***
## property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16 ***
## property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16 ***
## property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16 ***
## property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16 ***
## price                       6.576e+07  5.084e-03  1.294e+10   <2e-16 ***
## baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16 ***
## bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16 ***
## Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25

4.2 Mô hình hồi quy probit

# Binomial GLM with a probit link for `purpose`, using the same predictors as
# the other link-function models in this section.
# as.factor() leaves `purpose` unchanged if it is already a factor.
datass$purpose <- as.factor(datass$purpose)
mh2 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "probit"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit did not converge, so the estimates printed by
# summary() should not be interpreted as they stand. Possible causes to check:
# the scale of `price`, or (quasi-)complete separation of the two classes.
summary(mh2)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "probit"), data = datass)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 6.968e+14  2.733e+06  2.550e+08   <2e-16 ***
## property_typeFlat           1.917e+14  2.752e+06  6.964e+07   <2e-16 ***
## property_typeHouse          3.660e+14  2.739e+06  1.336e+08   <2e-16 ***
## property_typeLower Portion -4.624e+14  2.827e+06 -1.636e+08   <2e-16 ***
## property_typePenthouse     -9.449e+14  4.388e+06 -2.153e+08   <2e-16 ***
## property_typeRoom          -1.099e+15  3.774e+06 -2.911e+08   <2e-16 ***
## property_typeUpper Portion -3.249e+14  2.795e+06 -1.162e+08   <2e-16 ***
## price                       5.401e+07  5.084e-03  1.062e+10   <2e-16 ***
## baths                      -5.222e+13  9.157e+04 -5.702e+08   <2e-16 ***
## bedrooms                   -1.233e+14  1.220e+05 -1.011e+09   <2e-16 ***
## Area_in_Marla              -2.857e+12  1.843e+03 -1.551e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1959910  on 153419  degrees of freedom
## AIC: 1959932
## 
## Number of Fisher Scoring iterations: 25

4.3 Mô hình hồi quy cloglog

# Binomial GLM with a complementary log-log link for `purpose`, using the same
# predictors as the other link-function models in this section.
# as.factor() leaves `purpose` unchanged if it is already a factor.
datass$purpose <- as.factor(datass$purpose)
mh3 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "cloglog"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit did not converge, so the estimates printed by
# summary() should not be interpreted as they stand. Possible causes to check:
# the scale of `price`, or (quasi-)complete separation of the two classes.
summary(mh3)
## 
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms + 
##     Area_in_Marla, family = binomial(link = "cloglog"), data = datass)
## 
## Coefficients:
##                              Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)                 1.014e+14  2.733e+06  3.711e+07   <2e-16 ***
## property_typeFlat           3.185e+14  2.752e+06  1.157e+08   <2e-16 ***
## property_typeHouse         -9.156e+13  2.739e+06 -3.342e+07   <2e-16 ***
## property_typeLower Portion -6.761e+14  2.827e+06 -2.392e+08   <2e-16 ***
## property_typePenthouse      4.042e+14  4.388e+06  9.213e+07   <2e-16 ***
## property_typeRoom          -2.253e+14  3.774e+06 -5.971e+07   <2e-16 ***
## property_typeUpper Portion -1.774e+15  2.795e+06 -6.346e+08   <2e-16 ***
## price                       4.226e+07  5.084e-03  8.311e+09   <2e-16 ***
## baths                      -2.894e+13  9.157e+04 -3.160e+08   <2e-16 ***
## bedrooms                   -4.268e+13  1.220e+05 -3.498e+08   <2e-16 ***
## Area_in_Marla              -1.871e+12  1.843e+03 -1.015e+09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25

5 Lựa chọn mô hình

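Chọn mô hình logit. Lưu ý: cả ba mô hình đều phát cảnh báo thuật toán chưa hội tụ và có residual deviance lớn hơn null deviance, nên cần khắc phục vấn đề hội tụ trước khi so sánh các mô hình; theo kết quả hiện tại, AIC nhỏ nhất thuộc về mô hình cloglog (1345964), tiếp theo là logit (1385973) và probit (1959932).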