library(readxl)
# Load the listings workbook into `datass`. The path is an absolute Windows
# path, so this line only works where D:/datass.xlsx exists.
datass <- read_excel("D:/datass.xlsx")
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# NOTE(review): per the warning above, cell G3081 holds a date in a column
# that was read as numeric -- check that cell in the workbook. The "New names"
# message shows the unnamed first column was renamed to `...1`.
# Open the data in the interactive viewer for a visual check.
View(datass)
# `purpose` is a character column at this point, so summary() reports only
# its length, class and mode rather than a frequency breakdown.
summary(datass$purpose)
## Length Class Mode
## 153430 character character
# Absolute frequency of each listing purpose.
table(datass$purpose)
##
## For Rent For Sale
## 43183 110247
# Relative frequency of each listing purpose (counts divided by their total).
prop.table(table(datass$purpose))
##
## For Rent For Sale
## 0.2814508 0.7185492
Dựa vào kết quả, ta thấy có 153430 căn nhà được khảo sát, trong đó 43183 căn thuộc loại For Rent (cho thuê), chiếm 28.15% tổng số căn nhà được khảo sát.
library('ggplot2')
## Warning: package 'ggplot2' was built under R version 4.3.1
# Bar chart of the number of listings per purpose.
# `color` sets the bar outline and `fill` the bar interior; both are yellow.
datass |>
  ggplot(aes(purpose)) +
  geom_bar(color = "yellow", fill = "yellow") +
  theme_classic() +
  labs(x = "purpose", y = "số lượng")
Ước lượng tỷ lệ loại tài sản dùng để cho thuê và đồng thời kiểm định xem tỉ lệ(%) loại tài sản dùng để cho thuê có phải là 25% hay không?
Kiểm định giả thuyết: H0: p = 0.25 (đối thuyết H1: p ≠ 0.25), với p là tỉ lệ tài sản dùng để cho thuê.
# One-sample proportion test of H0: p = 0.25 for the share of "For Rent"
# listings. `n_rent` is the number of rental listings (successes) and
# `n_known` the number of listings with a non-missing purpose (trials).
# NOTE(review): the recorded output below was produced by a call that passed
# the sample size as both successes and trials and tested against p = 0.3;
# re-run this chunk to refresh it.
n_rent <- sum(datass$purpose == "For Rent", na.rm = TRUE)
n_known <- sum(!is.na(datass$purpose))
prop.test(n_rent, n_known, p = 0.25)
##
## 1-sample proportions test with continuity correction
##
## data: length(datass$purpose) out of length(datass$purpose), null probability 0.3
## X-squared = 358000, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.3
## 95 percent confidence interval:
## 0.9999688 1.0000000
## sample estimates:
## p
## 1
Ta có p_value <0.05, vì vậy tỉ lệ loại tài sản dùng cho thuê không bằng 25% với mức ý nghĩa 5%.
Chọn purpose làm biến phụ thuộc, với các biến giải thích là property_type, price, baths, bedrooms và Area_in_Marla.
# Binomial GLM with a logit link: listing purpose explained by property type,
# price, bathrooms, bedrooms and area.
# The character response is converted to a factor first. With a factor
# response, glm() treats the first level ("For Rent") as failure, so the
# fitted probabilities are for "For Sale".
datass$purpose <- as.factor(datass$purpose)
mh1 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "logit"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit stopped at the iteration limit without converging, and
# the residual deviance reported below is larger than the null deviance, so
# the estimates and p-values should not be interpreted as they stand.
summary(mh1)
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "logit"), data = datass)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.798e+13 2.733e+06 1.756e+07 <2e-16 ***
## property_typeFlat 1.556e+15 2.752e+06 5.652e+08 <2e-16 ***
## property_typeHouse -5.182e+14 2.739e+06 -1.892e+08 <2e-16 ***
## property_typeLower Portion -3.930e+14 2.827e+06 -1.390e+08 <2e-16 ***
## property_typePenthouse -4.679e+14 4.388e+06 -1.066e+08 <2e-16 ***
## property_typeRoom -1.277e+15 3.774e+06 -3.384e+08 <2e-16 ***
## property_typeUpper Portion -2.953e+14 2.795e+06 -1.056e+08 <2e-16 ***
## price 6.576e+07 5.084e-03 1.294e+10 <2e-16 ***
## baths -4.013e+13 9.157e+04 -4.382e+08 <2e-16 ***
## bedrooms 6.753e+13 1.220e+05 5.535e+08 <2e-16 ***
## Area_in_Marla -2.979e+12 1.843e+03 -1.617e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1385951 on 153419 degrees of freedom
## AIC: 1385973
##
## Number of Fisher Scoring iterations: 25
# Binomial GLM with a probit link on the same response and predictors.
# as.factor() leaves `purpose` unchanged if it is already a factor. With a
# factor response, glm() treats the first level ("For Rent") as failure, so
# the fitted probabilities are for "For Sale".
datass$purpose <- as.factor(datass$purpose)
mh2 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "probit"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit stopped at the iteration limit without converging, and
# the residual deviance reported below is larger than the null deviance, so
# the estimates and p-values should not be interpreted as they stand.
summary(mh2)
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "probit"), data = datass)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 6.968e+14 2.733e+06 2.550e+08 <2e-16 ***
## property_typeFlat 1.917e+14 2.752e+06 6.964e+07 <2e-16 ***
## property_typeHouse 3.660e+14 2.739e+06 1.336e+08 <2e-16 ***
## property_typeLower Portion -4.624e+14 2.827e+06 -1.636e+08 <2e-16 ***
## property_typePenthouse -9.449e+14 4.388e+06 -2.153e+08 <2e-16 ***
## property_typeRoom -1.099e+15 3.774e+06 -2.911e+08 <2e-16 ***
## property_typeUpper Portion -3.249e+14 2.795e+06 -1.162e+08 <2e-16 ***
## price 5.401e+07 5.084e-03 1.062e+10 <2e-16 ***
## baths -5.222e+13 9.157e+04 -5.702e+08 <2e-16 ***
## bedrooms -1.233e+14 1.220e+05 -1.011e+09 <2e-16 ***
## Area_in_Marla -2.857e+12 1.843e+03 -1.551e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1959910 on 153419 degrees of freedom
## AIC: 1959932
##
## Number of Fisher Scoring iterations: 25
# Binomial GLM with a complementary log-log link on the same response and
# predictors.
# as.factor() leaves `purpose` unchanged if it is already a factor. With a
# factor response, glm() treats the first level ("For Rent") as failure, so
# the fitted probabilities are for "For Sale".
datass$purpose <- as.factor(datass$purpose)
mh3 <- glm(
  purpose ~ property_type + price + baths + bedrooms + Area_in_Marla,
  family = binomial(link = "cloglog"),
  data = datass
)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): the fit stopped at the iteration limit without converging, and
# the residual deviance reported below is larger than the null deviance, so
# the estimates and p-values should not be interpreted as they stand.
summary(mh3)
##
## Call:
## glm(formula = purpose ~ property_type + price + baths + bedrooms +
## Area_in_Marla, family = binomial(link = "cloglog"), data = datass)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.014e+14 2.733e+06 3.711e+07 <2e-16 ***
## property_typeFlat 3.185e+14 2.752e+06 1.157e+08 <2e-16 ***
## property_typeHouse -9.156e+13 2.739e+06 -3.342e+07 <2e-16 ***
## property_typeLower Portion -6.761e+14 2.827e+06 -2.392e+08 <2e-16 ***
## property_typePenthouse 4.042e+14 4.388e+06 9.213e+07 <2e-16 ***
## property_typeRoom -2.253e+14 3.774e+06 -5.971e+07 <2e-16 ***
## property_typeUpper Portion -1.774e+15 2.795e+06 -6.346e+08 <2e-16 ***
## price 4.226e+07 5.084e-03 8.311e+09 <2e-16 ***
## baths -2.894e+13 9.157e+04 -3.160e+08 <2e-16 ***
## bedrooms -4.268e+13 1.220e+05 -3.498e+08 <2e-16 ***
## Area_in_Marla -1.871e+12 1.843e+03 -1.015e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1345942 on 153419 degrees of freedom
## AIC: 1345964
##
## Number of Fisher Scoring iterations: 25
Chọn mô hình logit. (Lưu ý: cả ba mô hình đều không hội tụ nên các giá trị AIC ở trên chưa đủ tin cậy để so sánh.)