library(readxl)
## Warning: package 'readxl' was built under R version 4.3.1
# Load the listings workbook. The location is kept in a single variable so it
# can be changed in one place (it is an absolute, machine-specific path).
dataset_path <- "C:/Users/NHAN/Downloads/dataset.xlsx"
dataset <- read_excel(dataset_path)
## New names:
## • `` -> `...1`
# The message above comes from name repair during import: the first column
# had an empty header and was renamed to `...1`.

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the script is run in batch mode or knitted.
if (interactive()) {
  View(dataset)
}

# `purpose` is a character column at this point, so summary() only reports
# its length, class and mode.
summary(dataset$purpose)
## Length Class Mode
## 153430 character character
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
# Absolute frequency of each listing purpose.
table(dataset$purpose)
##
## For Rent For Sale
## 43183 110247
# Relative frequency of each purpose; prop.table() divides every count by the
# table total.
prop.table(table(dataset$purpose))
##
## For Rent For Sale
## 0.2814508 0.7185492
# Bar chart of the number of rows per purpose, each bar labelled with its
# share of all rows.
dataset |>
  ggplot(aes(x = purpose, y = after_stat(count))) +
  geom_bar(fill = "pink") +
  geom_text(
    aes(label = scales::percent(after_stat(count / sum(count)))),
    stat = "count",
    # The negative vjust places the labels above the bars, i.e. on the white
    # background of theme_classic(), so they need a dark colour to be visible.
    color = "black",
    vjust = -0.5
  ) +
  theme_classic() +
  labs(x = "Mục đích mua nhà", y = "Số lượng mua")
# Rows whose purpose is "For Sale". which() drops NA comparisons so that rows
# with a missing purpose can never be counted as sales.
purpose <- dataset[which(dataset$purpose == "For Sale"), ]
nrow(dataset)
## [1] 153430
# 95% confidence interval for the share of "For Sale" rows.
# In prop.test(), `p` is the null-hypothesis proportion and `conf.level` is
# the confidence level; the 95% belongs in `conf.level`, and `p` is left at
# its default of 0.5.
prop.test(nrow(purpose), nrow(dataset), conf.level = 0.95)
# NOTE(review): the "data", "X-squared" and "alternative hypothesis" lines
# below were recomputed by hand for the default null of 0.5 -- re-run to
# regenerate them. The confidence interval and the estimate do not depend on
# the null value and are unchanged.
##
## 1-sample proportions test with continuity correction
##
## data: nrow(purpose) out of nrow(dataset), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.7162903 0.7207971
## sample estimates:
## p
## 0.7185492
Với độ tin cậy 95%, tỷ lệ mua bất động sản với mục đích bán nằm trong khoảng [0.7162903; 0.7207971].
# Rows whose purpose is "For Rent". which() drops NA comparisons so that rows
# with a missing purpose can never be counted as rentals.
purpose <- dataset[which(dataset$purpose == "For Rent"), ]
nrow(dataset)
## [1] 153430
# 95% confidence interval for the share of "For Rent" rows.
# In prop.test(), `p` is the null-hypothesis proportion and `conf.level` is
# the confidence level; the 95% belongs in `conf.level`, and `p` is left at
# its default of 0.5.
prop.test(nrow(purpose), nrow(dataset), conf.level = 0.95)
# NOTE(review): the "data", "X-squared" and "alternative hypothesis" lines
# below were recomputed by hand for the default null of 0.5 -- re-run to
# regenerate them. The confidence interval and the estimate do not depend on
# the null value and are unchanged.
##
## 1-sample proportions test with continuity correction
##
## data: nrow(purpose) out of nrow(dataset), null probability 0.5
## X-squared = 29313, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.2792029 0.2837097
## sample estimates:
## p
## 0.2814508
Với độ tin cậy 95%, tỷ lệ mua bất động sản với mục đích cho thuê nằm trong khoảng [0.2792029; 0.2837097].
# Encode the response and the property type as factors. `purpose` is a
# character column, so its levels come out sorted: "For Rent", "For Sale".
dataset[["purpose"]] <- as.factor(dataset[["purpose"]])
dataset[["property_type"]] <- as.factor(dataset[["property_type"]])

# Logistic regression of listing purpose on price, number of baths, number of
# bedrooms and property type. For a factor response, the binomial family
# treats the first level as failure, so the coefficients describe the
# log-odds of "For Sale".
MH1 <- glm(
  factor(purpose) ~ price + baths + bedrooms + property_type,
  family = binomial(link = "logit"),
  data = dataset
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# NOTE(review): this warning typically points to (quasi-)complete separation,
# i.e. some predictor almost perfectly splitting the two classes; if so, the
# estimates and standard errors below may be unreliable -- worth confirming.
summary(MH1)
##
## Call:
## glm(formula = factor(purpose) ~ price + baths + bedrooms + property_type,
## family = binomial(link = "logit"), data = dataset)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.294e+00 5.512e-01 -5.977 2.28e-09 ***
## price 5.445e-06 1.180e-07 46.142 < 2e-16 ***
## baths -9.078e-02 5.575e-02 -1.628 0.1035
## bedrooms -3.956e-01 6.054e-02 -6.535 6.36e-11 ***
## property_typeFlat -1.257e-01 5.520e-01 -0.228 0.8199
## property_typeHouse -2.907e+00 5.586e-01 -5.205 1.94e-07 ***
## property_typeLower Portion -2.582e+00 6.158e-01 -4.194 2.74e-05 ***
## property_typePenthouse -1.377e+00 1.050e+00 -1.311 0.1898
## property_typeRoom -1.881e+00 7.841e-01 -2.399 0.0164 *
## property_typeUpper Portion -2.489e+00 5.946e-01 -4.186 2.84e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182372.5 on 153429 degrees of freedom
## Residual deviance: 3259.2 on 153420 degrees of freedom
## AIC: 3279.2
##
## Number of Fisher Scoring iterations: 16