library(readxl)
# Location of the Excel workbook that holds the listings analysed below.
excel_path <- "C:/RRR/For_EDA_dataset 1.xlsx"
# Read the workbook into TH2; every later step works on this table.
TH2 <- read_excel(path = excel_path)
## Warning: Expecting numeric in G3081 / R3081C7: got a date
## New names:
## • `` -> `...1`
# Show the dimensions, column types and leading values of TH2.
str(object = TH2)
## tibble [153,433 × 15] (S3: tbl_df/tbl/data.frame)
## $ ...1 : num [1:153433] 0 1 2 3 4 5 6 7 8 9 ...
## $ property_type: chr [1:153433] "Flat" "Flat" "House" "House" ...
## $ price : num [1:153433] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
## $ location : chr [1:153433] "G-10" "E-11" "G-15" "Bani Gala" ...
## $ city : chr [1:153433] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
## $ province_name: chr [1:153433] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
## $ latitude : num [1:153433] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
## $ longitude : num [1:153433] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
## $ baths : num [1:153433] 2 3 6 4 3 8 8 2 7 5 ...
## $ purpose : chr [1:153433] "For Sale" "For Sale" "For Sale" "For Sale" ...
## $ bedrooms : num [1:153433] 2 3 5 4 3 8 8 2 7 5 ...
## $ date_added : POSIXct[1:153433], format: "2019-02-04" "2019-05-04" ...
## $ agency : chr [1:153433] "Self" "Self" "Self" "Self" ...
## $ agent : chr [1:153433] "Self" "Self" "Self" "Self" ...
## $ Area_in_Marla: num [1:153433] 4 5.6 8 40 8 32 20 6.2 20 20 ...
# Per-column summary of TH2 (quartiles for numeric columns, NA counts).
summary(object = TH2)
## ...1 property_type price location
## Min. : 0 Length:153433 Min. :0.000e+00 Length:153433
## 1st Qu.: 40454 Class :character 1st Qu.:2.000e+05 Class :character
## Median : 81389 Mode :character Median :8.700e+06 Mode :character
## Mean : 82557 Mean :1.842e+07
## 3rd Qu.:124933 3rd Qu.:2.000e+07
## Max. :168445 Max. :2.000e+09
## NA's :3 NA's :3
## city province_name latitude longitude
## Length:153433 Length:153433 Min. :2.492e+04 Min. :6.719e+04
## Class :character Class :character 1st Qu.:2.492e+07 1st Qu.:6.710e+07
## Mode :character Mode :character Median :3.146e+07 Median :7.312e+07
## Mean :3.098e+15 Mean :1.634e+15
## 3rd Qu.:3.364e+07 3rd Qu.:6.713e+12
## Max. :3.373e+16 Max. :8.016e+15
## NA's :3 NA's :3
## baths purpose bedrooms
## Min. : 0.000 Length:153433 Min. : 0.000
## 1st Qu.: 1.000 Class :character 1st Qu.: 2.000
## Median : 3.000 Mode :character Median : 3.000
## Mean : 2.956 Mean : 3.259
## 3rd Qu.: 5.000 3rd Qu.: 5.000
## Max. :403.000 Max. :68.000
## NA's :3 NA's :3
## date_added agency agent
## Min. :2018-08-05 00:00:00.00 Length:153433 Length:153433
## 1st Qu.:2019-05-05 00:00:00.00 Class :character Class :character
## Median :2019-06-27 00:00:00.00 Mode :character Mode :character
## Mean :2019-05-20 20:05:07.27
## 3rd Qu.:2019-07-04 00:00:00.00
## Max. :2019-07-18 00:00:00.00
## NA's :3
## Area_in_Marla
## Min. : 0.00
## 1st Qu.: 4.90
## Median : 7.60
## Mean : 12.33
## 3rd Qu.: 12.00
## Max. :16000.00
## NA's :3
# Count the listings under each value of `purpose` and display the counts.
p1 <- table(TH2[["purpose"]])
print(p1)
##
## For Rent For Sale
## 43183 110247
# Share of listings under each purpose (counts divided by their total).
prop.table(table(TH2$purpose))
##
## For Rent For Sale
## 0.2814508 0.7185492
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Bar chart of the number of listings for each purpose.
ggplot(data = TH2, mapping = aes(x = purpose)) +
  geom_bar(colour = "purple", fill = "lavender") +
  labs(x = "Mục đích của căn hộ", y = "Số căn hộ")
# Keep only the listings whose purpose is "For Sale" and preview the first rows.
sale <- subset(TH2, purpose == "For Sale")
head(sale, n = 6L)
## # A tibble: 6 × 15
## ...1 property_type price location city province_name latitude longitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 0 Flat 10000000 G-10 Isla… Islamabad Ca… 3.37e 6 7.30e 6
## 2 1 Flat 6900000 E-11 Isla… Islamabad Ca… 3.37e 7 7.30e 7
## 3 2 House 16500000 G-15 Isla… Islamabad Ca… 3.36e16 7.29e 7
## 4 3 House 43500000 Bani Gala Isla… Islamabad Ca… 3.37e13 7.32e12
## 5 4 House 7000000 DHA Defen… Isla… Islamabad Ca… 3.35e 7 7.33e 7
## 6 5 House 34500000 Ghauri To… Isla… Islamabad Ca… 3.36e16 7.31e 7
## # ℹ 7 more variables: baths <dbl>, purpose <chr>, bedrooms <dbl>,
## # date_added <dttm>, agency <chr>, agent <chr>, Area_in_Marla <dbl>
# Keep only the listings whose purpose is "For Rent" and preview the first rows.
rent <- subset(TH2, purpose == "For Rent")
head(rent, n = 6L)
## # A tibble: 6 × 15
## ...1 property_type price location city province_name latitude longitude
## <dbl> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 35 House 110000 DHA Defence Isla… Islamabad Ca… 33542039 7.31e 7
## 2 36 House 600000 F-7 Isla… Islamabad Ca… 33720413 7.31e 7
## 3 37 Lower Portion 33000 G-15 Isla… Islamabad Ca… 33633416 7.29e15
## 4 38 House 45000 Pakistan To… Isla… Islamabad Ca… 33575405 7.31e 7
## 5 39 Upper Portion 12000 Bhara kahu Isla… Islamabad Ca… 33737402 7.32e 7
## 6 40 House 175000 F-7 Isla… Islamabad Ca… 33720413 7.31e 7
## # ℹ 7 more variables: baths <dbl>, purpose <chr>, bedrooms <dbl>,
## # date_added <dttm>, agency <chr>, agent <chr>, Area_in_Marla <dbl>
Đặt giả thiết:
\(H_0:\) Với các căn hộ có giá trị lớn hơn 29500000 đồng, tỷ lệ giữa người mua “for sale” và “for rent” không có chênh lệch đáng kể
\(H_1:\) Với các căn hộ có giá trị lớn hơn 29500000 đồng, tỷ lệ giữa người mua “for sale” và “for rent” có chênh lệch đáng kể
library(stats)
# Two-sample test of equal proportions: within the "For Sale" listings and
# within the "For Rent" listings, compare the share whose price exceeds the
# threshold.
# NOTE(review): any recorded output showing both sample proportions equal to 1
# (X-squared = NaN) came from an earlier version that compared the text column
# `purpose` with the threshold instead of `price`; re-run to refresh it.
price_threshold <- 29500000

# which() drops rows where `purpose` is NA; plain logical indexing would keep
# them as all-NA rows and inflate nrow().
pur <- TH2[which(TH2$purpose == "For Sale"), ]
purf <- TH2[which(TH2$purpose == "For Rent"), ]

# The threshold applies to the numeric `price` column.
pur1 <- pur[which(pur$price > price_threshold), ]
purf1 <- purf[which(purf$price > price_threshold), ]

a <- c(nrow(pur), nrow(purf))    # group sizes (trials)
b <- c(nrow(pur1), nrow(purf1))  # listings above the threshold (successes)
prop.test(b, a)
## Warning in prop.test(b, a): Chi-squared approximation may be incorrect
##
## 2-sample test for equality of proportions without continuity correction
##
## data: b out of a
## X-squared = NaN, df = 1, p-value = NA
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0 0
## sample estimates:
## prop 1 prop 2
## 1 1
# Fit a binomial GLM with a logit link. The response is `purpose` converted to
# a factor; the predictors are price, property_type, baths, bedrooms and
# Area_in_Marla.
# NOTE(review): the warnings recorded below say the fit did not converge and
# that fitted probabilities of 0 or 1 occurred, so the coefficients and AIC
# printed by summary() should be read with caution. Presumably the raw scale of
# `price` contributes to this; TODO confirm (e.g. by rescaling price).
HQ1 <- glm(factor(purpose) ~ price + property_type + baths + bedrooms + Area_in_Marla, family = binomial(link = 'logit'), data = TH2)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table, deviances and AIC of the logit fit.
summary(HQ1)
##
## Call:
## glm(formula = factor(purpose) ~ price + property_type + baths +
## bedrooms + Area_in_Marla, family = binomial(link = "logit"),
## data = TH2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.798e+13 2.733e+06 1.756e+07 <2e-16 ***
## price 6.576e+07 5.084e-03 1.294e+10 <2e-16 ***
## property_typeFlat 1.556e+15 2.752e+06 5.652e+08 <2e-16 ***
## property_typeHouse -5.182e+14 2.739e+06 -1.892e+08 <2e-16 ***
## property_typeLower Portion -3.930e+14 2.827e+06 -1.390e+08 <2e-16 ***
## property_typePenthouse -4.679e+14 4.388e+06 -1.066e+08 <2e-16 ***
## property_typeRoom -1.277e+15 3.774e+06 -3.384e+08 <2e-16 ***
## property_typeUpper Portion -2.953e+14 2.795e+06 -1.056e+08 <2e-16 ***
## baths -4.013e+13 9.157e+04 -4.382e+08 <2e-16 ***
## bedrooms 6.753e+13 1.220e+05 5.535e+08 <2e-16 ***
## Area_in_Marla -2.979e+12 1.843e+03 -1.617e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1385951 on 153419 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 1385973
##
## Number of Fisher Scoring iterations: 25
Theo kết quả ước lượng, mô hình logit có AIC: 1385973
# Fit a binomial GLM with a probit link, using the same response
# (`purpose` as a factor) and the same predictors as the logit model.
# NOTE(review): the warnings recorded below say the fit did not converge and
# that fitted probabilities of 0 or 1 occurred, so the coefficients and AIC
# printed by summary() should be read with caution. Presumably the raw scale of
# `price` contributes to this; TODO confirm (e.g. by rescaling price).
HQ2 <- glm(factor(purpose) ~ price + property_type + baths + bedrooms + Area_in_Marla, family = binomial(link = 'probit'), data = TH2)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table, deviances and AIC of the probit fit.
summary(HQ2)
##
## Call:
## glm(formula = factor(purpose) ~ price + property_type + baths +
## bedrooms + Area_in_Marla, family = binomial(link = "probit"),
## data = TH2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.011e+14 2.733e+06 2.566e+08 <2e-16 ***
## price 5.405e+07 5.084e-03 1.063e+10 <2e-16 ***
## property_typeFlat 1.820e+14 2.752e+06 6.612e+07 <2e-16 ***
## property_typeHouse 3.535e+14 2.739e+06 1.290e+08 <2e-16 ***
## property_typeLower Portion -4.695e+14 2.827e+06 -1.661e+08 <2e-16 ***
## property_typePenthouse -9.524e+14 4.388e+06 -2.171e+08 <2e-16 ***
## property_typeRoom -3.869e+15 3.774e+06 -1.025e+09 <2e-16 ***
## property_typeUpper Portion -3.309e+14 2.795e+06 -1.184e+08 <2e-16 ***
## baths -5.222e+13 9.157e+04 -5.702e+08 <2e-16 ***
## bedrooms -1.223e+14 1.220e+05 -1.003e+09 <2e-16 ***
## Area_in_Marla -2.860e+12 1.843e+03 -1.552e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1961424 on 153419 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 1961446
##
## Number of Fisher Scoring iterations: 25
Theo kết quả ước lượng, mô hình probit có AIC: 1961446
# Fit a binomial GLM with a complementary log-log link, using the same response
# (`purpose` as a factor) and the same predictors as the logit model.
# NOTE(review): the warnings recorded below say the fit did not converge and
# that fitted probabilities of 0 or 1 occurred, so the coefficients and AIC
# printed by summary() should be read with caution. Presumably the raw scale of
# `price` contributes to this; TODO confirm (e.g. by rescaling price).
HQ3 <- glm(factor(purpose) ~ price + property_type + baths + bedrooms + Area_in_Marla, family = binomial(link = 'cloglog'), data = TH2)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print the coefficient table, deviances and AIC of the cloglog fit.
summary(HQ3)
##
## Call:
## glm(formula = factor(purpose) ~ price + property_type + baths +
## bedrooms + Area_in_Marla, family = binomial(link = "cloglog"),
## data = TH2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.014e+14 2.733e+06 3.712e+07 <2e-16 ***
## price 4.226e+07 5.084e-03 8.311e+09 <2e-16 ***
## property_typeFlat 3.185e+14 2.752e+06 1.157e+08 <2e-16 ***
## property_typeHouse -9.157e+13 2.739e+06 -3.343e+07 <2e-16 ***
## property_typeLower Portion -6.761e+14 2.827e+06 -2.392e+08 <2e-16 ***
## property_typePenthouse 4.042e+14 4.388e+06 9.213e+07 <2e-16 ***
## property_typeRoom -2.253e+14 3.774e+06 -5.971e+07 <2e-16 ***
## property_typeUpper Portion -1.774e+15 2.795e+06 -6.346e+08 <2e-16 ***
## baths -2.894e+13 9.157e+04 -3.160e+08 <2e-16 ***
## bedrooms -4.268e+13 1.220e+05 -3.498e+08 <2e-16 ***
## Area_in_Marla -1.871e+12 1.843e+03 -1.015e+09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 182373 on 153429 degrees of freedom
## Residual deviance: 1345942 on 153419 degrees of freedom
## (3 observations deleted due to missingness)
## AIC: 1345964
##
## Number of Fisher Scoring iterations: 25
Theo kết quả ước lượng, mô hình cloglog có AIC: 1345964
Dựa vào kết quả hồi quy của 3 mô hình, ta có:
Logit (AIC: 1385973)
probit (AIC: 1961446)
cloglog (AIC: 1345964)
Kết quả cho thấy mô hình cloglog có chỉ số AIC nhỏ nhất trong 3 mô hình. Vì vậy, mô hình cloglog là mô hình hồi quy tốt nhất trong trường hợp này.
library (DescTools)
## Warning: package 'DescTools' was built under R version 4.2.3
# Brier score of the logit fit.
DescTools::BrierScore(HQ1)
## [1] 0.125308
# Brier score of the probit fit.
DescTools::BrierScore(HQ2)
## [1] 0.1773382
# Brier score of the cloglog fit.
DescTools::BrierScore(HQ3)
## [1] 0.1216907
Theo kết quả ước lượng, ta thấy mô hình cloglog có chỉ số Brier Score nhỏ nhất. Vậy, mô hình cloglog là tốt nhất