Các thư viện sẽ được dùng trong bài tập

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library (readxl)
library (ggplot2)
library (DescTools)
library(epitools)
library(caret)

## Warning: package 'caret' was built under R version 4.3.1

## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(DT)

## Warning: package 'DT' was built under R version 4.3.1

Dataset <- read_excel("Dataset.xlsx")

## Warning: Expecting numeric in G3081 / R3081C7: got a date

## New names:
## • `` -> `...1`

str(Dataset)

## tibble [153,430 × 15] (S3: tbl_df/tbl/data.frame)
##  $ ...1         : num [1:153430] 0 1 2 3 4 5 6 7 8 9 ...
##  $ property_type: chr [1:153430] "Flat" "Flat" "House" "House" ...
##  $ price        : num [1:153430] 10000000 6900000 16500000 43500000 7000000 34500000 27000000 7800000 50000000 40000000 ...
##  $ location     : chr [1:153430] "G-10" "E-11" "G-15" "Bani Gala" ...
##  $ city         : chr [1:153430] "Islamabad" "Islamabad" "Islamabad" "Islamabad" ...
##  $ province_name: chr [1:153430] "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" "Islamabad Capital" ...
##  $ latitude     : num [1:153430] 3.37e+06 3.37e+07 3.36e+16 3.37e+13 3.35e+07 ...
##  $ longitude    : num [1:153430] 7.30e+06 7.30e+07 7.29e+07 7.32e+12 7.33e+07 ...
##  $ baths        : num [1:153430] 2 3 6 4 3 8 8 2 7 5 ...
##  $ purpose      : chr [1:153430] "For Sale" "For Sale" "For Sale" "For Sale" ...
##  $ bedrooms     : num [1:153430] 2 3 5 4 3 8 8 2 7 5 ...
##  $ date_added   : POSIXct[1:153430], format: "2019-02-04" "2019-05-04" ...
##  $ agency       : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ agent        : chr [1:153430] "Self" "Self" "Self" "Self" ...
##  $ Area_in_Marla: num [1:153430] 4 5.6 8 40 8 32 20 6.2 20 20 ...

1 Thống kê mô tả

table(Dataset$purpose)

## 
## For Rent For Sale 
##    43183   110247

prop.table(table(Dataset$purpose))

## 
##  For Rent  For Sale 
## 0.2814508 0.7185492

Tỷ lệ căn hộ được sử dụng với mục đích cho thuê là 28.15% và Tỷ lệ căn hộ được sử dụng với mục đích bán là 71.85%

pur <- data.frame(Dataset$purpose)
pur |> ggplot(aes(Dataset$purpose)) + geom_bar() + ylab ("Số căn hộ") + xlab ("Mục đích sử dụng")

2 Thống kê suy diễn - Khoảng Ước lượng tỷ lệ

2.1 Ước lượng tỷ lệ căn hộ có mục đích bán

Ước lượng tỷ lệ căn hộ được sử dụng với mục đích bán và đồng thời kiểm định xem tỷ lệ căn hộ được sử dụng với mục đích bán có phải là 25% hay không. Ta kiểm định giả thuyết:

\(H_0\): Tỷ lệ căn hộ được sử dụng với mục đích bán là 25%

\(H_1\): Tỷ lệ căn hộ được sử dụng với mục đích bán không phải là 25%

pur <- Dataset[Dataset$purpose == "For Sale",]
prop.test(length(pur$purpose), length(Dataset$purpose), p = 0.25)

## 
##  1-sample proportions test with continuity correction
## 
## data:  length(pur$purpose) out of length(Dataset$purpose), null probability 0.25
## X-squared = 179644, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.25
## 95 percent confidence interval:
##  0.7162903 0.7207971
## sample estimates:
##         p 
## 0.7185492

Vì p_value = 2.2e-16 < 0.05 nên ta bác bỏ giả thuyết \(H_0\). Nghĩa là tỷ lệ căn hộ được sử dụng với mục đích bán không phải là 25%.

Với độ tin cậy 95%, ước lượng tỷ lệ căn hộ được sử dụng với mục đích bán nằm trong khoảng từ 0.7162903 đến 0.7207971

2.2 Ước lượng sự chênh lệch về tỷ lệ căn hộ có giá trị lớn hơn 700 triệu giữa mục đích cho thuê và mục đich bán

Giả thuyết:

\(H_0\): không có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có giá trị lớn hơn 700 triệu đồng.

\(H_1\): có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có giá trị lớn hơn 700 triệu đồng.

purs <- Dataset[Dataset$purpose == 'For Sale',]
purr <- Dataset[Dataset$purpose == 'For Rent',]

purs1 <- purs[purs$price > 700000000,]
purr1 <- purr[purr$price > 700000000,]

a <- c(nrow(purs), nrow(purr))
b <- c(nrow(purs1), nrow(purr1))

prop.test(b,a)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  b out of a
## X-squared = 8.8414, df = 1, p-value = 0.002945
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.0001290808 0.0003425873
## sample estimates:
##       prop 1       prop 2 
## 0.0002358341 0.0000000000

Vì p_value = 0.002945 < 0.05 nên ta bác bỏ giả thuyết \(H_0\). Nghĩa là có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có giá trị lớn hơn 700 triệu đồng.

Với độ tin cậy 95%, khoảng chênh lệch giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có giá trị hơn 700 triệu đồng nằm trong khoảng từ 0.0001130100 đến 0.0002521647.

2.3 Ước lượng sự chênh lệch về tỷ lệ căn hộ có nhiều hơn 5 phòng tắm giữa mục đích cho thuê và mục đich bán

Giả thuyết:

\(H_0\): không có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 5 phòng tắm.

\(H_1\): có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 5 phòng tắm.

purs <- Dataset[Dataset$purpose == 'For Sale',]
purr <- Dataset[Dataset$purpose == 'For Rent',]

purs2 <- purs[purs$baths > 5,]
purr2 <- purr[purr$baths > 5,]

a <- c(nrow(purs), nrow(purr))
b <- c(nrow(purs2), nrow(purr2))

prop.test(b,a)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  b out of a
## X-squared = 2064.3, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.09123455 0.09842570
## sample estimates:
##     prop 1     prop 2 
## 0.18776021 0.09293009

Vì p_value = 2.2e-16 < 0.05 nên ta bác bỏ giả thuyết \(H_0\). Nghĩa là có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 5 phòng tắm.

Với độ tin cậy 95%, khoảng chênh lệch giữa mục đích cho thuê và mục đích bán về số lượng căn hộ có nhiều hơn 5 phòng tắm nằm trong khoảng từ 0.09123455 đến 0.09842570.

2.4 Ước lượng sự chênh lệch về tỷ lệ căn hộ có nhiều hơn 7 phòng ngủ giữa mục đích cho thuê và mục đich bán

Giả thuyết:

\(H_0\): không có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 7 phòng ngủ.

\(H_1\): có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 7 phòng ngủ.

purs <- Dataset[Dataset$purpose == 'For Sale',]
purr <- Dataset[Dataset$purpose == 'For Rent',]

purs3 <- purs[purs$bedrooms > 7,]
purr3 <- purr[purr$bedrooms > 7,]

a <- c(nrow(purs), nrow(purr))
b <- c(nrow(purs3), nrow(purr3))

prop.test(b,a)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  b out of a
## X-squared = 21.21, df = 1, p-value = 4.117e-06
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.002233155 0.005356586
## sample estimates:
##     prop 1     prop 2 
## 0.02241331 0.01861844

Vì p_value = 4.117e-06 < 0.05 nên ta bác bỏ giả thuyết \(H_0\). Nghĩa là có sự khác biệt đáng kể trong tỷ lệ giữa mục đích bán và mục đích cho thuê về số lượng căn hộ có nhiều hơn 7 phòng ngủ.

Với độ tin cậy 95%, khoảng chênh lệch giữa mục đích cho thuê và mục đích bán về số lượng căn hộ có nhiều hơn 10 phòng ngủ nằm trong khoảng từ 0.002233155 đến 0.005356586.

3 Mô hình hồi quy

3.1 Hồi quy với hàm logit (logit link function)

fit1 <- glm(factor (purpose) ~ Dataset$property_type + Dataset$price + Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, family = binomial(link = "logit"), data = Dataset)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(fit1)

## 
## Call:
## glm(formula = factor(purpose) ~ Dataset$property_type + Dataset$price + 
##     Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, 
##     family = binomial(link = "logit"), data = Dataset)
## 
## Coefficients:
##                                      Estimate Std. Error    z value Pr(>|z|)
## (Intercept)                         4.798e+13  2.733e+06  1.756e+07   <2e-16
## Dataset$property_typeFlat           1.556e+15  2.752e+06  5.652e+08   <2e-16
## Dataset$property_typeHouse         -5.182e+14  2.739e+06 -1.892e+08   <2e-16
## Dataset$property_typeLower Portion -3.930e+14  2.827e+06 -1.390e+08   <2e-16
## Dataset$property_typePenthouse     -4.679e+14  4.388e+06 -1.066e+08   <2e-16
## Dataset$property_typeRoom          -1.277e+15  3.774e+06 -3.384e+08   <2e-16
## Dataset$property_typeUpper Portion -2.953e+14  2.795e+06 -1.056e+08   <2e-16
## Dataset$price                       6.576e+07  5.084e-03  1.294e+10   <2e-16
## Dataset$baths                      -4.013e+13  9.157e+04 -4.382e+08   <2e-16
## Dataset$bedrooms                    6.753e+13  1.220e+05  5.535e+08   <2e-16
## Dataset$Area_in_Marla              -2.979e+12  1.843e+03 -1.617e+09   <2e-16
##                                       
## (Intercept)                        ***
## Dataset$property_typeFlat          ***
## Dataset$property_typeHouse         ***
## Dataset$property_typeLower Portion ***
## Dataset$property_typePenthouse     ***
## Dataset$property_typeRoom          ***
## Dataset$property_typeUpper Portion ***
## Dataset$price                      ***
## Dataset$baths                      ***
## Dataset$bedrooms                   ***
## Dataset$Area_in_Marla              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1385951  on 153419  degrees of freedom
## AIC: 1385973
## 
## Number of Fisher Scoring iterations: 25

# Giá trị BrierScore
BrierScore(fit1)

## [1] 0.125308

# Ma trận nhầm lẫn
a <- predict(fit1, type = "response")
b <- ifelse(a > 0.5, "1", "0")
c <-factor(b, levels = c("0","1"))
d <- factor(Dataset$purpose, labels = c("0","1"))
confusionMatrix(table(c, d))

## Confusion Matrix and Statistics
## 
##    d
## c        0      1
##   0  33707   9750
##   1   9476 100497
##                                          
##                Accuracy : 0.8747         
##                  95% CI : (0.873, 0.8763)
##     No Information Rate : 0.7185         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.6908         
##                                          
##  Mcnemar's Test P-Value : 0.04897        
##                                          
##             Sensitivity : 0.7806         
##             Specificity : 0.9116         
##          Pos Pred Value : 0.7756         
##          Neg Pred Value : 0.9138         
##              Prevalence : 0.2815         
##          Detection Rate : 0.2197         
##    Detection Prevalence : 0.2832         
##       Balanced Accuracy : 0.8461         
##                                          
##        'Positive' Class : 0              
##

3.2 Hồi quy với hàm probit (probit link function)

fit2 <- glm(factor (purpose) ~ Dataset$property_type + Dataset$price + Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, family = binomial(link = "probit"), data = Dataset)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(fit2)

## 
## Call:
## glm(formula = factor(purpose) ~ Dataset$property_type + Dataset$price + 
##     Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, 
##     family = binomial(link = "probit"), data = Dataset)
## 
## Coefficients:
##                                      Estimate Std. Error    z value Pr(>|z|)
## (Intercept)                         6.968e+14  2.733e+06  2.550e+08   <2e-16
## Dataset$property_typeFlat           1.917e+14  2.752e+06  6.964e+07   <2e-16
## Dataset$property_typeHouse          3.660e+14  2.739e+06  1.336e+08   <2e-16
## Dataset$property_typeLower Portion -4.624e+14  2.827e+06 -1.636e+08   <2e-16
## Dataset$property_typePenthouse     -9.449e+14  4.388e+06 -2.153e+08   <2e-16
## Dataset$property_typeRoom          -1.099e+15  3.774e+06 -2.911e+08   <2e-16
## Dataset$property_typeUpper Portion -3.249e+14  2.795e+06 -1.162e+08   <2e-16
## Dataset$price                       5.401e+07  5.084e-03  1.062e+10   <2e-16
## Dataset$baths                      -5.222e+13  9.157e+04 -5.702e+08   <2e-16
## Dataset$bedrooms                   -1.233e+14  1.220e+05 -1.011e+09   <2e-16
## Dataset$Area_in_Marla              -2.857e+12  1.843e+03 -1.551e+09   <2e-16
##                                       
## (Intercept)                        ***
## Dataset$property_typeFlat          ***
## Dataset$property_typeHouse         ***
## Dataset$property_typeLower Portion ***
## Dataset$property_typePenthouse     ***
## Dataset$property_typeRoom          ***
## Dataset$property_typeUpper Portion ***
## Dataset$price                      ***
## Dataset$baths                      ***
## Dataset$bedrooms                   ***
## Dataset$Area_in_Marla              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1959910  on 153419  degrees of freedom
## AIC: 1959932
## 
## Number of Fisher Scoring iterations: 25

# Giá trị Brier Score
BrierScore(fit2)

## [1] 0.1772013

# Ma trận nhầm lẫn
a <- predict(fit2, type = "response")
b <- ifelse(a > 0.5, "1", "0")
c <-factor(b, levels = c("0","1"))
d <- factor(Dataset$purpose, labels = c("0","1"))
confusionMatrix(table(c, d))

## Confusion Matrix and Statistics
## 
##    d
## c        0      1
##   0  16352    357
##   1  26831 109890
##                                           
##                Accuracy : 0.8228          
##                  95% CI : (0.8209, 0.8247)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4615          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.3787          
##             Specificity : 0.9968          
##          Pos Pred Value : 0.9786          
##          Neg Pred Value : 0.8038          
##              Prevalence : 0.2815          
##          Detection Rate : 0.1066          
##    Detection Prevalence : 0.1089          
##       Balanced Accuracy : 0.6877          
##                                           
##        'Positive' Class : 0               
##

3.3 Hồi quy với hàm tỷ lệ (complementary log-log link function)

fit3 <- glm(factor (purpose) ~ Dataset$property_type + Dataset$price + Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, family = binomial(link = "cloglog"), data = Dataset)

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

summary(fit3)

## 
## Call:
## glm(formula = factor(purpose) ~ Dataset$property_type + Dataset$price + 
##     Dataset$baths + Dataset$bedrooms + Dataset$Area_in_Marla, 
##     family = binomial(link = "cloglog"), data = Dataset)
## 
## Coefficients:
##                                      Estimate Std. Error    z value Pr(>|z|)
## (Intercept)                         1.014e+14  2.733e+06  3.711e+07   <2e-16
## Dataset$property_typeFlat           3.185e+14  2.752e+06  1.157e+08   <2e-16
## Dataset$property_typeHouse         -9.156e+13  2.739e+06 -3.342e+07   <2e-16
## Dataset$property_typeLower Portion -6.761e+14  2.827e+06 -2.392e+08   <2e-16
## Dataset$property_typePenthouse      4.042e+14  4.388e+06  9.213e+07   <2e-16
## Dataset$property_typeRoom          -2.253e+14  3.774e+06 -5.971e+07   <2e-16
## Dataset$property_typeUpper Portion -1.774e+15  2.795e+06 -6.346e+08   <2e-16
## Dataset$price                       4.226e+07  5.084e-03  8.311e+09   <2e-16
## Dataset$baths                      -2.894e+13  9.157e+04 -3.160e+08   <2e-16
## Dataset$bedrooms                   -4.268e+13  1.220e+05 -3.498e+08   <2e-16
## Dataset$Area_in_Marla              -1.871e+12  1.843e+03 -1.015e+09   <2e-16
##                                       
## (Intercept)                        ***
## Dataset$property_typeFlat          ***
## Dataset$property_typeHouse         ***
## Dataset$property_typeLower Portion ***
## Dataset$property_typePenthouse     ***
## Dataset$property_typeRoom          ***
## Dataset$property_typeUpper Portion ***
## Dataset$price                      ***
## Dataset$baths                      ***
## Dataset$bedrooms                   ***
## Dataset$Area_in_Marla              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  182373  on 153429  degrees of freedom
## Residual deviance: 1345942  on 153419  degrees of freedom
## AIC: 1345964
## 
## Number of Fisher Scoring iterations: 25

# Giá trị Brier Score
BrierScore(fit3)

## [1] 0.1216907

# Ma trận nhầm lẫn
a <- predict(fit3, type = "response")
b <- ifelse(a > 0.5, "1", "0")
c <-factor(b, levels = c("0","1"))
d <- factor(Dataset$purpose, labels = c("0","1"))
confusionMatrix(table(c, d))

## Confusion Matrix and Statistics
## 
##    d
## c        0      1
##   0  32529   8017
##   1  10654 102230
##                                           
##                Accuracy : 0.8783          
##                  95% CI : (0.8767, 0.8799)
##     No Information Rate : 0.7185          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6934          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7533          
##             Specificity : 0.9273          
##          Pos Pred Value : 0.8023          
##          Neg Pred Value : 0.9056          
##              Prevalence : 0.2815          
##          Detection Rate : 0.2120          
##    Detection Prevalence : 0.2643          
##       Balanced Accuracy : 0.8403          
##                                           
##        'Positive' Class : 0               
##

3.4 Lựa chọn mô hình phù hợp

Mô hình	Chỉ số AIC	Deviance	Brier Score	Confusion matrix
fit1	1385973	1385951	0.1253080	0.8747
fit2	1959932	1959910	0.1772013	0.8228
fit3	1345964	1345942	0.1216907	0.8783
Lựa chọn	Mô hình 3	Mô hình 3	Mô hình 3	Mô hình 3

Cả 4 tiêu chí dun để đánh giá một mô hình bao gồm AIC, Deviance, Brier Score và Confusion Matrix đều đưa ra kết quả cho thấy mô hình 3 - Mô hình sử dụng hàm cloglog là tốt nhất trong 3 mô hình được đề xuất. Do đó mô hình hồi quy logistic với hàm tỷ lệ (complementary log-log link function) là mô hình tốt nhất.

THỰC HÀNH NGÀY 26/07/2023

Trần Thị Minh Phương

2023-07-26

1 Thống kê mô tả

2 Thống kê suy diễn - Khoảng Ước lượng tỷ lệ

2.1 Ước lượng tỷ lệ căn hộ có mục đích bán

2.2 Ước lượng sự chênh lệch về tỷ lệ căn hộ có giá trị lớn hơn 700 triệu giữa mục đích cho thuê và mục đich bán

2.3 Ước lượng sự chênh lệch về tỷ lệ căn hộ có nhiều hơn 5 phòng tắm giữa mục đích cho thuê và mục đich bán

2.4 Ước lượng sự chênh lệch về tỷ lệ căn hộ có nhiều hơn 7 phòng ngủ giữa mục đích cho thuê và mục đich bán

3 Mô hình hồi quy

3.1 Hồi quy với hàm logit (logit link function)

3.2 Hồi quy với hàm probit (probit link function)

3.3 Hồi quy với hàm tỷ lệ (complementary log-log link function)

3.4 Lựa chọn mô hình phù hợp