L4_Multinomial Logit

Multinomial Logits

Bối cảnh/ Giới thiệu

Mô hình Logit đa thức (Đa bậc) thường được sử dụng để xem xét lựa chọn của người tiêu dùng/hộ/… về các lựa chọn không thứ bậc. Ví dụ: người tham gia giao thông lựa chọn xe bus, xe máy, đi bộ, metro,..hoặc người tiêu dùng khi vào cửa hàng tiện lợi mua nước sẽ lựa chọn nhãn hiệu nào:…Mô hình gắn liền với thuyết kinh tế về lý thuyết lựa chọn của người tiêu dùng, thuyết về hàm hữu dụng của người tiêu dùng .

Assumptions: Cần lưu ý mô hình này thường dựa trên các giả định quan trọng (nhưng khó kiểm tra định lượng):

(1) Independence of irrelevant alternatives (IIA): các lựa chọn trong biến phụ thuộc là độc lập

(2) The outcome is categorical: Các lựa chọn trong biến phụ thuộc là thang đo định danh

(3) The log-odds of the outcome and independent variable have a linear relationship: mô hình là tuyến tính (có thể check thông qua so sánh các mô hình phi tuyến, bình phương, hàm mũ khi thử nghiệm các mô hình khác nhau)

(4) Errors are independent: Sai số độc lập (kiểm tra sai số sau ước lượng)

(5) No severe multicolinearity: Không bị hiện tượng đa cộng tuyến nặng, kiểm tra VIF ## Thư viện

library(tidyverse)
Warning: package 'tidyverse' was built under R version 4.4.1
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nnet)

Data

Data <-haven::read_dta("C:\\Users\\Huynh Chuong\\Desktop\\University\\UEL\\Class_QuantMethods\\Giáo trình\\Data\\mus\\mus15data.dta")
Data %>% variable.names()
 [1] "mode"     "price"    "crate"    "dbeach"   "dpier"    "dprivate"
 [7] "dcharter" "pbeach"   "ppier"    "pprivate" "pcharter" "qbeach"  
[13] "qpier"    "qprivate" "qcharter" "income"  
str(Data) # Data structure
tibble [1,182 × 16] (S3: tbl_df/tbl/data.frame)
 $ mode    : dbl+lbl [1:1182] 4, 4, 3, 2, 3, 4, 1, 4, 3, 3, 2, 3, 2, 2, 1, 2, 1, 2,...
   ..@ label       : chr "Fishing mode"
   ..@ format.stata: chr "%9.0g"
   ..@ labels      : Named num [1:4] 1 2 3 4
   .. ..- attr(*, "names")= chr [1:4] "beach" "pier" "private" "charter"
 $ price   : num [1:1182] 182.9 34.5 24.3 15.1 41.5 ...
  ..- attr(*, "label")= chr "price for chosen alternative"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ crate   : num [1:1182] 0.5391 0.4671 0.2413 0.0789 0.1082 ...
  ..- attr(*, "label")= chr "catch rate for chosen alternative"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ dbeach  : num [1:1182] 0 0 0 0 0 0 1 0 0 0 ...
  ..- attr(*, "label")= chr "1 if beach mode chosen"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ dpier   : num [1:1182] 0 0 0 1 0 0 0 0 0 0 ...
  ..- attr(*, "label")= chr "1 if pier mode chosen"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ dprivate: num [1:1182] 0 0 1 0 1 0 0 0 1 1 ...
  ..- attr(*, "label")= chr "1 if private boat mode chosen"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ dcharter: num [1:1182] 1 1 0 0 0 1 0 1 0 0 ...
  ..- attr(*, "label")= chr "1 if charter boat mode chosen"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ pbeach  : num [1:1182] 157.9 15.1 161.9 15.1 106.9 ...
  ..- attr(*, "label")= chr "price for beach mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ ppier   : num [1:1182] 157.9 15.1 161.9 15.1 106.9 ...
  ..- attr(*, "label")= chr "price for pier mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ pprivate: num [1:1182] 157.9 10.5 24.3 55.9 41.5 ...
  ..- attr(*, "label")= chr "price for private boat mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ pcharter: num [1:1182] 182.9 34.5 59.3 84.9 71 ...
  ..- attr(*, "label")= chr "price for charter boat mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ qbeach  : num [1:1182] 0.0678 0.1049 0.5333 0.0678 0.0678 ...
  ..- attr(*, "label")= chr "catch rate for beach mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ qpier   : num [1:1182] 0.0503 0.0451 0.4522 0.0789 0.0503 ...
  ..- attr(*, "label")= chr "catch rate for pier mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ qprivate: num [1:1182] 0.26 0.157 0.241 0.164 0.108 ...
  ..- attr(*, "label")= chr "catch rate for private boat mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ qcharter: num [1:1182] 0.539 0.467 1.027 0.539 0.324 ...
  ..- attr(*, "label")= chr "catch rate for charter boat mode"
  ..- attr(*, "format.stata")= chr "%9.0g"
 $ income  : num [1:1182] 7.08 1.25 3.75 2.08 4.58 ...
  ..- attr(*, "label")= chr "monthly income in thousands $"
  ..- attr(*, "format.stata")= chr "%9.0g"
attr( Data$mode, "label") # Thành tố của biến attribute
[1] "Fishing mode"
attr( Data$mode, "labels")
  beach    pier private charter 
      1       2       3       4 
ids = attr(Data$mode, "labels")
labels = names(attr(Data$mode, "labels"))

df <- data.frame(id = ids, label = labels)
df
        id   label
beach    1   beach
pier     2    pier
private  3 private
charter  4 charter

Khám phá dữ liệu

Các em tự thực hiện thêm

Data %>%
  mutate(Y=as_factor(mode)) -> Data # Tạo ra biến Y
Data |>  
  select(Y) |>
  table () 
Y
  beach    pier private charter 
    134     178     418     452 
Data |>  
  select(Y) |>
  table () |>
  prop.table() |>
  round(digits = 3)*100
Y
  beach    pier private charter 
   11.3    15.1    35.4    38.2 
Data |> # dấu này có thể thay %>%, cách viết | >
  select(Y, income) |>
  group_by(Y) %>%
  summarise(min =min(income),
            mean=mean(income),
            max=max(income),
            sd=sd(income))
# A tibble: 4 × 5
  Y         min  mean   max    sd
  <fct>   <dbl> <dbl> <dbl> <dbl>
1 beach   0.417  4.05  12.5  2.51
2 pier    0.417  3.39  12.5  2.34
3 private 0.417  4.65  12.5  2.78
4 charter 0.417  3.88  12.5  2.05

Multinomial logit

mlogit0 <- multinom(mode ~ income, data = Data)
# weights:  12 (6 variable)
initial  value 1638.599935 
iter  10 value 1477.505846
final  value 1477.150569 
converged
summary(mlogit0)
Call:
multinom(formula = mode ~ income, data = Data)

Coefficients:
  (Intercept)      income
2   0.8141701 -0.14340453
3   0.7389569  0.09190030
4   1.3413284 -0.03164588

Std. Errors:
  (Intercept)     income
2   0.2286325 0.05328822
3   0.1967314 0.04066362
4   0.1945173 0.04184622

Residual Deviance: 2954.301 
AIC: 2966.301 
mlogit <- multinom(Y ~ income, data = Data)
# weights:  12 (6 variable)
initial  value 1638.599935 
iter  10 value 1477.505846
final  value 1477.150569 
converged
output<- summary(mlogit)
output
Call:
multinom(formula = Y ~ income, data = Data)

Coefficients:
        (Intercept)      income
pier      0.8141701 -0.14340453
private   0.7389569  0.09190030
charter   1.3413284 -0.03164588

Std. Errors:
        (Intercept)     income
pier      0.2286325 0.05328822
private   0.1967314 0.04066362
charter   0.1945173 0.04184622

Residual Deviance: 2954.301 
AIC: 2966.301 
coef(mlogit) %>%
  exp() 
        (Intercept)    income
pier       2.257301 0.8664035
private    2.093750 1.0962555
charter    3.824120 0.9688496

Tính chỉ số ODD

round(exp(coef(mlogit))-1, digits=3)
        (Intercept) income
pier          1.257 -0.134
private       1.094  0.096
charter       2.824 -0.031
models <- list("Câu cá ( so với bãi biển)" = mlogit)
modelsummary::modelsummary(models,
                           shape = term + response ~ statistic,
                           statistic = c({"std.error"}),
                           stars =  c('*' = .1,"**"=0.05, '***' = .01))
tinytable_io6wnyx4hs1qsy6wy1iy
Câu cá ( so với bãi biển)
response Est. S.E.
* p < 0.1, ** p < 0.05, *** p < 0.01
(Intercept) pier 0.814*** 0.229
private 0.739*** 0.197
charter 1.341*** 0.195
income pier -0.143*** 0.053
private 0.092** 0.041
charter -0.032 0.042

Tobit model

Giới thiệu

Khi dữ liệu bị chặn (trên, dưới hoặc cả 2) thì mô hình OLS ước lượng sẽ bị chệch.

Mô hình này ước lượng nhằm hiệu chỉnh tính chệch của OLS.

Thư viện

library(ggplot2)
library(GGally)
Warning: package 'GGally' was built under R version 4.4.1
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2
library(VGAM)
Warning: package 'VGAM' was built under R version 4.4.1
Loading required package: stats4
Loading required package: splines

Data

Data <-haven::read_dta("C:\\Users\\Huynh Chuong\\Desktop\\University\\UEL\\Class_QuantMethods\\Giáo trình\\Data\\mus\\mus16data.dta")
variable.names(Data)
 [1] "educ"       "age"        "income"     "female"     "vgood"     
 [6] "good"       "hospexp"    "totchr"     "ffs"        "dhospexp"  
[11] "age2"       "agefem"     "fairpoor"   "year01"     "instype"   
[16] "ambexp"     "lambexp"    "blhisp"     "instype_s1" "dambexp"   
[21] "lnambx"     "ins"       

Khám phá dữ liệu

Các em tự thực hiện thêm

summary(Data)
      educ            age            income           female      
 Min.   : 0.00   Min.   :2.100   Min.   :-90.05   Min.   :0.0000  
 1st Qu.:12.00   1st Qu.:3.100   1st Qu.: 19.46   1st Qu.:0.0000  
 Median :13.00   Median :4.000   Median : 31.09   Median :1.0000  
 Mean   :13.41   Mean   :4.057   Mean   : 36.80   Mean   :0.5084  
 3rd Qu.:16.00   3rd Qu.:4.900   3rd Qu.: 46.83   3rd Qu.:1.0000  
 Max.   :17.00   Max.   :6.400   Max.   :237.30   Max.   :1.0000  
                                                                  
     vgood             good           hospexp             totchr      
 Min.   :0.0000   Min.   :0.0000   Min.   :     0.0   Min.   :0.0000  
 1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:     0.0   1st Qu.:0.0000  
 Median :0.0000   Median :0.0000   Median :     0.0   Median :0.0000  
 Mean   :0.4159   Mean   :0.2761   Mean   :   481.6   Mean   :0.4832  
 3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:     0.0   3rd Qu.:1.0000  
 Max.   :1.0000   Max.   :1.0000   Max.   :175775.0   Max.   :5.0000  
                                                                      
      ffs            dhospexp            age2           agefem     
 Min.   :0.0000   Min.   :0.00000   Min.   : 4.41   Min.   :0.000  
 1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.: 9.61   1st Qu.:0.000  
 Median :0.0000   Median :0.00000   Median :16.00   Median :2.100  
 Mean   :0.4123   Mean   :0.05559   Mean   :17.72   Mean   :2.076  
 3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:24.01   3rd Qu.:4.100  
 Max.   :1.0000   Max.   :1.00000   Max.   :40.96   Max.   :6.400  
                                                                   
    fairpoor          year01     instype           ambexp       
 Min.   :0.0000   Min.   :1   Min.   :0.0000   Min.   :    0.0  
 1st Qu.:0.0000   1st Qu.:1   1st Qu.:0.0000   1st Qu.:  113.0  
 Median :0.0000   Median :1   Median :1.0000   Median :  534.5  
 Mean   :0.0604   Mean   :1   Mean   :0.5877   Mean   : 1386.5  
 3rd Qu.:0.0000   3rd Qu.:1   3rd Qu.:1.0000   3rd Qu.: 1618.0  
 Max.   :1.0000   Max.   :1   Max.   :1.0000   Max.   :49960.0  
                                                                
    lambexp           blhisp         instype_s1        dambexp      
 Min.   : 0.000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
 1st Qu.: 5.618   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:1.0000  
 Median : 6.658   Median :0.0000   Median :0.0000   Median :1.0000  
 Mean   : 6.555   Mean   :0.3086   Mean   :0.3651   Mean   :0.8419  
 3rd Qu.: 7.556   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000  
 Max.   :10.819   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
 NA's   :526                                                        
     lnambx            ins        
 Min.   : 0.000   Min.   :0.0000  
 1st Qu.: 4.727   1st Qu.:0.0000  
 Median : 6.281   Median :0.0000  
 Mean   : 5.519   Mean   :0.3651  
 3rd Qu.: 7.389   3rd Qu.:1.0000  
 Max.   :10.819   Max.   :1.0000  
                                  
# setup base plot
p <- ggplot(Data, aes(x = ambexp, fill=as.factor(female) ))

# histogram, coloured by proportion in different programs
# with a normal distribution overlayed
f <- function(x, var, bw = 500) {
  dnorm(x, mean = mean(var), sd(var)) * length(var)  * bw
}
p + stat_bin(binwidth=500)  +
  stat_function(fun = f, size = 1,
    args = list(var = Data$ambexp))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
Warning: Multiple drawing groups in `geom_function()`
ℹ Did you use the correct group, colour, or fill aesthetics?

tobit1 <- vglm( ambexp ~ age +female+ educ+ blhisp+ totchr+ ins, tobit(Lower = 0),data=Data)
tobit2 <- vglm( lambexp ~ age +female+ educ+ blhisp+ totchr+ ins, tobit(Lower = 0 ),data=Data)
OLS1 <- lm( ambexp ~ age +female+ educ+ blhisp+ totchr+ ins,data=Data)
OLS2 <- lm( lambexp ~ age +female+ educ+ blhisp+ totchr+ ins,data=Data)
models <- list("Tobit 1" = tobit1,"OLS 1"= OLS1,"Tobit 2" =  tobit2, "OLS 2"=OLS2)
modelsummary::modelsummary(models, 
                           statistic = c({"std.error"}),
                           stars =  c('*' = .1,"**"=0.05, '***' = .01))
tinytable_kgshawlqv1wtka4l3sgv
Tobit 1 OLS 1 Tobit 2 OLS 2
* p < 0.1, ** p < 0.05, *** p < 0.01
(Intercept) × 1 -1882.785*** 4.908***
(329.555) (0.168)
(Intercept) × 2 7.854*** 0.238***
(0.016) (0.013)
age 314.166*** 250.228*** 0.217*** 0.217***
(44.113) (37.391) (0.022) (0.022)
female 685.000*** 374.975*** 0.380*** 0.379***
(96.048) (81.262) (0.049) (0.049)
educ 70.866*** 33.566** 0.022** 0.022**
(19.222) (16.129) (0.010) (0.010)
blhisp -530.327*** -310.734*** -0.238*** -0.239***
(108.104) (90.567) (0.055) (0.055)
totchr 1244.618*** 1076.201*** 0.562*** 0.562***
(61.746) (53.988) (0.030) (0.031)
ins -167.498* -237.631*** -0.021 -0.021
(100.031) (84.737) (0.050) (0.050)
(Intercept) -606.593** 4.908***
(274.240) (0.168)
Num.Obs. 3328 3328 2802 2802
R2 0.160 0.192
R2 Adj. 0.158 0.190
AIC 52734.8 61038.1 9301.0 9298.3
BIC 52783.7 61087.0 9348.5 9345.8
Log.Lik. -30511.057 -4641.157
F 105.113 110.575
RMSE 4512.60 2319.36 1.06 1.27
Data %>%
  mutate(lambexp=ifelse(lambexp<9.2, lambexp, 9.2)) -> Data1 # chuyển dữ liệu với các quan sát quá lớn về giới hạn chặn
tobit3 <- vglm( lambexp ~ age +female+ educ+ blhisp+ totchr+ ins, tobit(Lower = 0, Upper=9.2),data=Data1)
tobit3 <- vglm( lambexp ~ age +female+ educ+ blhisp+ totchr+ ins, tobit( Upper=9.2),data=Data1)
models <- list("Tobit 1"= tobit1, "OLS 1"=OLS1, "Tobit 2"=tobit2, "OLS 2"=OLS2, "Tobit 3"=tobit3)
modelsummary::modelsummary(models, 
                           statistic = c({"std.error"}),
                           stars =  c('*' = .1,"**"=0.05, '***' = .01))
tinytable_c95iwxp8c1b69ueioa0v
Tobit 1 OLS 1 Tobit 2 OLS 2 Tobit 3
* p < 0.1, ** p < 0.05, *** p < 0.01
(Intercept) × 1 -1882.785*** 4.908*** 4.895***
(329.555) (0.168) (0.169)
(Intercept) × 2 7.854*** 0.238*** 0.243***
(0.016) (0.013) (0.014)
age 314.166*** 250.228*** 0.217*** 0.217*** 0.219***
(44.113) (37.391) (0.022) (0.022) (0.022)
female 685.000*** 374.975*** 0.380*** 0.379*** 0.381***
(96.048) (81.262) (0.049) (0.049) (0.049)
educ 70.866*** 33.566** 0.022** 0.022** 0.022**
(19.222) (16.129) (0.010) (0.010) (0.010)
blhisp -530.327*** -310.734*** -0.238*** -0.239*** -0.238***
(108.104) (90.567) (0.055) (0.055) (0.056)
totchr 1244.618*** 1076.201*** 0.562*** 0.562*** 0.569***
(61.746) (53.988) (0.030) (0.031) (0.031)
ins -167.498* -237.631*** -0.021 -0.021 -0.022
(100.031) (84.737) (0.050) (0.050) (0.050)
(Intercept) -606.593** 4.908***
(274.240) (0.168)
Num.Obs. 3328 3328 2802 2802 2802
R2 0.160 0.192
R2 Adj. 0.158 0.190
AIC 52734.8 61038.1 9301.0 9298.3 9289.1
BIC 52783.7 61087.0 9348.5 9345.8 9336.6
Log.Lik. -30511.057 -4641.157
F 105.113 110.575
RMSE 4512.60 2319.36 1.06 1.27 1.07

Tính chỉ số R2

Data1<- Data1%>%
  na.omit(lambexp)
Data1$yhat <- fitted(tobit2)[,1]
Data1$rr <- resid(tobit2, type = "response")
Data1$rp <- resid(tobit2, type = "pearson")[,1]

par(mfcol = c(2, 3))
R2 = with(Data1, cor(yhat, lambexp))^2
R2
[1] 0.1903067
with(Data1, {
  plot(yhat, rr, main = "Fitted vs Residuals")
  qqnorm(rr)
  plot(yhat, rp, main = "Fitted vs Pearson Residuals")
  qqnorm(rp)
  plot(lambexp, rp, main = "Actual vs Pearson Residuals")
  plot(lambexp, yhat, main = "Actual vs Fitted")
})