# Loading the libraries
library(tidyverse)
library(tidyr)
library(mlogit)
library(stargazer)
library(xtable)
library(ggplot2)
library(haven)
library(kableExtra)
# Loading the dataset
Heating_data <- read_dta("/Users/bastienpatras/Desktop/ENS - Master of Economics/Econometrics/Tutorial 2-20210221/Heating.dta")
\(~\)
Once we have the correct dataset we can then convert it to long format for further manipulation :
\(~\)
# Converting to long format with dplyr::pivot_longer()
Heating_data_long <- Heating_data %>%
pivot_longer(cols = c("ic_gc", "ic_gr", "ic_ec", "ic_er", "ic_hp",
"oc_gc", "oc_gr", "oc_ec", "oc_er", "oc_hp",
"pb_gc", "pb_gr", "pb_ec", "pb_er", "pb_hp"),
names_to = c(".value", "alternative"),
names_sep = "_") %>%
mutate(mode_1 = dplyr::case_when(mode == '1' ~ 'gc', #Creating mode_1 variable
mode == '2' ~ 'gr',
mode == '3' ~ 'ec',
mode == '4' ~ 'er',
mode == '5' ~ 'hp'),
choice = ifelse(mode_1 == alternative, #Creating choice variable
1,
0)
) %>%
select(idcase, mode_1, choice, alternative, everything())
head(Heating_data_long) %>% kbl(booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| idcase | mode_1 | choice | alternative | mode | income | agehed | rooms | region | ic | oc | pb |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | gc | 1 | gc | 1 | 7 | 25 | 6 | 4 | 866.00 | 199.69 | 4.336722 |
| 1 | gc | 0 | gr | 1 | 7 | 25 | 6 | 4 | 962.64 | 151.72 | 6.344846 |
| 1 | gc | 0 | ec | 1 | 7 | 25 | 6 | 4 | 859.90 | 553.34 | 1.554017 |
| 1 | gc | 0 | er | 1 | 7 | 25 | 6 | 4 | 995.76 | 505.60 | 1.969462 |
| 1 | gc | 0 | hp | 1 | 7 | 25 | 6 | 4 | 1135.50 | 237.88 | 4.773415 |
| 2 | gc | 1 | gc | 1 | 5 | 60 | 5 | 2 | 727.93 | 168.66 | 4.315961 |
\(~\)
We can then produce descriptive statistics on long data :
\(~\)
# Descriptive statistics
StatDesc <- Heating_data_long %>%
dplyr::group_by(alternative) %>%
dplyr::summarize(mean_oc = mean(oc),
sd_oc = sd(oc),
mean_ic = mean(ic),
sd_ic = sd(ic))
# Output
StatDesc %>% kbl(caption = "Descriptive statistics by alternatives",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| alternative | mean_oc | sd_oc | mean_ic | sd_ic |
|---|---|---|---|---|
| ec | 476.8030 | 73.15333 | 824.5435 | 125.2739 |
| er | 429.7299 | 65.79263 | 983.9280 | 147.1554 |
| gc | 172.1158 | 25.93978 | 776.8266 | 115.5630 |
| gr | 154.4714 | 22.88574 | 921.7702 | 138.0867 |
| hp | 219.2993 | 32.96955 | 1046.4813 | 156.7030 |
\(~\)
\(~\)
# Summary statistics for all alternatives
SummStat_AllAlternatives <- Heating_data_long %>%
dplyr::group_by(alternative) %>%
dplyr::summarize(dplyr::across(
.cols = c(ic, oc),
.fns = list(n = ~n(),
Mean = ~round(mean(.x, na.rm = T)),
SD = ~round(var(.x, na.rm = T)^0.5),
Min = ~round(min(.x, na.rm = T)),
Max = ~round(max(.x, na.rm = T)),
Med = ~round(median(.x, na.rm = T))),
.names = "{.col}_{.fn}")
)
# Output
SummStat_AllAlternatives %>% kbl(caption = "Summary statistics for all alternatives",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| alternative | ic_n | ic_Mean | ic_SD | ic_Min | ic_Max | ic_Med | oc_n | oc_Mean | oc_SD | oc_Min | oc_Max | oc_Med |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ec | 900 | 825 | 125 | 470 | 1230 | 825 | 900 | 477 | 73 | 237 | 705 | 480 |
| er | 900 | 984 | 147 | 547 | 1496 | 990 | 900 | 430 | 66 | 180 | 664 | 431 |
| gc | 900 | 777 | 116 | 432 | 1159 | 779 | 900 | 172 | 26 | 84 | 248 | 172 |
| gr | 900 | 922 | 138 | 575 | 1344 | 924 | 900 | 154 | 23 | 78 | 228 | 154 |
| hp | 900 | 1046 | 157 | 532 | 1680 | 1047 | 900 | 219 | 33 | 121 | 319 | 221 |
\(~\)
# Loading palettes
cbPalette <- c("#999999",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7",
"#000000")
# Building plot_1
Plot_1 <- ggplot(Heating_data_long,
aes(x = factor(alternative), y = oc, color = factor(choice))) +
theme_bw() +
labs(x=expression(paste("Heating alternatives")),y=expression(paste("Operating costs"))) +
geom_jitter(alpha = 0.5)+
geom_violin(size=1.25, alpha=0)+
theme(legend.position = "bottom") +
scale_color_manual(name = "Houses chosing the heating alternative",
values = cbPalette[c(2,4)],
labels = c("No", "Yes"))+
guides(color = guide_legend(ncol=1, direction = "vertical"))
# Output
show(Plot_1)
\(~\)
\(~\)
# Loading palettes
cbPalette <- c("#999999",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#0072B2",
"#D55E00",
"#CC79A7",
"#000000")
# Building plot_2
plot_2 <- ggplot(Heating_data_long,
aes(x = factor(alternative), y = ic, color = factor(choice))) +
theme_bw() +
labs(x=expression(paste("Heating alternatives")),y=expression(paste("Installation costs"))) +
geom_jitter(alpha = 0.5)+
geom_violin(size=1.25, alpha=0)+
theme(legend.position = "bottom") +
scale_color_manual(name = "Houses chosing the heating alternative",
values = cbPalette[c(2,4)],
labels = c("No", "Yes"))+
guides(color = guide_legend(ncol=1, direction = "vertical"))
# Output
show(plot_2)
\(~\)
\(~\)
H <- mlogit.data(Heating_data_long,
id.var = "idcase",
shape = "long",
choice = "choice",
alt.var = "alternative")
\(~\)
\(~\)
# Logit regression
m <- mlogit(formula = choice~ic+oc|0,
data = H)
# Output
summary(m)
##
## Call:
## mlogit(formula = choice ~ ic + oc | 0, data = H, method = "nr")
##
## Frequencies of alternatives:choice
## ec er gc gr hp
## 0.071111 0.093333 0.636667 0.143333 0.055556
##
## nr method
## 4 iterations, 0h:0m:0s
## g'(-H)^-1g = 1.56E-07
## gradient close to zero
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## ic -0.00623187 0.00035277 -17.665 < 2.2e-16 ***
## oc -0.00458008 0.00032216 -14.217 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1095.2
\(~\)
1. Do the estimated coefficients have the expected signs?
\(~\)
As we have seen above, the descriptive statistics were implying a negative relationship between cost and effectively chosen alternatives. As we see in the previous table the coefficients are negative meaning that as the cost of a system rises, the probability of selecting a given alternative decreases which confirms our expectations regarding the signs from descriptive statistics.
\(~\)
2. Are both coefficients significantly different from zero?
\(~\)
As we can see in the table the coefficient are both highly significant as their respective p-value are way above 1.96, which is the critical level for 95% confidence level.
\(~\)
# Computing the magnitudes :
Marginal <- effects(m, "oc")
# Output
Marginal %>% kbl(caption = "Magnitudes : Marginal effects",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| ec | er | gc | gr | hp | |
|---|---|---|---|---|---|
| ec | -0.0004086 | 0.0000206 | 0.0002441 | 0.0001073 | 0.0000366 |
| er | 0.0000206 | -0.0001989 | 0.0001122 | 0.0000493 | 0.0000168 |
| gc | 0.0002441 | 0.0001122 | -0.0011383 | 0.0005829 | 0.0001991 |
| gr | 0.0001073 | 0.0000493 | 0.0005829 | -0.0008269 | 0.0000875 |
| hp | 0.0000366 | 0.0000168 | 0.0001991 | 0.0000875 | -0.0003401 |
\(~\)
3. How closely do the average probabilities match the shares of customers choosing each alternative?
\(~\)
# Computing matching :
Matchinng <- apply(X = fitted(m, outcome = F), MARGIN = 2, FUN = mean)
# Output :
Matchinng %>% kbl(caption = "Matchinng : Average probabilities with customers choosing",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| x | |
|---|---|
| ec | 0.1041306 |
| er | 0.0514148 |
| gc | 0.5169565 |
| gr | 0.2403090 |
| hp | 0.0871891 |
\(~\)
The average probabilities does not match the shares of customers choosing each alternative. As we have seen from the descriptive statistics above 63.67% of consumers have chosen “gc” and yet the model produces an average probability of only 51.695%. The other alternatives are also fairly poorly predicted.
\(~\)
4. What is the estimated wtp from this model? Is it reasonable in magnitude?
\(~\)
The model is given such that :
\[U = \beta_{oc}oc + \beta_{ic}ic\]
Thus fully differentiating equation \((1)\) and setting \(\text{d}U=0\) yields:
\[\begin{split} \text{d}U &= \beta_{oc}\text{d}oc + \beta_{ic}\text{d}ic \\ \frac{\text{d}oc}{\text{d}ic} &= -\frac{\beta_{oc}}{\beta_{ic}} \end{split}\]
\(~\)
The willingness to pay can be computed by dividing the coef of “oc” to “ic” :
\[ \text{Willingness to pay} = \frac{\text{coef}(oc)}{\text{coef}(ic)} = 0.7349453 \]
wtp <- coef(m)["oc"] / coef(m)["ic"]
wtp
## oc
## 0.7349453
\(~\)
5. What value of \(r\) is implied by the estimated \(wtp\) that you calculated in question 4.(d)? Is this reasonable?
\(~\)
We know that :
Thus we can derive an expression of U such that :
\[ U = a\text{IC} + \frac{a}{r}\times\text{OC} \]
We derive the estimates of \(a\) and \(\frac{a}{r}\) :
\[\begin{cases} a = -0.00623 \\ \frac{a}{r} = -0.00457 \end{cases}\]
Thus the discount rate is given by :
\[ r = \frac{a}{\frac{a}{r}} = 1.36 = 136\% \]
The level of this discount rate 136% is unreasonable as it is far too high.
\(~\)
\(~\)
# Running logit regression with alternative specific constants:
mc <- mlogit(formula = choice ~ ic + oc,
data = H,
reflevel = "hp")
# Output summary
summary(mc)
##
## Call:
## mlogit(formula = choice ~ ic + oc, data = H, reflevel = "hp",
## method = "nr")
##
## Frequencies of alternatives:choice
## hp ec er gc gr
## 0.055556 0.071111 0.093333 0.636667 0.143333
##
## nr method
## 6 iterations, 0h:0m:0s
## g'(-H)^-1g = 9.58E-06
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):ec 1.65884594 0.44841936 3.6993 0.0002162 ***
## (Intercept):er 1.85343697 0.36195509 5.1206 3.045e-07 ***
## (Intercept):gc 1.71097930 0.22674214 7.5459 4.485e-14 ***
## (Intercept):gr 0.30826328 0.20659222 1.4921 0.1356640
## ic -0.00153315 0.00062086 -2.4694 0.0135333 *
## oc -0.00699637 0.00155408 -4.5019 6.734e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1008.2
## McFadden R^2: 0.013691
## Likelihood ratio test : chisq = 27.99 (p.value = 8.3572e-07)
\(~\)
1. How well do the estimated probabilities match the shares of customers choosing each alternative?
\(~\)
# Matching efficiency
Fitted_output <- apply(fitted(mc, outcome = FALSE), 2, mean)
# Output
Fitted_output %>% kbl(caption = "Matchinng efficiency with alternative specific constants:",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| x | |
|---|---|
| hp | 0.0555556 |
| ec | 0.0711111 |
| er | 0.0933333 |
| gc | 0.6366667 |
| gr | 0.1433333 |
\(~\)
In this case (with alternative specific constants) the model is able to perfectly fit the data as shown by the table below.
\(~\)
2. Calculate the \(wtp\) and discount rate \(r\) that is implied by the estimates. Are these reasonable?
\(~\)
# Compute the new wtp
wtp_2 <- coef(mc)["oc"]/coef(mc)["ic"]
wtp_2
## oc
## 4.563385
# Compute the new r
r <- 1/wtp_2
r
## oc
## 0.2191356
\(~\)
\(~\)
1. Entering : installation cost divided by income, instead of installation cost.
\(~\)
# Model with sociodemographic variables entering
Logit_Income <- mlogit(choice~oc+I(ic/income), H)
# Output
summary(Logit_Income)
##
## Call:
## mlogit(formula = choice ~ oc + I(ic/income), data = H, method = "nr")
##
## Frequencies of alternatives:choice
## ec er gc gr hp
## 0.071111 0.093333 0.636667 0.143333 0.055556
##
## nr method
## 6 iterations, 0h:0m:0s
## g'(-H)^-1g = 1.03E-05
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):er 0.0639934 0.1944893 0.3290 0.742131
## (Intercept):gc 0.0563481 0.4650251 0.1212 0.903555
## (Intercept):gr -1.4653063 0.5033845 -2.9109 0.003604 **
## (Intercept):hp -1.8700773 0.4364248 -4.2850 1.827e-05 ***
## oc -0.0071066 0.0015518 -4.5797 4.657e-06 ***
## I(ic/income) -0.0027658 0.0018944 -1.4600 0.144298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1010.2
## McFadden R^2: 0.011765
## Likelihood ratio test : chisq = 24.052 (p.value = 5.9854e-06)
# Matching efficiency with sociodemographics entering :
Fitted_output_Income <- apply(fitted(Logit_Income, outcome = FALSE), 2, mean)
Fitted_output_Income %>% kbl(caption = "Matchinng efficiency with sociodemographics",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| x | |
|---|---|
| ec | 0.0711111 |
| er | 0.0933333 |
| gc | 0.6366667 |
| gr | 0.1433333 |
| hp | 0.0555556 |
The transformation seems to make the model worse as the Log-Likelihood is getting lower and that the coefficient on installation cost is getting statistically insignificant.
\(~\)
2. Entering : alternative-specific income effects
\(~\)
# Model_2 with alternative-specific income effects entering
Logit_Income_2 <- mlogit(choice~oc+ic|income, H, reflevel="hp")
# Output
summary(Logit_Income_2)
##
## Call:
## mlogit(formula = choice ~ oc + ic | income, data = H, reflevel = "hp",
## method = "nr")
##
## Frequencies of alternatives:choice
## hp ec er gc gr
## 0.055556 0.071111 0.093333 0.636667 0.143333
##
## nr method
## 6 iterations, 0h:0m:0s
## g'(-H)^-1g = 6.27E-06
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error z-value Pr(>|z|)
## (Intercept):ec 1.95445797 0.70353833 2.7780 0.0054688 **
## (Intercept):er 2.30560852 0.62390478 3.6954 0.0002195 ***
## (Intercept):gc 2.05517018 0.48639682 4.2253 2.386e-05 ***
## (Intercept):gr 1.14158139 0.51828845 2.2026 0.0276231 *
## oc -0.00696000 0.00155383 -4.4792 7.491e-06 ***
## ic -0.00153534 0.00062251 -2.4664 0.0136486 *
## income:ec -0.06362917 0.11329865 -0.5616 0.5743846
## income:er -0.09685787 0.10755423 -0.9005 0.3678281
## income:gc -0.07178917 0.08878777 -0.8085 0.4187752
## income:gr -0.17981159 0.10012691 -1.7958 0.0725205 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -1005.9
## McFadden R^2: 0.01598
## Likelihood ratio test : chisq = 32.67 (p.value = 1.2134e-05)
# Matching efficiency with alternative-specific income effects entering :
Fitted_output_Income_2 <- apply(fitted(Logit_Income_2, outcome = FALSE), 2, mean)
Fitted_output_Income_2 %>% kbl(caption = "Matchinng efficiency with alternative-specific income effects entering ",
booktabs = T) %>%
kable_styling(latex_options = "striped", full_width = T)
| x | |
|---|---|
| hp | 0.0555556 |
| ec | 0.0711111 |
| er | 0.0933333 |
| gc | 0.6366667 |
| gr | 0.1433333 |
\(~\)
The model suggests that a increase in income tends to increase the probability the the consumers select the heat pump relatively to other alternatives. Additionally, Model_2 indicates that the income does not enter significantly(except for “gr” but for low level of significance). Therefore it seems that the income has no effect on the probability of choosing an alternative.
\(~\)