Logit model estimation - Kenneth Train’s exercises

1 - First we load the libraries and the correct dataset :

# Loading the libraries

library(tidyverse)
library(tidyr)
library(mlogit) 
library(stargazer) 
library(xtable) 
library(ggplot2)
library(haven)
library(kableExtra)

# Loading the dataset

Heating_data <- read_dta("/Users/bastienpatras/Desktop/ENS - Master of Economics/Econometrics/Tutorial 2-20210221/Heating.dta")

$~$

Once we have the correct dataset we can then convert it to long format for further manipulation :

$~$

# Converting to long format with dplyr::pivot_longer()

Heating_data_long <- Heating_data %>%
  pivot_longer(cols = c("ic_gc", "ic_gr", "ic_ec", "ic_er", "ic_hp",
                        "oc_gc", "oc_gr", "oc_ec", "oc_er", "oc_hp",
                        "pb_gc", "pb_gr", "pb_ec", "pb_er", "pb_hp"),
               names_to = c(".value", "alternative"), 
               names_sep = "_") %>%
  mutate(mode_1 = dplyr::case_when(mode == '1' ~ 'gc',  #Creating mode_1 variable
                                   mode == '2' ~ 'gr',
                                   mode == '3' ~ 'ec',
                                   mode == '4' ~ 'er',
                                   mode == '5' ~ 'hp'),
         choice = ifelse(mode_1 == alternative,         #Creating choice variable
                         1,
                         0)
  ) %>%
  select(idcase, mode_1, choice, alternative, everything())

head(Heating_data_long) %>% kbl(booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

idcase	mode_1	choice	alternative	mode	income	agehed	rooms	region	ic	oc	pb
1	gc	1	gc	1	7	25	6	4	866.00	199.69	4.336722
1	gc	0	gr	1	7	25	6	4	962.64	151.72	6.344846
1	gc	0	ec	1	7	25	6	4	859.90	553.34	1.554017
1	gc	0	er	1	7	25	6	4	995.76	505.60	1.969462
1	gc	0	hp	1	7	25	6	4	1135.50	237.88	4.773415
2	gc	1	gc	1	5	60	5	2	727.93	168.66	4.315961

$~$

We can then produce descriptive statistics on long data :

$~$

# Descriptive statistics

StatDesc <- Heating_data_long %>%
            dplyr::group_by(alternative) %>%
            dplyr::summarize(mean_oc = mean(oc),
            sd_oc = sd(oc), 
            mean_ic = mean(ic),
            sd_ic = sd(ic))

# Output

StatDesc %>% kbl(caption = "Descriptive statistics by alternatives", 
                 booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Descriptive statistics by alternatives
alternative	mean_oc	sd_oc	mean_ic	sd_ic
ec	476.8030	73.15333	824.5435	125.2739
er	429.7299	65.79263	983.9280	147.1554
gc	172.1158	25.93978	776.8266	115.5630
gr	154.4714	22.88574	921.7702	138.0867
hp	219.2993	32.96955	1046.4813	156.7030

$~$

2.1 - Produce descriptive statistics regarding the installation and operating costs for every alternatives:

$~$

# Summary statistics for all alternatives

SummStat_AllAlternatives <- Heating_data_long %>%
  dplyr::group_by(alternative) %>%
  dplyr::summarize(dplyr::across(
    .cols = c(ic, oc),
    .fns = list(n = ~n(), 
                Mean = ~round(mean(.x, na.rm = T)), 
                SD = ~round(var(.x, na.rm = T)^0.5),
                Min = ~round(min(.x, na.rm = T)),
                Max = ~round(max(.x, na.rm = T)),
                Med = ~round(median(.x, na.rm = T))),
    .names = "{.col}_{.fn}")
  ) 

# Output 

SummStat_AllAlternatives %>% kbl(caption = "Summary statistics for all alternatives", 
                                 booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Summary statistics for all alternatives
alternative	ic_n	ic_Mean	ic_SD	ic_Min	ic_Max	ic_Med	oc_n	oc_Mean	oc_SD	oc_Min	oc_Max	oc_Med
ec	900	825	125	470	1230	825	900	477	73	237	705	480
er	900	984	147	547	1496	990	900	430	66	180	664	431
gc	900	777	116	432	1159	779	900	172	26	84	248	172
gr	900	922	138	575	1344	924	900	154	23	78	228	154
hp	900	1046	157	532	1680	1047	900	219	33	121	319	221

2.2 - Then produce a table of descriptive statistics which gives the number of time and the share of the chosen heating alternatives, as well as descriptive statistics by chosen alternatives regarding installation and operating costs:

$~$

# Summary statistics for chosen alternatives

SummStat_ChosenAltern <- Heating_data_long %>%
  filter(mode_1 == alternative) %>%
  group_by(mode_1) %>%
  summarize(across(
    .cols = c(ic, oc),
    .fns = list(n = ~n(),
                share = ~ round(n() / length(unique(Heating_data_long$idcase))*100),
                Mean = ~round(mean(.x, na.rm = T)), 
                SD = ~round(var(.x, na.rm = T)^0.5),
                Min = ~round(min(.x, na.rm = T)),
                Max = ~round(max(.x, na.rm = T)),
                Med = ~round(median(.x, na.rm = T))),
    .names = "{.col}_{.fn}"))

# Output 

SummStat_ChosenAltern %>% kbl(caption = "Summary statistics for chosen alternatives", 
                              booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Summary statistics for chosen alternatives
mode_1	ic_n	ic_share	ic_Mean	ic_SD	ic_Min	ic_Max	ic_Med	oc_n	oc_share	oc_Mean	oc_SD	oc_Min	oc_Max	oc_Med
ec	64	7	789	137	470	1116	782	64	7	451	65	309	607	443
er	84	9	928	136	617	1212	939	84	9	407	61	233	527	418
gc	573	64	781	118	432	1159	780	573	64	173	25	84	248	173
gr	129	14	933	139	600	1264	932	129	14	155	25	88	226	158
hp	50	6	1027	161	653	1381	1041	50	6	213	36	136	289	211

$~$

As we can see the distribution of effectively chosen alternatives is centered around “gc” as 63% of people have chosen “gc” when they were proposed each alternatives. We can also observe that the average cost in both “ic” and “oc” is the lowest for the “gc” alternative. Having a look at other alternatives provide us with an idea of an underlying negative relationship between cost and effectively chosen alternatives as the most expensive alternatives tend to be avoided.

$~$

2.3 - Graph for densities of operating costs by alternative :

$~$

# Loading palettes

cbPalette <- c("#999999", 
               "#E69F00", 
               "#56B4E9", 
               "#009E73", 
               "#F0E442", 
               "#0072B2", 
               "#D55E00", 
               "#CC79A7", 
               "#000000")

# Building plot_1

Plot_1 <- ggplot(Heating_data_long, 
       aes(x = factor(alternative), y = oc, color = factor(choice))) + 
  theme_bw() +
  labs(x=expression(paste("Heating alternatives")),y=expression(paste("Operating costs"))) +
  geom_jitter(alpha = 0.5)+
  geom_violin(size=1.25, alpha=0)+  
  theme(legend.position = "bottom") +
  scale_color_manual(name = "Houses chosing the heating alternative", 
                     values = cbPalette[c(2,4)], 
                     labels = c("No", "Yes"))+
  guides(color = guide_legend(ncol=1, direction = "vertical"))

# Output

show(Plot_1)

$~$

2.4 - Graph for densities of installation costs by alternative :

$~$

# Loading palettes

cbPalette <- c("#999999", 
               "#E69F00", 
               "#56B4E9", 
               "#009E73", 
               "#F0E442", 
               "#0072B2", 
               "#D55E00", 
               "#CC79A7", 
               "#000000")

# Building plot_2

plot_2 <- ggplot(Heating_data_long, 
       aes(x = factor(alternative), y = ic, color = factor(choice))) + 
  theme_bw() +
  labs(x=expression(paste("Heating alternatives")),y=expression(paste("Installation costs"))) +
  geom_jitter(alpha = 0.5)+
  geom_violin(size=1.25, alpha=0)+  
  theme(legend.position = "bottom") +
  scale_color_manual(name = "Houses chosing the heating alternative", 
                     values = cbPalette[c(2,4)], 
                     labels = c("No", "Yes"))+
  guides(color = guide_legend(ncol=1, direction = "vertical"))

# Output

show(plot_2)

$~$

3 - Use the mlogit.data command to specify to R the “long” shape of your data, the “choice” variable giving the chosen heating system, the identifier variable (house id), and the variable which measures the alternative heating system. :

$~$

H <- mlogit.data(Heating_data_long, 
                 id.var = "idcase",
                 shape = "long", 
                 choice = "choice", 
                 alt.var = "alternative")

$~$

4 - Estimate a logit model where the heating system decision only depends on installation cost and operating cost, without intercepts :

$~$

# Logit regression 

m <- mlogit(formula = choice~ic+oc|0,
            data = H)

# Output 

summary(m)

## 
## Call:
## mlogit(formula = choice ~ ic + oc | 0, data = H, method = "nr")
## 
## Frequencies of alternatives:choice
##       ec       er       gc       gr       hp 
## 0.071111 0.093333 0.636667 0.143333 0.055556 
## 
## nr method
## 4 iterations, 0h:0m:0s 
## g'(-H)^-1g = 1.56E-07 
## gradient close to zero 
## 
## Coefficients :
##       Estimate  Std. Error z-value  Pr(>|z|)    
## ic -0.00623187  0.00035277 -17.665 < 2.2e-16 ***
## oc -0.00458008  0.00032216 -14.217 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -1095.2

$~$

1. Do the estimated coefficients have the expected signs?

$~$

As we have seen above, the descriptive statistics were implying a negative relationship between cost and effectively chosen alternatives. As we see in the previous table the coefficients are negative meaning that as the cost of a system rises, the probability of selecting a given alternative decreases which confirms our expectations regarding the signs from descriptive statistics.

$~$

2. Are both coefficients significantly different from zero?

$~$

As we can see in the table the coefficient are both highly significant as their respective p-value are way above 1.96, which is the critical level for 95% confidence level.

$~$

# Computing the magnitudes : 

Marginal <- effects(m, "oc")

# Output

Marginal %>% kbl(caption = "Magnitudes : Marginal effects", 
                              booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Magnitudes : Marginal effects
	ec	er	gc	gr	hp
ec	-0.0004086	0.0000206	0.0002441	0.0001073	0.0000366
er	0.0000206	-0.0001989	0.0001122	0.0000493	0.0000168
gc	0.0002441	0.0001122	-0.0011383	0.0005829	0.0001991
gr	0.0001073	0.0000493	0.0005829	-0.0008269	0.0000875
hp	0.0000366	0.0000168	0.0001991	0.0000875	-0.0003401

$~$

3. How closely do the average probabilities match the shares of customers choosing each alternative?

$~$

# Computing matching :

Matchinng <- apply(X = fitted(m, outcome = F), MARGIN = 2, FUN = mean)

# Output :

Matchinng %>% kbl(caption = "Matchinng : Average probabilities with customers choosing", 
                              booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng : Average probabilities with customers choosing
	x
ec	0.1041306
er	0.0514148
gc	0.5169565
gr	0.2403090
hp	0.0871891

$~$

The average probabilities does not match the shares of customers choosing each alternative. As we have seen from the descriptive statistics above 63.67% of consumers have chosen “gc” and yet the model produces an average probability of only 51.695%. The other alternatives are also fairly poorly predicted.

$~$

4. What is the estimated wtp from this model? Is it reasonable in magnitude?

$~$

The model is given such that :

\[U = \beta_{oc}oc + \beta_{ic}ic\]

Thus fully differentiating equation $(1)$ and setting $\text{d}U=0$ yields:

\[\begin{split} \text{d}U &= \beta_{oc}\text{d}oc + \beta_{ic}\text{d}ic \\ \frac{\text{d}oc}{\text{d}ic} &= -\frac{\beta_{oc}}{\beta_{ic}} \end{split}\]

$~$

The willingness to pay can be computed by dividing the coef of “oc” to “ic” :

\[ \text{Willingness to pay} = \frac{\text{coef}(oc)}{\text{coef}(ic)} = 0.7349453 \]

wtp <- coef(m)["oc"] / coef(m)["ic"]

wtp

##        oc 
## 0.7349453

$~$

5. What value of $r$ is implied by the estimated $wtp$ that you calculated in question 4.(d)? Is this reasonable?

$~$

We know that :

$U = a\text{LC}$ with $\text{LC}$ being the lifecycle cost
The lifecycle cost can be defined as the sum of installation cost and the present value of operating costs: $\text{LC} = \text{IC} + \frac{1}{r}\times\text{OC}$

Thus we can derive an expression of U such that :

\[ U = a\text{IC} + \frac{a}{r}\times\text{OC} \]

We derive the estimates of $a$ and $\frac{a}{r}$ :

\[\begin{cases} a = -0.00623 \\ \frac{a}{r} = -0.00457 \end{cases}\]

Thus the discount rate is given by :

\[ r = \frac{a}{\frac{a}{r}} = 1.36 = 136\% \]

The level of this discount rate 136% is unreasonable as it is far too high.

$~$

5 - Add alternative-specific constants to the model

$~$

# Running logit regression with alternative specific constants:

mc <- mlogit(formula = choice ~ ic + oc,
             data = H,
             reflevel = "hp")

# Output summary

summary(mc)

## 
## Call:
## mlogit(formula = choice ~ ic + oc, data = H, reflevel = "hp", 
##     method = "nr")
## 
## Frequencies of alternatives:choice
##       hp       ec       er       gc       gr 
## 0.055556 0.071111 0.093333 0.636667 0.143333 
## 
## nr method
## 6 iterations, 0h:0m:0s 
## g'(-H)^-1g = 9.58E-06 
## successive function values within tolerance limits 
## 
## Coefficients :
##                   Estimate  Std. Error z-value  Pr(>|z|)    
## (Intercept):ec  1.65884594  0.44841936  3.6993 0.0002162 ***
## (Intercept):er  1.85343697  0.36195509  5.1206 3.045e-07 ***
## (Intercept):gc  1.71097930  0.22674214  7.5459 4.485e-14 ***
## (Intercept):gr  0.30826328  0.20659222  1.4921 0.1356640    
## ic             -0.00153315  0.00062086 -2.4694 0.0135333 *  
## oc             -0.00699637  0.00155408 -4.5019 6.734e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -1008.2
## McFadden R^2:  0.013691 
## Likelihood ratio test : chisq = 27.99 (p.value = 8.3572e-07)

$~$

1. How well do the estimated probabilities match the shares of customers choosing each alternative?

$~$

# Matching efficiency

Fitted_output <- apply(fitted(mc, outcome = FALSE), 2, mean)

# Output

Fitted_output %>% kbl(caption = "Matchinng efficiency with alternative specific constants:", 
                              booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng efficiency with alternative specific constants:
	x
hp	0.0555556
ec	0.0711111
er	0.0933333
gc	0.6366667
gr	0.1433333

$~$

In this case (with alternative specific constants) the model is able to perfectly fit the data as shown by the table below.

$~$

2. Calculate the $wtp$ and discount rate $r$ that is implied by the estimates. Are these reasonable?

$~$

# Compute the new wtp

wtp_2 <- coef(mc)["oc"]/coef(mc)["ic"]
wtp_2

##       oc 
## 4.563385

# Compute the new r

r <- 1/wtp_2
r

##        oc 
## 0.2191356

$~$

6 - Models with sociodemographic variables entering

$~$

1. Entering : installation cost divided by income, instead of installation cost.

$~$

# Model with sociodemographic variables entering 

Logit_Income <- mlogit(choice~oc+I(ic/income), H)

# Output

summary(Logit_Income)

## 
## Call:
## mlogit(formula = choice ~ oc + I(ic/income), data = H, method = "nr")
## 
## Frequencies of alternatives:choice
##       ec       er       gc       gr       hp 
## 0.071111 0.093333 0.636667 0.143333 0.055556 
## 
## nr method
## 6 iterations, 0h:0m:0s 
## g'(-H)^-1g = 1.03E-05 
## successive function values within tolerance limits 
## 
## Coefficients :
##                  Estimate Std. Error z-value  Pr(>|z|)    
## (Intercept):er  0.0639934  0.1944893  0.3290  0.742131    
## (Intercept):gc  0.0563481  0.4650251  0.1212  0.903555    
## (Intercept):gr -1.4653063  0.5033845 -2.9109  0.003604 ** 
## (Intercept):hp -1.8700773  0.4364248 -4.2850 1.827e-05 ***
## oc             -0.0071066  0.0015518 -4.5797 4.657e-06 ***
## I(ic/income)   -0.0027658  0.0018944 -1.4600  0.144298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -1010.2
## McFadden R^2:  0.011765 
## Likelihood ratio test : chisq = 24.052 (p.value = 5.9854e-06)

# Matching efficiency with sociodemographics entering :

Fitted_output_Income <- apply(fitted(Logit_Income, outcome = FALSE), 2, mean)

Fitted_output_Income %>% kbl(caption = "Matchinng efficiency with sociodemographics",
                             booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng efficiency with sociodemographics
	x
ec	0.0711111
er	0.0933333
gc	0.6366667
gr	0.1433333
hp	0.0555556

The transformation seems to make the model worse as the Log-Likelihood is getting lower and that the coefficient on installation cost is getting statistically insignificant.

$~$

2. Entering : alternative-specific income effects

$~$

# Model_2 with alternative-specific income effects entering 

Logit_Income_2 <- mlogit(choice~oc+ic|income, H, reflevel="hp")

# Output

summary(Logit_Income_2)

## 
## Call:
## mlogit(formula = choice ~ oc + ic | income, data = H, reflevel = "hp", 
##     method = "nr")
## 
## Frequencies of alternatives:choice
##       hp       ec       er       gc       gr 
## 0.055556 0.071111 0.093333 0.636667 0.143333 
## 
## nr method
## 6 iterations, 0h:0m:0s 
## g'(-H)^-1g = 6.27E-06 
## successive function values within tolerance limits 
## 
## Coefficients :
##                   Estimate  Std. Error z-value  Pr(>|z|)    
## (Intercept):ec  1.95445797  0.70353833  2.7780 0.0054688 ** 
## (Intercept):er  2.30560852  0.62390478  3.6954 0.0002195 ***
## (Intercept):gc  2.05517018  0.48639682  4.2253 2.386e-05 ***
## (Intercept):gr  1.14158139  0.51828845  2.2026 0.0276231 *  
## oc             -0.00696000  0.00155383 -4.4792 7.491e-06 ***
## ic             -0.00153534  0.00062251 -2.4664 0.0136486 *  
## income:ec      -0.06362917  0.11329865 -0.5616 0.5743846    
## income:er      -0.09685787  0.10755423 -0.9005 0.3678281    
## income:gc      -0.07178917  0.08878777 -0.8085 0.4187752    
## income:gr      -0.17981159  0.10012691 -1.7958 0.0725205 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -1005.9
## McFadden R^2:  0.01598 
## Likelihood ratio test : chisq = 32.67 (p.value = 1.2134e-05)

# Matching efficiency with alternative-specific income effects entering :

Fitted_output_Income_2 <- apply(fitted(Logit_Income_2, outcome = FALSE), 2, mean)

Fitted_output_Income_2 %>% kbl(caption = "Matchinng efficiency with alternative-specific income effects entering ",
                             booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng efficiency with alternative-specific income effects entering
	x
hp	0.0555556
ec	0.0711111
er	0.0933333
gc	0.6366667
gr	0.1433333

$~$

The model suggests that a increase in income tends to increase the probability the the consumers select the heat pump relatively to other alternatives. Additionally, Model_2 indicates that the income does not enter significantly(except for “gr” but for low level of significance). Therefore it seems that the income has no effect on the probability of choosing an alternative.

$~$

7 - Using the alternative specific constant model of question 5., calculate new probabilities and predicted shares using the new installation cost of heat pump. How much do the rebates raise the share of houses with heat pumps?

$~$

# Define the rebates function -10%

rebates <- function(x){x*0.9}

# Transform the dataset by applying the rebates

Heating_data_long_1 <- Heating_data_long %>% 
                       mutate(ic = ifelse(alternative == 'hp', rebates(ic), ic))

# Transform the dataset to long data

Hnew <- mlogit.data(Heating_data_long_1, 
                 id.var = "idcase",
                 shape = "long", 
                 choice = "choice", 
                 alt.var = "alternative")

# Run the predict from 5 on old and new dataset to see the difference

old <- apply(predict(mc, newdata = H), 2, mean)
new <- apply(predict(mc, newdata = Hnew), 2, mean)

# Printing output old 

old %>% kbl(caption = "Matchinng efficiency old ",
                             booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng efficiency old
	x
hp	0.0555556
ec	0.0711111
er	0.0933333
gc	0.6366667
gr	0.1433333

# Printing output new 

new %>% kbl(caption = "Matchinng efficiency new ",
                             booktabs = T) %>%
             kable_styling(latex_options = "striped", full_width = T)

Matchinng efficiency new
	x
hp	0.0644623
ec	0.0704549
er	0.0924703
gc	0.6306444
gr	0.1419681

$~$

As shown by Table 10 when rebates are given on $'hp'$ The average probability is the predicted share for an alternative. Before the rebate the predicted share of effectively chosen alternative over $'hp'$ was near 5.5%, when the rebate is applied share is supposed to rise from 5.5% to 6.4%.

$~$

8 - Suppose a new technology is developed that provides more efficient central heating. The new technology costs $200 more than the central electric system, denoted ec. However, it saves 25% of the electricity, such that its operating costs are 75% of the operating costs of the ec alternative. We want to predict the potential market penetration of this technology. Note that there are now six alternatives: the original five alternatives plus this new one. Revise your program to calculate the probability and predict the market share (that is, the average probability) for all six alternatives, using the model that is estimated on the original five alternatives.