Problem Set: Ordered Probit Methodology The San Francisco Airport is very concerned about customer satisfaction. Attached you will find a customer satisfaction survey for the SFO. Q7ALL is a categorical variable ranking the customer’s satisfaction from unacceptable to outstanding
library('readr')
library('dplyr') # for data manipulation
library('tidyr') # for reshaping data
library('ggplot2') # plotting data
library('scales') # for scale_y_continuous(label = percent)
library('ggthemes') # for scale_fill_few('medium')
library('ztable') # format tables for reporting
library('kableExtra')
library('janitor')
library('forcats')
library('plyr')
library('AER')
knitr::opts_chunk$set(comment = NA)
options(ztable.type = 'html')
options(digits = 3)
SFO <- read_csv("2015_SFO_Customer_Survey.csv")
SFO$Q7ALL[SFO$Q7ALL==0]<- NA
SFO$Q7ALL[SFO$Q7ALL==6]<- NA
SFO$Q7ALL<-factor(SFO$Q7ALL)
levels(SFO$Q7ALL) <- c("1-Unacceptable","2","3","4","5-Outstanding")
#added "Other" to avoid NA being listed twice and needed to resolve the argument error
table(SFO$Q7ALL) %>%
kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Count"), format = "markdown", align = "c")
| Ratings (1:Unacceptable - 5:Outstanding) | Frequencies as Count |
|---|---|
| 1-Unacceptable | 7 |
| 2 | 29 |
| 3 | 526 |
| 4 | 1508 |
| 5-Outstanding | 740 |
prop.table(table(SFO$Q7ALL)) %>% kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Percentages"), format = "markdown", align = "c")
| Ratings (1:Unacceptable - 5:Outstanding) | Frequencies as Percentages |
|---|---|
| 1-Unacceptable | 0.002 |
| 2 | 0.010 |
| 3 | 0.187 |
| 4 | 0.537 |
| 5-Outstanding | 0.263 |
The below are variables that capture information about the customer: 1. How many times they have flown out of SFO (Q5TimesFlown) 2. If it’s their first flight (Q5FirstTime) 3. How long they have been utilizing SFO (Q6LongUse) 4. Their Gender (Q19) 5. Their Income (Q20) 6. Did they fly over 100k miles (Q21) 7. Their Age Range (Q18) 8. Area they live in (region of California) (Q16LIVE) 9. LANGUAGE
library("formattable")
formattable(tabyl(SFO$Q18AGE) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Age Range",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q18AGE | n | percent |
|---|---|---|
| 0 | 95 | 3.21% |
| 1 | 49 | 1.66% |
| 2 | 379 | 12.81% |
| 3 | 669 | 22.62% |
| 4 | 546 | 18.46% |
| 5 | 498 | 16.84% |
| 6 | 415 | 14.03% |
| 7 | 306 | 10.34% |
| 8 | 1 | 0.03% |
formattable(tabyl(SFO$Q19GENDER) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Gender",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q19GENDER | n | percent |
|---|---|---|
| 0 | 177 | 5.98% |
| 1 | 1289 | 43.58% |
| 2 | 1485 | 50.20% |
| 3 | 7 | 0.24% |
formattable(tabyl(SFO$Q20INCOME) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Household Income",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q20INCOME | n | percent |
|---|---|---|
| 0 | 529 | 17.88% |
| 1 | 437 | 14.77% |
| 2 | 760 | 25.69% |
| 3 | 453 | 15.31% |
| 4 | 731 | 24.71% |
| 5 | 45 | 1.52% |
| 6 | 3 | 0.10% |
formattable(tabyl(SFO$Q16LIVE) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Area Lived In",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q16LIVE | n | percent |
|---|---|---|
| 0 | 67 | 2.27% |
| 1 | 1018 | 34.42% |
| 2 | 141 | 4.77% |
| 3 | 1732 | 58.55% |
formattable(tabyl(SFO$LANG) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Language",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$LANG | n | percent |
|---|---|---|
| 1 | 2824 | 95.47% |
| 2 | 29 | 0.98% |
| 3 | 71 | 2.40% |
| 4 | 34 | 1.15% |
formattable(tabyl(SFO$Q21FLY) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "Fly 100K+ Miles/Year",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q21FLY | n | percent |
|---|---|---|
| 0 | 116 | 3.92% |
| 1 | 478 | 16.16% |
| 2 | 2130 | 72.01% |
| 3 | 234 | 7.91% |
formattable(tabyl(SFO$Q5TIMESFLOWN) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "# Times Flown Out of SFO, Past 12 Months",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q5TIMESFLOWN | n | percent |
|---|---|---|
| 0 | 11 | 0.37% |
| 1 | 1112 | 37.59% |
| 2 | 655 | 22.14% |
| 3 | 737 | 24.92% |
| 4 | 222 | 7.51% |
| 5 | 126 | 4.26% |
| 6 | 95 | 3.21% |
formattable(tabyl(SFO$Q5FIRSTTIME) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q5FIRSTTIME | n | percent |
|---|---|---|
| 0 | 10 | 0.34% |
| 1 | 2425 | 81.98% |
| 2 | 523 | 17.68% |
formattable(tabyl(SFO$Q6LONGUSE) %>% adorn_pct_formatting(digits = 2),
align =c("l","c","c"),
caption = "First Time Flying Out of SFO",
list(`Indicator Name` = formatter(
"span", style = ~ style(color = "grey",font.weight = "bold"))))
| SFO$Q6LONGUSE | n | percent |
|---|---|---|
| 0 | 50 | 1.69% |
| 1 | 752 | 25.42% |
| 2 | 640 | 21.64% |
| 3 | 317 | 10.72% |
| 4 | 1198 | 40.50% |
| 5 | 1 | 0.03% |
library(MASS)
library(stargazer)
m <- polr(factor(Q7ALL) ~ factor(Q18AGE) + factor(Q19GENDER) + factor(Q20INCOME) + factor(Q21FLY) + factor(LANG), SFO, method = "probit")
stargazer(m, type = "text", covariate.labels = c("Under 18","18 - 24", "25 - 34", "35 - 44", "45 - 54 ", "55 - 64", "65 and over", "Age Not Specified", "Male", "Female", "Gender Not Specified", "Under $50,000", "$50,000 - $100,000", "$100,001 - 150,000", "Over $150,000", "Other Income", "Income Not Specified", "Frequent Flyer", "Not a Frequent Flyer", "Frequent Flyer Not Specified","English","Spanish","Chinese","Japanese"))
======================================================== Dependent variable:
————————— Q7ALL
——————————————————– Under 18 0.486**
(0.234)
18 - 24 0.258
(0.177)
25 - 34 0.110
(0.173)
35 - 44 -0.049
(0.174)
45 - 54 -0.042
(0.174)
55 - 64 0.029
(0.174)
65 and over 0.166
(0.177)
Age Not Specified 4.760
(58.000)
Male 0.219**
(0.110)
Female 0.347***
(0.109)
Gender Not Specified 0.402
(0.437)
Under 50,000 0.008
(0.081)
50,000 - 100,000 0.078
(0.070)
100,001 - 150,000 -0.074
(0.078)
Over 150,000 -0.099
(0.072)
Other Income -0.037
(0.183)
Income Not Specified -0.114
(0.638)
Frequent Flyer -0.024
(0.159)
Not a Frequent Flyer 0.035
(0.151)
Frequent Flyer Not Specified -0.166
(0.166)
English 0.466**
(0.227)
Spanish 0.284*
(0.145)
Chinese -0.866***
(0.199)
Note: p<0.1; p<0.05; p<0.01
Problem Set: BLP Methodology Attached is a table of market shares, prices, and characteristics on the top-selling brands of cereal in 1992. The data are aggregated from household-level scanner data (collected at supermarket checkout counters).
The market shares below are shares of total cereal purchases observed in the dataset. For the purposes of this problem set, assume that all households purchased some cereal during 1992 (so that non-purchase is not an option). Assume that brand #51, the composite basket of “all other brands”, is the outside good.
Two sets of prices are given in the table. Shelf prices are those listed on supermarket shelves, and do not include coupon discounts. Transactions prices are prices actually paid by consumers, net of coupon discounts. Estimate using the transactions prices. Note that you should subtract the price of brand #51, the “outside good”, from the prices of the top fifty brands.
Assume a utility specification for uij , household i’s utility from brand j: \[u_{ij} = X_j\beta+\alpha p_j + \xi_j + v_{ij}\] where \(X_j\) are characteristics of brand j, \(\xi_j\) is an unobserved (to the econometrician) quality parameter for brand j, and \(v_{ij}\) is a disturbance term which is identically and independently distributed (i.i.d.) over households i and brands j. As in Berry (1994), denote the mean utility level from brand j as \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\] 1. If we assume that the \(v_{ij}\) ’s are distributed i.i.d. type I extreme value, then the resulting expressions for the market shares of each brand j; \(j = 1,...,51\).
\[ s_j = \frac{exp(X_j\beta+\alpha p_j + \xi_j)}{\sum_i^n exp(X_i\beta+\alpha p_i + \xi_i)}\] We need to normalize the outside good. Remember the outside good is good 51. The outside good has not characteristics (i.e. no X’s), but does have a price, \(p_0\).
In this part of the assignment, you need to define a new variable called difprice = price of the good j - the price of the outside good.
cereal <- read_csv("workbook.csv")
library(dplyr)
cereal$difprice <- cereal$`average transaction price`-last(cereal$`average transaction price`)
You will be using this new price in the remaining steps.
Next we implement the BLP two-step estimator.
Big hint! We get to observe market shares for each product, \(s_j\). These market shares are data. However, we need to convert them so that we can estimate \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\]
In the notes there is function that tell us how convert market shares into the \(\delta_j\). Once you find this function, create a new variable called delta.
cereal$delta_j <- log(last(cereal$`in sample market share`))-log(cereal$`in sample market share`)
b <- cereal[-nrow(cereal),]
# delta, difprice, product characteristics
# this is just a regular OLS
# need a linear regression with delta as a dependent variable and the rest as your explanatory variables
# product characteristics needed, not shelf price for example
ols.1<-lm(delta_j~difprice+cals+fat+suger, data=b)
stargazer(ols.1, type = "html")
| Dependent variable: | |
| delta_j | |
| difprice | 0.224 |
| (0.149) | |
| cals | -0.001 |
| (0.003) | |
| fat | 0.005 |
| (0.057) | |
| suger | 0.029** |
| (0.014) | |
| Constant | 2.620*** |
| (0.319) | |
| Observations | 50 |
| R2 | 0.150 |
| Adjusted R2 | 0.074 |
| Residual Std. Error | 0.483 (df = 45) |
| F Statistic | 1.980 (df = 4; 45) |
| Note: | p<0.1; p<0.05; p<0.01 |
companygrouping <- b$company
#View(companygrouping)
#do a linear regression here
#use an instrumental variable or more than one
#need to group by company
#then take the characterstics of the competitor to be instrumental variables
#use mutate or group by if cant use his suggestions
b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))
ols.2<-lm(delta_j~difprice+b$difprice.z+b$cal.z+b$sug.z+b$fat.z, data=b)
stargazer(ols.1,ols.2,type="html")
| Dependent variable: | ||
| delta_j | ||
| (1) | (2) | |
| difprice | 0.224 | 0.474*** |
| (0.149) | (0.157) | |
| cals | -0.001 | |
| (0.003) | ||
| fat | 0.005 | |
| (0.057) | ||
| suger | 0.029** | |
| (0.014) | ||
| difprice.z | -0.355* | |
| (0.197) | ||
| cal.z | 0.005 | |
| (0.003) | ||
| sug.z | -0.024 | |
| (0.039) | ||
| fat.z | 0.093 | |
| (0.121) | ||
| Constant | 2.620*** | 2.230*** |
| (0.319) | (0.554) | |
| Observations | 50 | 50 |
| R2 | 0.150 | 0.222 |
| Adjusted R2 | 0.074 | 0.134 |
| Residual Std. Error | 0.483 (df = 45) | 0.468 (df = 44) |
| F Statistic | 1.980 (df = 4; 45) | 2.510** (df = 5; 44) |
| Note: | p<0.1; p<0.05; p<0.01 | |
Remember that a mean is \[\bar{x}=\frac{1}{n}\sum_i^n x_i\]
but let’s say that x is made up of \(n_1\) observations of \(\hat{x}\) and \(n_2\) obserations of \(\tilde{x}\) We can write the mean of \(\hat{x}\) two ways \[\bar{\hat{x}}=\frac{1}{n_1}\sum_i^n \hat{x}_i\] or \[\bar{\hat{x}}=\frac{1}{n-n_2}(\sum_i^{n}x_i-\sum_j^{n_2}\tilde{x}_j)\]
The last function in the R chunk about the ave function does this version of the mean. You can see that it is useful when you want the mean from the other groups.
b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))
iv.reg1<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$company)|cals+fat+suger+factor(b$company)+b$difprice.z+b$cal.z+b$sug.z+b$fat.z,data=b)
stargazer(iv.reg1,type = "html")
| Dependent variable: | |
| delta_j | |
| difprice | 0.428 |
| (0.504) | |
| cals | -0.001 |
| (0.004) | |
| fat | -0.014 |
| (0.080) | |
| suger | 0.038** |
| (0.017) | |
| company)KG | -0.075 |
| (0.172) | |
| company)NB | 0.566* |
| (0.331) | |
| company)PT | 0.374 |
| (0.261) | |
| company)QK | 0.683 |
| (0.548) | |
| company)RL | 0.266 |
| (0.414) | |
| Constant | 2.510*** |
| (0.596) | |
| Observations | 50 |
| R2 | 0.344 |
| Adjusted R2 | 0.197 |
| Residual Std. Error | 0.450 (df = 40) |
| Note: | p<0.1; p<0.05; p<0.01 |
b$difprice.z2<-(ave(b$difprice,factor(b$sgmnt),FUN=function(x)
sum(x)-b$difprice)/(ave(b$difprice,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$cal.z2<-(ave(b$cals,factor(b$sgmnt),FUN=function(x)
sum(x)-b$cals)/(ave(b$cals,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$sug.z2<-(ave(b$suger,factor(b$sgmnt),FUN=function(x)
sum(x)-b$suger)/(ave(b$suger,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$fat.z2<-(ave(b$fat,factor(b$sgmnt),FUN=function(x)
sum(x)-b$fat)/(ave(b$fat,factor(b$sgmnt),FUN=function(x) length(x))-1))
iv.reg2<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$sgmnt)|cals+fat+suger+factor(b$sgmnt)+b$difprice.z2+b$cal.z2+b$sug.z2+b$fat.z2,data=b)
stargazer(iv.reg2,type = "html")
| Dependent variable: | |
| delta_j | |
| difprice | 0.913 |
| (0.621) | |
| cals | 0.002 |
| (0.005) | |
| fat | -0.036 |
| (0.072) | |
| suger | 0.030 |
| (0.023) | |
| sgmnt)Fam | -0.337 |
| (0.294) | |
| sgmnt)Kids | -0.129 |
| (0.286) | |
| Constant | 2.260*** |
| (0.726) | |
| Observations | 50 |
| R2 | -0.193 |
| Adjusted R2 | -0.360 |
| Residual Std. Error | 0.586 (df = 43) |
| Note: | p<0.1; p<0.05; p<0.01 |
2SLS: using average characteristics for all other brands not produced by manufacturer of product j as instruments for \(p_j\)
2SLS: using the average characteristics for all other brands besides product j.
b$difprice.z4<-(ave(b$difprice,FUN=function(x)
sum(x)-b$difprice)/(ave(b$difprice,FUN=function(x) length(x))-1))
b$cal.z4<-(ave(b$cals,FUN=function(x)
sum(x)-b$cals)/(ave(b$cals,FUN=function(x) length(x))-1))
b$sug.z4<-(ave(b$suger,FUN=function(x)
sum(x)-b$suger)/(ave(b$suger,FUN=function(x) length(x))-1))
b$fat.z4<-(ave(b$fat,FUN=function(x)
sum(x)-b$fat)/(ave(b$fat,FUN=function(x) length(x))-1))
iv.reg4<-ivreg(delta_j~difprice+cals+fat+suger|cals+fat+suger+b$difprice.z4+b$cal.z4+b$sug.z4+b$fat.z4,data=b)
stargazer(iv.reg4,type = "html")
| Dependent variable: | |
| delta_j | |
| difprice | 0.224 |
| (0.149) | |
| cals | -0.001 |
| (0.003) | |
| fat | 0.005 |
| (0.057) | |
| suger | 0.029** |
| (0.014) | |
| Constant | 2.620*** |
| (0.319) | |
| Observations | 50 |
| R2 | 0.150 |
| Adjusted R2 | 0.074 |
| Residual Std. Error | 0.483 (df = 45) |
| Note: | p<0.1; p<0.05; p<0.01 |
The second set, OLS 2, produces the largest first stage F-statistic at 2.514.