Problem Set: Ordered Probit Methodology The San Francisco Airport is very concerned about customer satisfaction. Attached you will find a customer satisfaction survey for the SFO. Q7ALL is a categorical variable ranking the customer’s satisfaction from unacceptable to outstanding

library('readr')
library('dplyr')      # for data manipulation
library('tidyr')      # for reshaping data
library('ggplot2')    # plotting data
library('scales')     # for scale_y_continuous(label = percent)
library('ggthemes')   # for scale_fill_few('medium')
library('ztable')     # format tables for reporting
library('kableExtra')
library('janitor')
library('forcats')
library('plyr')
library('AER')
knitr::opts_chunk$set(comment = NA)
options(ztable.type = 'html')
options(digits = 3)

SFO <- read_csv("2015_SFO_Customer_Survey.csv")

SFO$Q7ALL[SFO$Q7ALL==0]<- NA
SFO$Q7ALL[SFO$Q7ALL==6]<- NA
SFO$Q7ALL<-factor(SFO$Q7ALL)
levels(SFO$Q7ALL) <- c("1-Unacceptable","2","3","4","5-Outstanding")
#added "Other" to avoid NA being listed twice and needed to resolve the argument error
table(SFO$Q7ALL) %>% 
  kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Count"), format = "markdown", align = "c")
Ratings (1:Unacceptable - 5:Outstanding) Frequencies as Count
1-Unacceptable 7
2 29
3 526
4 1508
5-Outstanding 740
  1. Create a table summarizing the counts of the potential outcomes and report them as percentages.
prop.table(table(SFO$Q7ALL)) %>% kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Percentages"), format = "markdown", align = "c")
Ratings (1:Unacceptable - 5:Outstanding) Frequencies as Percentages
1-Unacceptable 0.002
2 0.010
3 0.187
4 0.537
5-Outstanding 0.263
  1. Review the available variables. Which variables capture customers characteristics?

The below are variables that capture information about the customer: 1. How many times they have flown out of SFO (Q5TimesFlown) 2. If it’s their first flight (Q5FirstTime) 3. How long they have been utilizing SFO (Q6LongUse) 4. Their Gender (Q19) 5. Their Income (Q20) 6. Did they fly over 100k miles (Q21) 7. Their Age Range (Q18) 8. Area they live in (region of California) (Q16LIVE) 9. LANGUAGE

library("formattable")
formattable(tabyl(SFO$Q18AGE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Age Range",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Age Range
SFO$Q18AGE n percent
0 95 3.21%
1 49 1.66%
2 379 12.81%
3 669 22.62%
4 546 18.46%
5 498 16.84%
6 415 14.03%
7 306 10.34%
8 1 0.03%
formattable(tabyl(SFO$Q19GENDER) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Gender",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Gender
SFO$Q19GENDER n percent
0 177 5.98%
1 1289 43.58%
2 1485 50.20%
3 7 0.24%
formattable(tabyl(SFO$Q20INCOME) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Household Income",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Household Income
SFO$Q20INCOME n percent
0 529 17.88%
1 437 14.77%
2 760 25.69%
3 453 15.31%
4 731 24.71%
5 45 1.52%
6 3 0.10%
formattable(tabyl(SFO$Q16LIVE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Area Lived In",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Area Lived In
SFO$Q16LIVE n percent
0 67 2.27%
1 1018 34.42%
2 141 4.77%
3 1732 58.55%
formattable(tabyl(SFO$LANG) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Language",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Language
SFO$LANG n percent
1 2824 95.47%
2 29 0.98%
3 71 2.40%
4 34 1.15%
formattable(tabyl(SFO$Q21FLY) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Fly 100K+ Miles/Year",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
Fly 100K+ Miles/Year
SFO$Q21FLY n percent
0 116 3.92%
1 478 16.16%
2 2130 72.01%
3 234 7.91%
formattable(tabyl(SFO$Q5TIMESFLOWN) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "# Times Flown Out of SFO, Past 12 Months",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
# Times Flown Out of SFO, Past 12 Months
SFO$Q5TIMESFLOWN n percent
0 11 0.37%
1 1112 37.59%
2 655 22.14%
3 737 24.92%
4 222 7.51%
5 126 4.26%
6 95 3.21%
formattable(tabyl(SFO$Q5FIRSTTIME) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
SFO$Q5FIRSTTIME n percent
0 10 0.34%
1 2425 81.98%
2 523 17.68%
formattable(tabyl(SFO$Q6LONGUSE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "First Time Flying Out of SFO",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))
First Time Flying Out of SFO
SFO$Q6LONGUSE n percent
0 50 1.69%
1 752 25.42%
2 640 21.64%
3 317 10.72%
4 1198 40.50%
5 1 0.03%
  1. Estimate an ordered probit model of overall satisfaction using customer characteristics as explanatory variables.
library(MASS)
library(stargazer)
m <- polr(factor(Q7ALL) ~ factor(Q18AGE) + factor(Q19GENDER) + factor(Q20INCOME) + factor(Q21FLY) + factor(LANG), SFO, method = "probit")

stargazer(m, type = "text", covariate.labels = c("Under 18","18 - 24", "25 - 34", "35 - 44", "45 - 54 ", "55 - 64", "65 and over", "Age Not Specified", "Male", "Female", "Gender Not Specified", "Under $50,000", "$50,000 - $100,000", "$100,001 - 150,000", "Over $150,000", "Other Income", "Income Not Specified", "Frequent Flyer", "Not a Frequent Flyer", "Frequent Flyer Not Specified","English","Spanish","Chinese","Japanese"))

======================================================== Dependent variable:
————————— Q7ALL
——————————————————– Under 18 0.486**
(0.234)

18 - 24 0.258
(0.177)

25 - 34 0.110
(0.173)

35 - 44 -0.049
(0.174)

45 - 54 -0.042
(0.174)

55 - 64 0.029
(0.174)

65 and over 0.166
(0.177)

Age Not Specified 4.760
(58.000)

Male 0.219**
(0.110)

Female 0.347***
(0.109)

Gender Not Specified 0.402
(0.437)

Under 50,000 0.008
(0.081)

50,000 - 100,000 0.078
(0.070)

100,001 - 150,000 -0.074
(0.078)

Over 150,000 -0.099
(0.072)

Other Income -0.037
(0.183)

Income Not Specified -0.114
(0.638)

Frequent Flyer -0.024
(0.159)

Not a Frequent Flyer 0.035
(0.151)

Frequent Flyer Not Specified -0.166
(0.166)

English 0.466**
(0.227)

Spanish 0.284*
(0.145)

Chinese -0.866***
(0.199)


Observations 2,810

Note: p<0.1; p<0.05; p<0.01

Problem Set: BLP Methodology Attached is a table of market shares, prices, and characteristics on the top-selling brands of cereal in 1992. The data are aggregated from household-level scanner data (collected at supermarket checkout counters).

The market shares below are shares of total cereal purchases observed in the dataset. For the purposes of this problem set, assume that all households purchased some cereal during 1992 (so that non-purchase is not an option). Assume that brand #51, the composite basket of “all other brands”, is the outside good.

Two sets of prices are given in the table. Shelf prices are those listed on supermarket shelves, and do not include coupon discounts. Transactions prices are prices actually paid by consumers, net of coupon discounts. Estimate using the transactions prices. Note that you should subtract the price of brand #51, the “outside good”, from the prices of the top fifty brands.

Assume a utility specification for uij , household i’s utility from brand j: \[u_{ij} = X_j\beta+\alpha p_j + \xi_j + v_{ij}\] where \(X_j\) are characteristics of brand j, \(\xi_j\) is an unobserved (to the econometrician) quality parameter for brand j, and \(v_{ij}\) is a disturbance term which is identically and independently distributed (i.i.d.) over households i and brands j. As in Berry (1994), denote the mean utility level from brand j as \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\] 1. If we assume that the \(v_{ij}\) ’s are distributed i.i.d. type I extreme value, then the resulting expressions for the market shares of each brand j; \(j = 1,...,51\).

\[ s_j = \frac{exp(X_j\beta+\alpha p_j + \xi_j)}{\sum_i^n exp(X_i\beta+\alpha p_i + \xi_i)}\] We need to normalize the outside good. Remember the outside good is good 51. The outside good has not characteristics (i.e. no X’s), but does have a price, \(p_0\).

In this part of the assignment, you need to define a new variable called difprice = price of the good j - the price of the outside good.

cereal <- read_csv("workbook.csv")
library(dplyr)
cereal$difprice <- cereal$`average transaction price`-last(cereal$`average transaction price`)

You will be using this new price in the remaining steps.

Next we implement the BLP two-step estimator.

  1. Invert the resulting system of demand functions to get estimates of the mean utility levels \(\delta_j\) as a function of the shares \(s_j\) . (Hint: look in Berry (1994, pg. 250)).

Big hint! We get to observe market shares for each product, \(s_j\). These market shares are data. However, we need to convert them so that we can estimate \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\]

In the notes there is function that tell us how convert market shares into the \(\delta_j\). Once you find this function, create a new variable called delta.

cereal$delta_j <- log(last(cereal$`in sample market share`))-log(cereal$`in sample market share`)

b <- cereal[-nrow(cereal),]
  1. Estimate the second stage regression of \(\delta_j\) on \(X_j\) and \(p_j\) in different ways:
  1. OLS: estimate the relationship between the delta’s, difprice, and the product characteristics.
# delta, difprice, product characteristics 
# this is just a regular OLS 
# need a linear regression with delta as a dependent variable and the rest as your explanatory variables 
# product characteristics needed, not shelf price for example 

ols.1<-lm(delta_j~difprice+cals+fat+suger, data=b)
stargazer(ols.1, type = "html")
Dependent variable:
delta_j
difprice 0.224
(0.149)
cals -0.001
(0.003)
fat 0.005
(0.057)
suger 0.029**
(0.014)
Constant 2.620***
(0.319)
Observations 50
R2 0.150
Adjusted R2 0.074
Residual Std. Error 0.483 (df = 45)
F Statistic 1.980 (df = 4; 45)
Note: p<0.1; p<0.05; p<0.01
  1. OLS: Do the same as above, but estimate manufacture fixed effects. Do your results change? If so how? What are the fixed effects capturing?
companygrouping <- b$company
#View(companygrouping)
#do a linear regression here 
#use an instrumental variable or more than one 
#need to group by company 
#then take the characterstics of the competitor to be instrumental variables 
#use mutate or group by if cant use his suggestions 


b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))

ols.2<-lm(delta_j~difprice+b$difprice.z+b$cal.z+b$sug.z+b$fat.z, data=b)
stargazer(ols.1,ols.2,type="html")
Dependent variable:
delta_j
(1) (2)
difprice 0.224 0.474***
(0.149) (0.157)
cals -0.001
(0.003)
fat 0.005
(0.057)
suger 0.029**
(0.014)
difprice.z -0.355*
(0.197)
cal.z 0.005
(0.003)
sug.z -0.024
(0.039)
fat.z 0.093
(0.121)
Constant 2.620*** 2.230***
(0.319) (0.554)
Observations 50 50
R2 0.150 0.222
Adjusted R2 0.074 0.134
Residual Std. Error 0.483 (df = 45) 0.468 (df = 44)
F Statistic 1.980 (df = 4; 45) 2.510** (df = 5; 44)
Note: p<0.1; p<0.05; p<0.01

Remember that a mean is \[\bar{x}=\frac{1}{n}\sum_i^n x_i\]

but let’s say that x is made up of \(n_1\) observations of \(\hat{x}\) and \(n_2\) obserations of \(\tilde{x}\) We can write the mean of \(\hat{x}\) two ways \[\bar{\hat{x}}=\frac{1}{n_1}\sum_i^n \hat{x}_i\] or \[\bar{\hat{x}}=\frac{1}{n-n_2}(\sum_i^{n}x_i-\sum_j^{n_2}\tilde{x}_j)\]

The last function in the R chunk about the ave function does this version of the mean. You can see that it is useful when you want the mean from the other groups.

  1. 2SLS: using average characteristics for all other brands produced by the same manufacturer as brand j as instruments for \(p_j\). Notice, these are within manufacturer means, but not including product j.
b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))

iv.reg1<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$company)|cals+fat+suger+factor(b$company)+b$difprice.z+b$cal.z+b$sug.z+b$fat.z,data=b)
stargazer(iv.reg1,type = "html")
Dependent variable:
delta_j
difprice 0.428
(0.504)
cals -0.001
(0.004)
fat -0.014
(0.080)
suger 0.038**
(0.017)
company)KG -0.075
(0.172)
company)NB 0.566*
(0.331)
company)PT 0.374
(0.261)
company)QK 0.683
(0.548)
company)RL 0.266
(0.414)
Constant 2.510***
(0.596)
Observations 50
R2 0.344
Adjusted R2 0.197
Residual Std. Error 0.450 (df = 40)
Note: p<0.1; p<0.05; p<0.01
  1. 2SLS: using average characteristics for all other brands produced by rivals to the manufacturer as brand j as instruments for \(p_j\). These are external to the manufacturer of product j who compete in the same segment
b$difprice.z2<-(ave(b$difprice,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$cal.z2<-(ave(b$cals,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$sug.z2<-(ave(b$suger,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$fat.z2<-(ave(b$fat,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$sgmnt),FUN=function(x) length(x))-1))

iv.reg2<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$sgmnt)|cals+fat+suger+factor(b$sgmnt)+b$difprice.z2+b$cal.z2+b$sug.z2+b$fat.z2,data=b)

stargazer(iv.reg2,type = "html")
Dependent variable:
delta_j
difprice 0.913
(0.621)
cals 0.002
(0.005)
fat -0.036
(0.072)
suger 0.030
(0.023)
sgmnt)Fam -0.337
(0.294)
sgmnt)Kids -0.129
(0.286)
Constant 2.260***
(0.726)
Observations 50
R2 -0.193
Adjusted R2 -0.360
Residual Std. Error 0.586 (df = 43)
Note: p<0.1; p<0.05; p<0.01
  1. 2SLS: using average characteristics for all other brands not produced by manufacturer of product j as instruments for \(p_j\)

  2. 2SLS: using the average characteristics for all other brands besides product j.

b$difprice.z4<-(ave(b$difprice,FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,FUN=function(x) length(x))-1))
b$cal.z4<-(ave(b$cals,FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,FUN=function(x) length(x))-1))
b$sug.z4<-(ave(b$suger,FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,FUN=function(x) length(x))-1))
b$fat.z4<-(ave(b$fat,FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,FUN=function(x) length(x))-1))

iv.reg4<-ivreg(delta_j~difprice+cals+fat+suger|cals+fat+suger+b$difprice.z4+b$cal.z4+b$sug.z4+b$fat.z4,data=b)
stargazer(iv.reg4,type = "html")
Dependent variable:
delta_j
difprice 0.224
(0.149)
cals -0.001
(0.003)
fat 0.005
(0.057)
suger 0.029**
(0.014)
Constant 2.620***
(0.319)
Observations 50
R2 0.150
Adjusted R2 0.074
Residual Std. Error 0.483 (df = 45)
Note: p<0.1; p<0.05; p<0.01
  1. Which set of instrument produces the largest first stage F-statistic? What is the F-stat value?

The second set, OLS 2, produces the largest first stage F-statistic at 2.514.