Problem Set: Ordered Probit Methodology The San Francisco Airport is very concerned about customer satisfaction. Attached you will find a customer satisfaction survey for the SFO. Q7ALL is a categorical variable ranking the customer’s satisfaction from unacceptable to outstanding

library('readr')
library('dplyr')      # for data manipulation
library('tidyr')      # for reshaping data
library('ggplot2')    # plotting data
library('scales')     # for scale_y_continuous(label = percent)
library('ggthemes')   # for scale_fill_few('medium')
library('ztable')     # format tables for reporting
library('kableExtra')
library('janitor')
library('forcats')
library('plyr')
library('AER')
knitr::opts_chunk$set(comment = NA)
options(ztable.type = 'html')
options(digits = 3)

SFO <- read_csv("2015_SFO_Customer_Survey.csv")

SFO$Q7ALL[SFO$Q7ALL==0]<- NA
SFO$Q7ALL[SFO$Q7ALL==6]<- NA
SFO$Q7ALL<-factor(SFO$Q7ALL)
levels(SFO$Q7ALL) <- c("1-Unacceptable","2","3","4","5-Outstanding")
#added "Other" to avoid NA being listed twice and needed to resolve the argument error
table(SFO$Q7ALL) %>% 
  kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Count"), format = "markdown", align = "c")

Ratings (1:Unacceptable - 5:Outstanding)	Frequencies as Count
1-Unacceptable	7
2	29
3	526
4	1508
5-Outstanding	740

Create a table summarizing the counts of the potential outcomes and report them as percentages.

prop.table(table(SFO$Q7ALL)) %>% kable(col.names = c("Ratings (1:Unacceptable - 5:Outstanding)", "Frequencies as Percentages"), format = "markdown", align = "c")

Ratings (1:Unacceptable - 5:Outstanding)	Frequencies as Percentages
1-Unacceptable	0.002
2	0.010
3	0.187
4	0.537
5-Outstanding	0.263

Review the available variables. Which variables capture customers characteristics?

The below are variables that capture information about the customer: 1. How many times they have flown out of SFO (Q5TimesFlown) 2. If it’s their first flight (Q5FirstTime) 3. How long they have been utilizing SFO (Q6LongUse) 4. Their Gender (Q19) 5. Their Income (Q20) 6. Did they fly over 100k miles (Q21) 7. Their Age Range (Q18) 8. Area they live in (region of California) (Q16LIVE) 9. LANGUAGE

library("formattable")
formattable(tabyl(SFO$Q18AGE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Age Range",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Age Range
SFO$Q18AGE	n	percent
0	95	3.21%
1	49	1.66%
2	379	12.81%
3	669	22.62%
4	546	18.46%
5	498	16.84%
6	415	14.03%
7	306	10.34%
8	1	0.03%

formattable(tabyl(SFO$Q19GENDER) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Gender",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Gender
SFO$Q19GENDER	n	percent
0	177	5.98%
1	1289	43.58%
2	1485	50.20%
3	7	0.24%

formattable(tabyl(SFO$Q20INCOME) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Household Income",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Household Income
SFO$Q20INCOME	n	percent
0	529	17.88%
1	437	14.77%
2	760	25.69%
3	453	15.31%
4	731	24.71%
5	45	1.52%
6	3	0.10%

formattable(tabyl(SFO$Q16LIVE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Area Lived In",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Area Lived In
SFO$Q16LIVE	n	percent
0	67	2.27%
1	1018	34.42%
2	141	4.77%
3	1732	58.55%

formattable(tabyl(SFO$LANG) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Language",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Language
SFO$LANG	n	percent
1	2824	95.47%
2	29	0.98%
3	71	2.40%
4	34	1.15%

formattable(tabyl(SFO$Q21FLY) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "Fly 100K+ Miles/Year",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

Fly 100K+ Miles/Year
SFO$Q21FLY	n	percent
0	116	3.92%
1	478	16.16%
2	2130	72.01%
3	234	7.91%

formattable(tabyl(SFO$Q5TIMESFLOWN) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "# Times Flown Out of SFO, Past 12 Months",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

# Times Flown Out of SFO, Past 12 Months
SFO$Q5TIMESFLOWN	n	percent
0	11	0.37%
1	1112	37.59%
2	655	22.14%
3	737	24.92%
4	222	7.51%
5	126	4.26%
6	95	3.21%

formattable(tabyl(SFO$Q5FIRSTTIME) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

SFO$Q5FIRSTTIME	n	percent
0	10	0.34%
1	2425	81.98%
2	523	17.68%

formattable(tabyl(SFO$Q6LONGUSE) %>% adorn_pct_formatting(digits = 2),
            align =c("l","c","c"), 
            caption = "First Time Flying Out of SFO",
            list(`Indicator Name` = formatter(
              "span", style = ~ style(color = "grey",font.weight = "bold"))))

First Time Flying Out of SFO
SFO$Q6LONGUSE	n	percent
0	50	1.69%
1	752	25.42%
2	640	21.64%
3	317	10.72%
4	1198	40.50%
5	1	0.03%

Estimate an ordered probit model of overall satisfaction using customer characteristics as explanatory variables.

library(MASS)
library(stargazer)
m <- polr(factor(Q7ALL) ~ factor(Q18AGE) + factor(Q19GENDER) + factor(Q20INCOME) + factor(Q21FLY) + factor(LANG), SFO, method = "probit")

stargazer(m, type = "text", covariate.labels = c("Under 18","18 - 24", "25 - 34", "35 - 44", "45 - 54 ", "55 - 64", "65 and over", "Age Not Specified", "Male", "Female", "Gender Not Specified", "Under $50,000", "$50,000 - $100,000", "$100,001 - 150,000", "Over $150,000", "Other Income", "Income Not Specified", "Frequent Flyer", "Not a Frequent Flyer", "Frequent Flyer Not Specified","English","Spanish","Chinese","Japanese"))

======================================================== Dependent variable:
————————— Q7ALL
——————————————————– Under 18 0.486**
(0.234)

18 - 24 0.258
(0.177)

25 - 34 0.110
(0.173)

35 - 44 -0.049
(0.174)

45 - 54 -0.042
(0.174)

55 - 64 0.029
(0.174)

65 and over 0.166
(0.177)

Age Not Specified 4.760
(58.000)

Male 0.219**
(0.110)

Female 0.347***
(0.109)

Gender Not Specified 0.402
(0.437)

Under 50,000 0.008
(0.081)

50,000 - 100,000 0.078
(0.070)

100,001 - 150,000 -0.074
(0.078)

Over 150,000 -0.099
(0.072)

Other Income -0.037
(0.183)

Income Not Specified -0.114
(0.638)

Frequent Flyer -0.024
(0.159)

Not a Frequent Flyer 0.035
(0.151)

Frequent Flyer Not Specified -0.166
(0.166)

English 0.466**
(0.227)

Spanish 0.284*
(0.145)

Chinese -0.866***
(0.199)

Observations 2,810

Note: p<0.1; p<0.05; p<0.01

Problem Set: BLP Methodology Attached is a table of market shares, prices, and characteristics on the top-selling brands of cereal in 1992. The data are aggregated from household-level scanner data (collected at supermarket checkout counters).

The market shares below are shares of total cereal purchases observed in the dataset. For the purposes of this problem set, assume that all households purchased some cereal during 1992 (so that non-purchase is not an option). Assume that brand #51, the composite basket of “all other brands”, is the outside good.

Two sets of prices are given in the table. Shelf prices are those listed on supermarket shelves, and do not include coupon discounts. Transactions prices are prices actually paid by consumers, net of coupon discounts. Estimate using the transactions prices. Note that you should subtract the price of brand #51, the “outside good”, from the prices of the top fifty brands.

Assume a utility specification for uij , household i’s utility from brand j: \[u_{ij} = X_j\beta+\alpha p_j + \xi_j + v_{ij}\] where $X_j$ are characteristics of brand j, $\xi_j$ is an unobserved (to the econometrician) quality parameter for brand j, and $v_{ij}$ is a disturbance term which is identically and independently distributed (i.i.d.) over households i and brands j. As in Berry (1994), denote the mean utility level from brand j as \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\] 1. If we assume that the $v_{ij}$ ’s are distributed i.i.d. type I extreme value, then the resulting expressions for the market shares of each brand j; $j = 1,...,51$.

\[ s_j = \frac{exp(X_j\beta+\alpha p_j + \xi_j)}{\sum_i^n exp(X_i\beta+\alpha p_i + \xi_i)}\] We need to normalize the outside good. Remember the outside good is good 51. The outside good has not characteristics (i.e. no X’s), but does have a price, $p_0$.

In this part of the assignment, you need to define a new variable called difprice = price of the good j - the price of the outside good.

cereal <- read_csv("workbook.csv")
library(dplyr)
cereal$difprice <- cereal$`average transaction price`-last(cereal$`average transaction price`)

You will be using this new price in the remaining steps.

Next we implement the BLP two-step estimator.

Invert the resulting system of demand functions to get estimates of the mean utility levels $\delta_j$ as a function of the shares $s_j$ . (Hint: look in Berry (1994, pg. 250)).

Big hint! We get to observe market shares for each product, $s_j$. These market shares are data. However, we need to convert them so that we can estimate \[\delta_j \equiv X_j\beta+\alpha p_j + \xi_j\]

In the notes there is function that tell us how convert market shares into the $\delta_j$. Once you find this function, create a new variable called delta.

cereal$delta_j <- log(last(cereal$`in sample market share`))-log(cereal$`in sample market share`)

b <- cereal[-nrow(cereal),]

Estimate the second stage regression of $\delta_j$ on $X_j$ and $p_j$ in different ways:

OLS: estimate the relationship between the delta’s, difprice, and the product characteristics.

# delta, difprice, product characteristics 
# this is just a regular OLS 
# need a linear regression with delta as a dependent variable and the rest as your explanatory variables 
# product characteristics needed, not shelf price for example 

ols.1<-lm(delta_j~difprice+cals+fat+suger, data=b)
stargazer(ols.1, type = "html")


	Dependent variable:

	delta_j

difprice	0.224
	(0.149)

cals	-0.001
	(0.003)

fat	0.005
	(0.057)

suger	0.029^**
	(0.014)

Constant	2.620^***
	(0.319)


Observations	50
R²	0.150
Adjusted R²	0.074
Residual Std. Error	0.483 (df = 45)
F Statistic	1.980 (df = 4; 45)

Note:	p<0.1; p<0.05; p<0.01

OLS: Do the same as above, but estimate manufacture fixed effects. Do your results change? If so how? What are the fixed effects capturing?

companygrouping <- b$company
#View(companygrouping)
#do a linear regression here 
#use an instrumental variable or more than one 
#need to group by company 
#then take the characterstics of the competitor to be instrumental variables 
#use mutate or group by if cant use his suggestions 


b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))

ols.2<-lm(delta_j~difprice+b$difprice.z+b$cal.z+b$sug.z+b$fat.z, data=b)
stargazer(ols.1,ols.2,type="html")


	Dependent variable:

	delta_j
	(1)	(2)

difprice	0.224	0.474^***
	(0.149)	(0.157)

cals	-0.001
	(0.003)

fat	0.005
	(0.057)

suger	0.029^**
	(0.014)

difprice.z		-0.355^*
		(0.197)

cal.z		0.005
		(0.003)

sug.z		-0.024
		(0.039)

fat.z		0.093
		(0.121)

Constant	2.620^***	2.230^***
	(0.319)	(0.554)


Observations	50	50
R²	0.150	0.222
Adjusted R²	0.074	0.134
Residual Std. Error	0.483 (df = 45)	0.468 (df = 44)
F Statistic	1.980 (df = 4; 45)	2.510^** (df = 5; 44)

Note:	p<0.1; p<0.05; p<0.01

Remember that a mean is \[\bar{x}=\frac{1}{n}\sum_i^n x_i\]

but let’s say that x is made up of $n_1$ observations of $\hat{x}$ and $n_2$ obserations of $\tilde{x}$ We can write the mean of $\hat{x}$ two ways \[\bar{\hat{x}}=\frac{1}{n_1}\sum_i^n \hat{x}_i\] or \[\bar{\hat{x}}=\frac{1}{n-n_2}(\sum_i^{n}x_i-\sum_j^{n_2}\tilde{x}_j)\]

The last function in the R chunk about the ave function does this version of the mean. You can see that it is useful when you want the mean from the other groups.

2SLS: using average characteristics for all other brands produced by the same manufacturer as brand j as instruments for $p_j$. Notice, these are within manufacturer means, but not including product j.

b$difprice.z<-(ave(b$difprice,factor(b$company),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$company),FUN=function(x) length(x))-1))
b$cal.z<-(ave(b$cals,factor(b$company),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$company),FUN=function(x) length(x))-1))
b$sug.z<-(ave(b$suger,factor(b$company),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$company),FUN=function(x) length(x))-1))
b$fat.z<-(ave(b$fat,factor(b$company),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$company),FUN=function(x) length(x))-1))

iv.reg1<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$company)|cals+fat+suger+factor(b$company)+b$difprice.z+b$cal.z+b$sug.z+b$fat.z,data=b)
stargazer(iv.reg1,type = "html")


	Dependent variable:

	delta_j

difprice	0.428
	(0.504)

cals	-0.001
	(0.004)

fat	-0.014
	(0.080)

suger	0.038^**
	(0.017)

company)KG	-0.075
	(0.172)

company)NB	0.566^*
	(0.331)

company)PT	0.374
	(0.261)

company)QK	0.683
	(0.548)

company)RL	0.266
	(0.414)

Constant	2.510^***
	(0.596)


Observations	50
R²	0.344
Adjusted R²	0.197
Residual Std. Error	0.450 (df = 40)

Note:	p<0.1; p<0.05; p<0.01

2SLS: using average characteristics for all other brands produced by rivals to the manufacturer as brand j as instruments for $p_j$. These are external to the manufacturer of product j who compete in the same segment

b$difprice.z2<-(ave(b$difprice,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$cal.z2<-(ave(b$cals,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$sug.z2<-(ave(b$suger,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,factor(b$sgmnt),FUN=function(x) length(x))-1))
b$fat.z2<-(ave(b$fat,factor(b$sgmnt),FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,factor(b$sgmnt),FUN=function(x) length(x))-1))

iv.reg2<-ivreg(delta_j~difprice+cals+fat+suger+factor(b$sgmnt)|cals+fat+suger+factor(b$sgmnt)+b$difprice.z2+b$cal.z2+b$sug.z2+b$fat.z2,data=b)

stargazer(iv.reg2,type = "html")


	Dependent variable:

	delta_j

difprice	0.913
	(0.621)

cals	0.002
	(0.005)

fat	-0.036
	(0.072)

suger	0.030
	(0.023)

sgmnt)Fam	-0.337
	(0.294)

sgmnt)Kids	-0.129
	(0.286)

Constant	2.260^***
	(0.726)


Observations	50
R²	-0.193
Adjusted R²	-0.360
Residual Std. Error	0.586 (df = 43)

Note:	p<0.1; p<0.05; p<0.01

2SLS: using average characteristics for all other brands not produced by manufacturer of product j as instruments for $p_j$
2SLS: using the average characteristics for all other brands besides product j.

b$difprice.z4<-(ave(b$difprice,FUN=function(x)
  sum(x)-b$difprice)/(ave(b$difprice,FUN=function(x) length(x))-1))
b$cal.z4<-(ave(b$cals,FUN=function(x)
  sum(x)-b$cals)/(ave(b$cals,FUN=function(x) length(x))-1))
b$sug.z4<-(ave(b$suger,FUN=function(x)
  sum(x)-b$suger)/(ave(b$suger,FUN=function(x) length(x))-1))
b$fat.z4<-(ave(b$fat,FUN=function(x)
  sum(x)-b$fat)/(ave(b$fat,FUN=function(x) length(x))-1))

iv.reg4<-ivreg(delta_j~difprice+cals+fat+suger|cals+fat+suger+b$difprice.z4+b$cal.z4+b$sug.z4+b$fat.z4,data=b)
stargazer(iv.reg4,type = "html")


	Dependent variable:

	delta_j

difprice	0.224
	(0.149)

cals	-0.001
	(0.003)

fat	0.005
	(0.057)

suger	0.029^**
	(0.014)

Constant	2.620^***
	(0.319)


Observations	50
R²	0.150
Adjusted R²	0.074
Residual Std. Error	0.483 (df = 45)

Note:	p<0.1; p<0.05; p<0.01

Which set of instrument produces the largest first stage F-statistic? What is the F-stat value?

The second set, OLS 2, produces the largest first stage F-statistic at 2.514.

BLP Assignment

Jennifer Russo, Alec Rabalais, Dzhamilya Chakhalidze, John Schulten, Qais Shaban, Amy Shah

MSBA Data Analytics III

Observations 2,810