# Load the data
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-523-fall-2019/master/labs/data/IncomeHappiness.csv"
dat <- read.csv(URL)

# Fit the quadratic regression model
model <- lm(happiness ~ income + I(income^2), data = dat)

# Predict y_hat using the model
dat$y_hat <- predict(model, newdata = dat)

# Plot the data and the fitted quadratic regression line
plot(dat$income, dat$happiness, 
     xlab="Income (Thousands of Dollars)", ylab="Happiness Scale",
     main="Does Money Make You Happy?",
     pch=19, col="darkorange", bty="n",
     xaxt="n")

axis(side=1, at=c(0,50000,100000,150000,200000), labels=c("$0","$50k","$100k","$150k","$200k"))
lines(dat$income, dat$y_hat, col=gray(0.3,0.5), lwd=6)

Questions

Q1. Read the study below, and then use the dataset called “IncomeHappiness.csv” to estimate the following model:

\(Happiness = b_0+b_1 Income+ b_2 (Income)^2+e\)

y.happy <- dat$happiness
x.income <- dat$income 
x2.income.sq <- x.income*x.income
options( scipen=8 )
summary( lm( y.happy ~ x.income + x2.income.sq ) )
## 
## Call:
## lm(formula = y.happy ~ x.income + x2.income.sq)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.1420  -3.9703  -0.0493   3.9720  20.4357 
## 
## Coefficients:
##                       Estimate        Std. Error t value Pr(>|t|)    
## (Intercept)  35.34826872249546  0.36139931707627   97.81   <2e-16 ***
## x.income      0.00073610233669  0.00000887024309   82.99   <2e-16 ***
## x2.income.sq -0.00000000251607  0.00000000004385  -57.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.806 on 1997 degrees of freedom
## Multiple R-squared:  0.8829, Adjusted R-squared:  0.8828 
## F-statistic:  7529 on 2 and 1997 DF,  p-value: < 2.2e-16
# divide income by 10,000
y.happy <- dat$happiness
x.income <- dat$income / 10000  
x2.income.sq <- x.income*x.income
summary( lm( y.happy ~ x.income + x2.income.sq ) )
## 
## Call:
## lm(formula = y.happy ~ x.income + x2.income.sq)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.1420  -3.9703  -0.0493   3.9720  20.4357 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  35.348269   0.361399   97.81   <2e-16 ***
## x.income      7.361023   0.088702   82.99   <2e-16 ***
## x2.income.sq -0.251607   0.004385  -57.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.806 on 1997 degrees of freedom
## Multiple R-squared:  0.8829, Adjusted R-squared:  0.8828 
## F-statistic:  7529 on 2 and 1997 DF,  p-value: < 2.2e-16

Q2. How much happiness do you gain making an extra $10k when your initial income is $15k?

# replace with model coefficients 
b0 <- 35.348269
b1 <- 7.361023
b2 <- -0.251607

x <- 15  # use 15000 if you did not rescale above
happy.15k <- b0 + b1*x + b2*x*x

x <- 25  # use 25000 if you did not rescale above
happy.25k <- b0 + b1*x + b2*x*x

happy.25k - happy.15k  # marginal effect of $10k increase at $15k starting salary
## [1] -27.03257

[1] We gain 27.03257 units of happiness from gaining an extra $10,000 when our initial income is $15,000.

Q3. How much happiness do you gain making an extra $10k when your initial income is $75k?

# replace with model coefficients 
b0 <- 35.348269
b1 <- 7.361023
b2 <- -0.251607

x <- 75  # use 15000 if you did not rescale above
happy.75k <- b0 + b1*x + b2*x*x

x <- 85  # use 25000 if you did not rescale above
happy.85k <- b0 + b1*x + b2*x*x

happy.85k - happy.75k  # marginal effect of $10k increase at $75k starting salary
## [1] -328.961

We will gain 328.961 units of happiness from making an extra $10,000 when our beginning income is $75,000.

Q4. How much happiness do you gain making an extra $10k when your initial income is $100k?

# replace with model coefficients 
b0 <- 35.348269
b1 <- 7.361023
b2 <- -0.251607

x <- 100  # use 15000 if you did not rescale above
happy.100k <- b0 + b1*x + b2*x*x

x <- 110  # use 25000 if you did not rescale above
happy.110k <- b0 + b1*x + b2*x*x

happy.110k - happy.100k  # marginal effect of $10k increase at $75k starting salary
## [1] -454.7645

We will gain 454.7645 units of happiness from making an extra $10,000 when our beginning income is $100,000.

Part II: Outliers

For this part of the final assignment you will be using a dataset that examines compensation of nonprofit executive directors from the years 2012-2013. The data is extracted from the IRS E-Filer database available on AWS.

install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
URL <- "https://github.com/DS4PS/cpp-523-fall-2019/blob/master/labs/data/np-comp-data.rds?raw=true"
dat <- readRDS(gzcon(url( URL )))
set.seed( 1234 )
d2 <- sample_n( dat, 2000 )

The full codebook is below for reference. We will use these two variables for the model that predicts nonprofit executive salaries based upon the size of the nonprofit.

plot( log(d2$REVENUE), log(d2$SALARY), bty="n", pch=19, col="darkorange",
      xlab="Nonprofit Revenue (logged)", ylab="Executive Director Salary (logged)",
      xlim=c(5,25), ylim=c(5,16))
abline( h=seq( 1, 20, 0.5 ), col=gray(0.5,0.2), lwd=1 )
abline( v=seq( 1, 25, 0.5 ), col=gray(0.5,0.2), lwd=1 )
abline( lm( log(d2$SALARY) ~ log(d2$REVENUE) ), col=gray(0.5,0.5), lwd=3 )

plot( log(d2$REVENUE), log(d2$SALARY), bty="n", pch=19, col=gray(0.5,0.2), cex=1.2,
      xlab="Nonprofit Revenue (logged)", ylab="Executive Director Salary (logged)",
      xlim=c(5,25), ylim=c(5,16))

abline( lm( log(d2$SALARY) ~ log(d2$REVENUE) ), col="darkorange", lwd=3 )

points( mean(log(d2$REVENUE)), mean(log(d2$SALARY)), pch=19, col="darkorange", cex=2 )
points( c(8,8), c(12,6),
         cex=3, col="steelblue", lwd=2 )
points( c(8,8), c(12,6),
         cex=1.5, col="steelblue", pch=19 )
text( c(8,8), c(12,6), c("A","B"), 
      pos=4, offset=1.2, col="steelblue", cex=2  )

The image in your template may differ because it’s based on a random sample of 2,000 points to reduce data density so the visualization is not cluttered. Use this image to answer the question.

Q1. What is the likely impact of the outlier “A” on the regression line?

  • Will it make the slope larger or smaller? The slope will become smaller.

  • Would it contribute to a Type I or Type II error? Type 2 Error.

Q2. What is the likely impact of the outlier “B” on the regression line?

  • Will it make the slope larger or smaller? The slope will become larger.

  • Would it contribute to a Type I or Type II error? Type 1 Error.

Q3. The average logged revenue of a nonprofit in this data is 14.44879. What does that translate to in normal dollars? Use the exp() function.

Note there is a decimal after the 14: 14.44879

exp(14.44879)
## [1] 1883778

The average revenue of a nonprofit is $1,883,778.

Q4. What would be the typical salary for a director of an averaged-size nonprofit (log(14.44879) in revenue a year)?

\(log(Salary) = 6.367 + 0.343 \cdot log(Revenue)\)

\(log(Salary) = 6.367 + 0.343 \cdot log(14.44879)\) = log(6.7648221973)

exp(6.7648221973)
## [1] 866.8121

The salary for a director of an average nonprofit would be $866.812

install.packages("stargazer")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
m <- lm( log(SALARY) ~ log(REVENUE), data=d2 )
stargazer( m, type="html",
           omit.stat = c("rsq","f","ser"),
           notes.label = "Standard errors in parentheses" )
Dependent variable:
log(SALARY)
log(REVENUE) 0.354***
(0.009)
Constant 6.193***
(0.128)
Observations 2,000
Adjusted R2 0.445
Standard errors in parentheses p<0.1; p<0.05; p<0.01

Q5. Interpret the coefficient b1 (slope for log of revenue) in the model above (i.e. “a one-unit change in X corresponds with a b1-unit change in Y”, but adjusted for the log-log context). See the hand-out for guidance.

A 1% change in x, log(Revenue), will result in a 0.354% change in y, log(Salary).