rm(list = ls()) # removes data from global enviroment
graphics.off() # closes previous opened graphs
library("corrplot")## corrplot 0.84 loaded
## ── Attaching packages ───────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.0
## ✓ tidyr 1.1.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ──────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library("ggthemes")
source("http://online.sfsu.edu/mbar/ECON312_files/TTestFun.R") #Visualize t-testFor this report we will explore factors that impact Annual Income in America. This project is to showcase my abilities in visual & exploratory analysis, and to also provide a template for those curious about R programming. For each test we will use a significance level of 5%. This data frame was created by my Professor Michael Bar.
wage <- read.csv("http://online.sfsu.edu/mbar/ECON312_files/wage21.csv")
#original d.f can be found here:
"http://online.sfsu.edu/mbar/ECON312_files/wage21.csv"## [1] "http://online.sfsu.edu/mbar/ECON312_files/wage21.csv"
First lets explore our data set.
## # A tibble: 6 x 51
## ID FEMALE MALE ETHBLACK ETHHISP ETHWHITE AGE S EDUCPROF EDUCPHD
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 1065 1 0 0 0 1 43 12 0 0
## 2 12126 1 0 1 0 0 38 12 0 0
## 3 5708 1 0 0 0 1 44 14 0 0
## 4 1385 1 0 1 0 0 38 13 0 0
## 5 3042 1 0 0 0 1 40 12 0 0
## 6 1240 1 0 0 0 1 38 18 0 0
## # … with 41 more variables: EDUCMAST <int>, EDUCBA <int>, EDUCAA <int>,
## # EDUCHSD <int>, EDUCDO <int>, SINGLE <int>, MARRIED <int>, DIVORCED <int>,
## # FAITHN <int>, FAITHP <int>, FAITHC <int>, FAITHJ <int>, FAITHO <int>,
## # ASVAB01 <int>, ASVAB02 <int>, ASVAB03 <int>, ASVAB04 <int>, ASVAB05 <int>,
## # ASVAB06 <int>, ASVABC <dbl>, HEIGHT <int>, WEIGHT85 <int>, WEIGHT02 <int>,
## # SM <int>, SF <int>, SIBLINGS <int>, LIBRARY <int>, POV78 <int>, EXP <dbl>,
## # EARNINGS <dbl>, HOURS <int>, TENURE <dbl>, COLLBARG <int>, CATGOV <int>,
## # CATPRI <int>, CATSE <int>, URBAN <int>, REGNE <int>, REGNC <int>,
## # REGW <int>, REGS <int>
## # A tibble: 6 x 51
## ID FEMALE MALE ETHBLACK ETHHISP ETHWHITE AGE S EDUCPROF EDUCPHD
## <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 4359 0 1 0 1 0 43 17 0 0
## 2 3290 0 1 0 0 1 41 20 0 0
## 3 5267 0 1 0 0 1 42 13 0 0
## 4 3474 0 1 0 0 1 40 12 0 0
## 5 5229 0 1 0 0 1 41 14 0 0
## 6 2417 0 1 0 0 1 45 13 0 0
## # … with 41 more variables: EDUCMAST <int>, EDUCBA <int>, EDUCAA <int>,
## # EDUCHSD <int>, EDUCDO <int>, SINGLE <int>, MARRIED <int>, DIVORCED <int>,
## # FAITHN <int>, FAITHP <int>, FAITHC <int>, FAITHJ <int>, FAITHO <int>,
## # ASVAB01 <int>, ASVAB02 <int>, ASVAB03 <int>, ASVAB04 <int>, ASVAB05 <int>,
## # ASVAB06 <int>, ASVABC <dbl>, HEIGHT <int>, WEIGHT85 <int>, WEIGHT02 <int>,
## # SM <int>, SF <int>, SIBLINGS <int>, LIBRARY <int>, POV78 <int>, EXP <dbl>,
## # EARNINGS <dbl>, HOURS <int>, TENURE <dbl>, COLLBARG <int>, CATGOV <int>,
## # CATPRI <int>, CATSE <int>, URBAN <int>, REGNE <int>, REGNC <int>,
## # REGW <int>, REGS <int>
We notice some factor variables are inputted as integers. We must convert those to factors in order to build a better model.
With 540 observations the coding for this new variable is far to long for this report. I did the following to create it:
wage$RACE <- as.factor(c(“W”,“B”,“W”,“B”,“W”,“W”,“W”, etc)
# convert binary variables from integers to factors
wage$FEMALE <- as.factor(wage$FEMALE)
wage$MALE <- as.factor(wage$MALE)
wage$ETHBLACK <- as.factor(wage$ETHBLACK)
wage$ETHHISP <- as.factor(wage$ETHHISP)
wage$ETHWHITE <- as.factor(wage$ETHWHITE)
wage$EDUCPROF <- as.factor(wage$EDUCPROF)
wage$EDUCPHD <- as.factor(wage$EDUCPHD)
wage$EDUCMAST <- as.factor(wage$EDUCMAST)
wage$EDUCBA <- as.factor(wage$EDUCBA)
wage$EDUCAA <- as.factor(wage$EDUCAA)
wage$EDUCHSD <- as.factor(wage$EDUCHSD)
wage$EDUCDO <- as.factor(wage$EDUCDO)
wage$SINGLE <- as.factor(wage$SINGLE)
wage$MARRIED <- as.factor(wage$MARRIED)
wage$DIVORCED <- as.factor(wage$DIVORCED)
wage$FAITHN <- as.factor(wage$FAITHN)
wage$FAITHP <- as.factor(wage$FAITHP)
wage$FAITHC <- as.factor(wage$FAITHC)
wage$FAITHJ <- as.factor(wage$FAITHJ)
wage$FAITHO <- as.factor(wage$FAITHO)
wage$FAITHO <- as.factor(wage$FAITHO)
wage$LIBRARY <- as.factor(wage$LIBRARY)
wage$COLLBARG <- as.factor(wage$COLLBARG)
wage$CATGOV <- as.factor(wage$CATGOV)
wage$CATPRI <- as.factor(wage$CATPRI)
wage$CATSE <- as.factor(wage$CATSE)
wage$URBAN <- as.factor(wage$URBAN)
wage$REGNC <- as.factor(wage$REGNC)
wage$REGNE <- as.factor(wage$REGNE)
wage$REGS <- as.factor(wage$REGS)
wage$REGW <- as.factor(wage$REGW) Now that our binary variables are created, we must calculate annual income for each individual in this data set.
weeks <- 52 # number of weeks within a given year
wage <- wage %>%
mutate(AI = ((EARNINGS * HOURS)* weeks)/1000)%>%#creating new variable
select( AI, EARNINGS , HOURS, everything())%>% #moving 3 variables up front
mutate(RACE = as.character(RACE),
RACE = if_else(RACE == 'O', 'H', RACE),
RACE = as.factor(RACE))%>% #change "O" to "H"
select(-c(ID)) #removes ID
wage #views d.f as tibble ## # A tibble: 540 x 52
## AI EARNINGS HOURS FEMALE MALE ETHBLACK ETHHISP ETHWHITE AGE S
## <dbl> <dbl> <int> <fct> <fct> <fct> <fct> <fct> <int> <int>
## 1 26.0 20.8 24 1 0 0 0 1 43 12
## 2 27.4 13.2 40 1 0 1 0 0 38 12
## 3 41.7 20.0 40 1 0 0 0 1 44 14
## 4 57.5 27.6 40 1 0 1 0 0 38 13
## 5 46.0 22.1 40 1 0 0 0 1 40 12
## 6 63.2 30.4 40 1 0 0 0 1 38 18
## 7 31.3 15.0 40 1 0 0 0 1 44 12
## 8 46.0 22.1 40 1 0 0 0 1 39 15
## 9 11.2 8.62 25 1 0 0 0 1 37 12
## 10 56.8 27.3 40 1 0 0 0 1 44 12
## # … with 530 more rows, and 42 more variables: EDUCPROF <fct>, EDUCPHD <fct>,
## # EDUCMAST <fct>, EDUCBA <fct>, EDUCAA <fct>, EDUCHSD <fct>, EDUCDO <fct>,
## # SINGLE <fct>, MARRIED <fct>, DIVORCED <fct>, FAITHN <fct>, FAITHP <fct>,
## # FAITHC <fct>, FAITHJ <fct>, FAITHO <fct>, ASVAB01 <int>, ASVAB02 <int>,
## # ASVAB03 <int>, ASVAB04 <int>, ASVAB05 <int>, ASVAB06 <int>, ASVABC <dbl>,
## # HEIGHT <int>, WEIGHT85 <int>, WEIGHT02 <int>, SM <int>, SF <int>,
## # SIBLINGS <int>, LIBRARY <fct>, POV78 <int>, EXP <dbl>, TENURE <dbl>,
## # COLLBARG <fct>, CATGOV <fct>, CATPRI <fct>, CATSE <fct>, URBAN <fct>,
## # REGNE <fct>, REGNC <fct>, REGW <fct>, REGS <fct>, RACE <fct>
plot1 <- ggplot(wage, aes(AGE, AI))
(plot1 + geom_bar(stat = 'identity',
color = 'black',
fill = 'white')
+ labs( title = "Income Distribution vs Age",
x = "Age", y = "Annual Income (Thousands)")
+ theme_bw() +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01, decimal.mark = '.')))We see that our annual income for this data set is most centered between the ages 38-40. Each dash represents the specific annual income the individual made within the bar.
The objective is to observe the factors that influence annual income.
\[AI_i = \beta_1 + \beta_2S_i + \beta_3EXP_i + u_i\] Variables for the model:
\(AI\) Annual income
\(S\) Schooling
\(EXP\) Experience
\(u\) Is our error term (Other factor that affect AI other than S,EXP)
stargazer(as.data.frame(subset(wage,select = c(AI,S,EXP))), #converts back to type: data frame
type = 'text',
font.size = "small",
align = TRUE,
title = "Summary of Statistics",
digits = 2,
flip = T,
out = "table_summary.html")##
## Summary of Statistics
## ============================
## Statistic AI S EXP
## ----------------------------
## N 540 540 540
## Mean 59.87 13.55 16.67
## St. Dev. 52.08 2.41 4.57
## Min 3.74 7 0.88
## Pctl(25) 29.73 12 13.75
## Pctl(75) 70.99 16 20.18
## Max 574.85 20 23.63
## ----------------------------
Within the 540 observation, we see the average years of schooling is 13.55 meaning on average our participants at most completed 13 years of schooling. The average amount of years spent in the workforce is 16.67, meaning at most 16 years was invested in their field.
\[H_O: \beta_2 = 0\] \[H_A: \beta_2 \neq 0\]
\[H_O: \beta_3 = 0\] \[H_A: \beta_3 \neq 0\]
\[\widehat{AI}_i = b_1 + b_2S_i + b_3EXP_i\]
##
## Call:
## lm(formula = AI ~ S + EXP, data = wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -81.45 -24.23 -7.89 13.81 452.38
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -106.0750 14.8549 -7.141 3.03e-12 ***
## S 9.5868 0.8447 11.350 < 2e-16 ***
## EXP 2.1624 0.4463 4.845 1.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.57 on 537 degrees of freedom
## Multiple R-squared: 0.2033, Adjusted R-squared: 0.2004
## F-statistic: 68.53 on 2 and 537 DF, p-value: < 2.2e-16
Lets analyze the coefficients:
\(b_1 = -106.075\). Meaning without schooling or experience our individual’s annual income would be -$106,075.7. Although you don’t make negative dollars in a real-world setting, so we can neglect this coefficient from our study.
\(b_2S = 9.5868\). This model is insinuating that for every additional year of schooling, we can expect annual income to rise $9,586.8. Holding other regressors fixed within the model.
\(b_3EXP = 2.1624\). Meaning we can predict annual income to rise $2,162 for every additional year of experience added. Holding other regressors fixed within the model.
\(R^{2} = 0.2033\). Means that 20.33% of the variation in annual income is represented in this model.
alpha = 0.05 #Significance level of the test
df = 537 #Degrees of freedom of the t distriution
test_type = "twosided" #Type of test
TTestFun(alpha,df,test_type,1001,5) #Visualizing the t-testThe t-value (\(t_v\)) measures the distance between the hypothesized value of \(\beta\) and the estimated value of \(\beta\). The farther the distance, the less likely our null hypothesis is true. If our t-value is past such critical value (\(t_c\)), then we can reject our null hypothesis. As we observe we notice \(t_v\) > \(t_c\) for both variables, meaning we can reject our null hypothesis at significance leve 5% and conclude that schooling and experience has some influence on annual income.
A more convenient way to perform our testing is by observing the probability value (p-value). Under the assumption that the null is correct, the p-value gives us the probability of obtaining such result at extreme levels. Again, we test at a significance level of 5%, we can reject the null if this condition is met (\(p-value < .05\)).
p <- summary(md1)$coeff["S",4]#p-values for coefficient on S
paste("p-value(beta2) = ", p) #paste result## [1] "p-value(beta2) = 6.55857422728292e-27"
p <- summary(md1)$coeff["EXP",4]#p-values for coefficient on EXP
paste("p-value(beta3) = ", p) #paste result## [1] "p-value(beta3) = 1.65677834417056e-06"
Both estimates are significantly small, so we can reject the null hypothesis and conclude schooling and experience has some influence on annual income.
C_V <-cor(wage[,c("AI","S","EXP")])
corrplot(C_V, type="upper", order="hclust",
col=brewer.pal(n=8, name="RdYlBu"))cor_test_AI_S <- cor.test(wage$AI,wage$S) #Between AI and Schooling
cor_test_AI_EXP <- cor.test(wage$AI,wage$EXP) #Between AI and Experience
cor_test_S_EXP <- cor.test(wage$S,wage$EXP) #Between Schooling and Experience
print(cor_test_AI_S)##
## Pearson's product-moment correlation
##
## data: wage$AI and wage$S
## t = 10.442, df = 538, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3378185 0.4783059
## sample estimates:
## cor
## 0.4104952
##
## Pearson's product-moment correlation
##
## data: wage$AI and wage$EXP
## t = 2.5803, df = 538, p-value = 0.01014
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02643067 0.19313750
## sample estimates:
## cor
## 0.1105617
##
## Pearson's product-moment correlation
##
## data: wage$S and wage$EXP
## t = -4.1965, df = 538, p-value = 3.171e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.25852829 -0.09508514
## sample estimates:
## cor
## -0.1780343
We observe a negative correlation between the two variables school, and experience. This could be due to the fact for every additional year a school some time in experience is forgone.
colors <- c("blue", "pink") #aasigning colors to 3d plot
colors <- colors[as.numeric(wage$FEMALE)] # blue = male, pink = female
plot3D <- scatterplot3d(wage$S, wage$EXP, wage$AI,
pch=20,
color = colors,
grid=T,
angle = 45,
box=FALSE,
xlab="Schooling",
ylab="EXP",
zlab="Annual Income (Thousands)",
main="Annual Income vs Schooling & Experience")
legend("top", legend = c("Male", "Female"),
col = c("blue", "pink") ,pch = 16) #adding the legend
plot3D$plane3d(md1,col="black") #Fitted planeVisuals of our model with gender (those that identify as “female”, or “male”) being identified. We see gender also has some type of difference on our annual income.
Before we explore the model let us visualize the data.
plotb <- ggplot(wage, aes(RACE, AI))
(plotb + geom_boxplot(aes(color=RACE), size = 1) +
theme_bw() +
labs( title = "Annual Income by race",
x = "Race",
y = "Annual Income (Thousands)") +
scale_x_discrete(labels = c("B" = "Black","O" = "Hispanic", "W" = "White"))+
scale_color_discrete(name = "Race", labels = c("Black", "Hispanic", "White"))+
theme(legend.position = 'right',
plot.title = element_text(size=22,hjust = 0.5), legend.title = element_text(size = 14)) + # plot.tittle (legend position,plot format,legend format )
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')))The income distribution among race.
plot2 <- ggplot(wage, aes(S, AI))
(plot2 + geom_jitter(aes(color = RACE)) +
theme_bw() +
labs( title = "Annual Income vs Schooling",
x = "Schooling",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(
labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')))plot3 <- ggplot(wage, aes(S, AI))
(plot3 + geom_jitter(aes(color = RACE)) +
theme_bw() +
labs( title = "Annual Income vs Schooling",
x = "Schooling",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')) +
facet_wrap(FEMALE ~ RACE)) #0:Male 1:Female\(1 = Female\) \(0 = Male\)
plot4 <- ggplot(wage, aes(EXP, AI))
(plot4 + geom_jitter(aes(color = RACE)) +
theme_bw() +
labs( title = "Annual Income vs Schooling",
x = "Experience",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')))\(1 = Female\) \(0 = Male\)
plot5 <- ggplot(wage, aes(EXP, AI))
(plot5 + geom_jitter(aes(color = RACE)) +
theme_bw() +
labs( title = "Annual Income vs Experience",
x = "Experience",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')) +
facet_wrap(FEMALE ~ RACE)) #0:Male 1:Female\(1 = Female\) \(0 = Male\)
The objective is to see if race has any influence on annual income.
\[H_O: \beta_4 = 0\] \[H_A: \beta_4 \neq 0\]
\[H_O: \beta_5 = 0\] \[H_A: \beta_5 \neq 0\]
\[H_O: \beta_6 = 0\] \[H_A: \beta_6 \neq 0\]
\[AI_i = \beta_1 + \beta_2S_i + \beta_3EXP_i + \beta_4FEMALE_i + \beta_5ETHBLACK_i + \beta_6ETHHISP_i + u_i\] New variables for the model:
\(FEMALE\) Gender of individual
\(ETHBLACK\) Black individual
\(ETHHISP\) Hispanic individual
White individuals, and males are omitted
\[\widehat{AI}_i = b_1 + b_2S_i + b_3EXP_i + b_4FEMALE_i + b_5ETHBLACK_i + b_6ETHHISP_i \]
md2 <- lm(AI ~ S + EXP + FEMALE + ETHBLACK + ETHHISP, wage) #predicted model wiith sex 'Male' & race 'White' being omitted
summary(md2)##
## Call:
## lm(formula = AI ~ S + EXP + FEMALE + ETHBLACK + ETHHISP, data = wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.01 -22.25 -6.77 11.65 442.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -76.8515 15.5599 -4.939 1.05e-06 ***
## S 9.2241 0.8317 11.091 < 2e-16 ***
## EXP 1.4448 0.4517 3.198 0.00146 **
## FEMALE1 -24.6748 4.0102 -6.153 1.49e-09 ***
## ETHBLACK1 -2.9084 6.3558 -0.458 0.64742
## ETHHISP1 5.4748 8.4285 0.650 0.51625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.06 on 534 degrees of freedom
## Multiple R-squared: 0.2583, Adjusted R-squared: 0.2513
## F-statistic: 37.19 on 5 and 534 DF, p-value: < 2.2e-16
Lets analyze the coefficients:
\(b_2S = 9.2241\). we can predict annual income to rise $9,224.1 for every additional unit (one year) of schooling added.
\(b_3EXP = 1.4448\). Meaning we can predict annual income to rise $1,444.8 for every additional year of experience added.
\(b_4FEMALE = -24.6749\). We can see that compared to men; women earn -$24,674.9 less in annual income. We gain this result holding other regressors fixed, while also omitting the gender male within the model.
\(b_5ETHBLACK = -2.9085\). We can see that compared to White people; black people earn -$2,908.5 less in annual income. We obtain this result holding other regressors fixed, while also omitting the white race within the model.
\(b_6ETHHISP = 5.4749\). Compared to White people, Hispanics earn $5,474.9 more in annual income. We obtain this result holding other regressors fixed, while also omitting the white race within the model.
\(R^{2} = .2583\). Means that 25.83% of the variation is represented in this model.
## [1] "p-value(beta4) = 1.49267470924784e-09"
## [1] "p-value(beta5) = 0.647421725759299"
## [1] "p-value(beta6) = 0.516253056658268"
Now we observe five variations:
\(b_4FEMALE\): We can reject the null hypothesis and say gender has some impact on annual income. Concluding this result as statistically significant. \(p-value < .05\)
\(b_5ETHBLACK\): We fail to reject the null hypothesis and conclude this result is not statistically significant. \(p-value > .05\)
\(b_6ETHHISP\): We fail to reject the null hypothesis and conclude this result is not statistically significant. \(p-value > .05\)
Suppose we want to see if women of their respective races experience a disparity in annual income.
\[AI_i = \beta_1 + \beta_2S_i + \beta_3EXP_i + \beta_4FEMALE_i + \beta_5ETHBLACK_i + \beta_6ETHHISP_i + \beta_7ETHBLACK*F_i + \beta_8ETHHISP*F_i + u_i\] New variables for the model:
\(FEMALE*ETHBLACK\) Interaction term for black women
\(FEMALE*ETHHISP\) Interaction term for hispanic women
White individuals, and males are omitted
\[\widehat{AI}_i = b_1 + b_2S_i + b_3EXP_i + b_4FEMALE_i + b_5ETHBLACK_i + b_6ETHHISP_i + b_7ETHBLACK*F_i + b_6ETHHISP*F_i \]
md3 <- lm(AI ~ S + EXP + FEMALE + ETHBLACK + ETHHISP +
ETHBLACK*FEMALE + ETHHISP*FEMALE , wage)
summary(md3)##
## Call:
## lm(formula = AI ~ S + EXP + FEMALE + ETHBLACK + ETHHISP + ETHBLACK *
## FEMALE + ETHHISP * FEMALE, data = wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90.00 -22.45 -6.37 11.70 442.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -75.8702 15.5720 -4.872 1.46e-06 ***
## S 9.1800 0.8323 11.029 < 2e-16 ***
## EXP 1.4571 0.4520 3.224 0.00134 **
## FEMALE1 -25.8595 4.3378 -5.961 4.56e-09 ***
## ETHBLACK1 -13.3621 10.1396 -1.318 0.18814
## ETHHISP1 8.8920 10.5314 0.844 0.39887
## FEMALE1:ETHBLACK1 16.7465 12.8090 1.307 0.19164
## FEMALE1:ETHHISP1 -10.2512 17.4543 -0.587 0.55724
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 45.06 on 532 degrees of freedom
## Multiple R-squared: 0.2613, Adjusted R-squared: 0.2516
## F-statistic: 26.89 on 7 and 532 DF, p-value: < 2.2e-16
\(b_7F*B = 16.7465\). The interaction between both binary variables show the difference between black women and white men. It states that on average black women make $16,746.5 less annually when compared to white men. Although results are not statistically significant.
\(b_7F*H = -10.2509\). Both binary variables show the difference between Hispanic women and white men. It states the average difference Hispanic women makes $10,251.2 more compared to white men. Although results are not statistically significant.
stargazer(md1, md2, md3,
type="text",
intercept.bottom = FALSE,
style = 'qje',
dep.var.labels="Annual Income",
covariate.labels=c(NA,"School","Experience","Female",
"Hispanic","Black","Black Female",
"Hispanic Female"),
out="md_multi.htm")##
## ===========================================================================================
## Annual Income
## (1) (2) (3)
## -------------------------------------------------------------------------------------------
## Constant -106.075*** -76.851*** -75.870***
## (14.855) (15.560) (15.572)
##
## School 9.587*** 9.224*** 9.180***
## (0.845) (0.832) (0.832)
##
## Experience 2.162*** 1.445*** 1.457***
## (0.446) (0.452) (0.452)
##
## Female -24.675*** -25.860***
## (4.010) (4.338)
##
## Hispanic -2.908 -13.362
## (6.356) (10.140)
##
## Black 5.475 8.892
## (8.428) (10.531)
##
## Black Female 16.746
## (12.809)
##
## Hispanic Female -10.251
## (17.454)
##
## N 540 540 540
## R2 0.203 0.258 0.261
## Adjusted R2 0.200 0.251 0.252
## Residual Std. Error 46.572 (df = 537) 45.063 (df = 534) 45.055 (df = 532)
## F Statistic 68.530*** (df = 2; 537) 37.191*** (df = 5; 534) 26.888*** (df = 7; 532)
## ===========================================================================================
## Notes: ***Significant at the 1 percent level.
## **Significant at the 5 percent level.
## *Significant at the 10 percent level.
To determine the best model, we observe our adjusted R value. The bigger the value amongst the given models, the better fit for our data. ‘md3’ seems to be the better fit out of the three.
plot6 <- ggplot(wage, aes(EXP, AI))
(plot6 + geom_jitter(aes(color = RACE)) +
geom_smooth(method = 'lm', data = md3, formula = "y~x" ,color = 'black', show.legend = F) +
theme_bw() +
labs( title = "Annual Income vs Experience",
x = "Experience",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')) +
facet_wrap(FEMALE ~ RACE)) #0:Male 1:Femaleplot7 <- ggplot(wage, aes(S, AI))
(plot7 + geom_jitter(aes(color = RACE)) +
geom_smooth(method = 'lm', data = md3, formula = "y~x" ,color = 'black', show.legend = F) +
theme_bw() +
labs( title = "Annual Income vs Schooling",
x = "Schooling",
y = "Annual Income (Thousands)") +
theme(legend.position = "top") +
scale_color_manual(name = "Race", labels = c("Black","Hispanic","White"),
values = c("red", "yellow", "blue")) +
scale_y_continuous(labels = scales::dollar_format(accuracy = 0.01,
decimal.mark = '.')) +
facet_wrap(FEMALE ~ RACE)) #0:Male 1:FemaleThis project is not meant to neglect the actually disparities that are influenced by race. This data frame is not a reflection of actual income in America as it was custom made.