if(!require(wooldridge)){
install.packages("wooldridge")
library(wooldridge)
}
## Loading required package: wooldridge
if(!require(ggplot2)){
install.packages("ggplot2")
library(ggplot2)
}
## Loading required package: ggplot2
if(!require(DescTools)){
install.packages("DescTools")
library(DescTools)
}
## Loading required package: DescTools
library(timetk)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
##
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'tidyquant'
## The following objects are masked from 'package:DescTools':
##
## IRR, NPV, PMT
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ✔ readr 2.1.4 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first() masks dplyr::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ xts::last() masks dplyr::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(fivethirtyeight)
## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
##
## Attaching package: 'openintro'
##
## The following object is masked from 'package:fivethirtyeight':
##
## drug_use
##
## The following object is masked from 'package:DescTools':
##
## cards
##
## The following object is masked from 'package:wooldridge':
##
## prison
library(ggridges)
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(carData)
library(car)
##
## Attaching package: 'car'
##
## The following object is masked from 'package:openintro':
##
## densityPlot
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:DescTools':
##
## Recode
library(ggpubr)
library(DescTools)
data('wage1')
summary(wage1)
## wage educ exper tenure
## Min. : 0.530 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.: 3.330 1st Qu.:12.00 1st Qu.: 5.00 1st Qu.: 0.000
## Median : 4.650 Median :12.00 Median :13.50 Median : 2.000
## Mean : 5.896 Mean :12.56 Mean :17.02 Mean : 5.105
## 3rd Qu.: 6.880 3rd Qu.:14.00 3rd Qu.:26.00 3rd Qu.: 7.000
## Max. :24.980 Max. :18.00 Max. :51.00 Max. :44.000
## nonwhite female married numdep
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1027 Mean :0.4791 Mean :0.6084 Mean :1.044
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :6.000
## smsa northcen south west
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.7224 Mean :0.251 Mean :0.3555 Mean :0.1692
## 3rd Qu.:1.0000 3rd Qu.:0.750 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## construc ndurman trcommpu trade
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.04563 Mean :0.1141 Mean :0.04373 Mean :0.2871
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## services profserv profocc clerocc
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1008 Mean :0.2586 Mean :0.3669 Mean :0.1673
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## servocc lwage expersq tenursq
## Min. :0.0000 Min. :-0.6349 Min. : 1.0 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 1.2030 1st Qu.: 25.0 1st Qu.: 0.00
## Median :0.0000 Median : 1.5369 Median : 182.5 Median : 4.00
## Mean :0.1407 Mean : 1.6233 Mean : 473.4 Mean : 78.15
## 3rd Qu.:0.0000 3rd Qu.: 1.9286 3rd Qu.: 676.0 3rd Qu.: 49.00
## Max. :1.0000 Max. : 3.2181 Max. :2601.0 Max. :1936.00
Section 1: learn about the distribution of wage
Ideally, the distribution should be normal.
histogram/density, QQ plots, and hypotheis test
histogram
ggplot(data = wage1, aes(x = wage1$wage)) +
geom_histogram() +
labs(title = "wages", x = "wage")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

density
ggplot(data = wage1, aes(x = wage1$wage)) +
geom_density() +
labs(title = "wages", x = "wage")

QQ Plots
Which distribution model best describe the distribution of
wages?
The density curve looks bell-shaped, so we check for normality.
PlotQQ(wage1$wage, main="QQ Plot for normal distribution")

From the density plot, it seems that exponential distribution is a
good fit.
PlotQQ(wage1$wage, function(p) qexp(p, rate=1/10), main="QQ Plot for exponential distribution")

Hypothesis test for normality
shapiro.test(wage1$wage)
##
## Shapiro-Wilk normality test
##
## data: wage1$wage
## W = 0.80273, p-value < 2.2e-16
Section 2: regression analysis
Learn about the relationship between wages and education/experience.
Which is more important for earning a high wage?
data('wage1')
summary(wage1)
## wage educ exper tenure
## Min. : 0.530 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.: 3.330 1st Qu.:12.00 1st Qu.: 5.00 1st Qu.: 0.000
## Median : 4.650 Median :12.00 Median :13.50 Median : 2.000
## Mean : 5.896 Mean :12.56 Mean :17.02 Mean : 5.105
## 3rd Qu.: 6.880 3rd Qu.:14.00 3rd Qu.:26.00 3rd Qu.: 7.000
## Max. :24.980 Max. :18.00 Max. :51.00 Max. :44.000
## nonwhite female married numdep
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1027 Mean :0.4791 Mean :0.6084 Mean :1.044
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :6.000
## smsa northcen south west
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.7224 Mean :0.251 Mean :0.3555 Mean :0.1692
## 3rd Qu.:1.0000 3rd Qu.:0.750 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## construc ndurman trcommpu trade
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.04563 Mean :0.1141 Mean :0.04373 Mean :0.2871
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## services profserv profocc clerocc
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1008 Mean :0.2586 Mean :0.3669 Mean :0.1673
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## servocc lwage expersq tenursq
## Min. :0.0000 Min. :-0.6349 Min. : 1.0 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 1.2030 1st Qu.: 25.0 1st Qu.: 0.00
## Median :0.0000 Median : 1.5369 Median : 182.5 Median : 4.00
## Mean :0.1407 Mean : 1.6233 Mean : 473.4 Mean : 78.15
## 3rd Qu.:0.0000 3rd Qu.: 1.9286 3rd Qu.: 676.0 3rd Qu.: 49.00
## Max. :1.0000 Max. : 3.2181 Max. :2601.0 Max. :1936.00
Linear regression models for wages against education/experience. The
estimates of \(\beta_1\) tells about
how strong the correlation is.
# education
fit_1 <- lm(log(wage)~educ, data=wage1)
summary(fit_1)
##
## Call:
## lm(formula = log(wage) ~ educ, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21158 -0.36393 -0.07263 0.29712 1.52339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.583773 0.097336 5.998 3.74e-09 ***
## educ 0.082744 0.007567 10.935 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4801 on 524 degrees of freedom
## Multiple R-squared: 0.1858, Adjusted R-squared: 0.1843
## F-statistic: 119.6 on 1 and 524 DF, p-value: < 2.2e-16
# experience
fit_2 <- lm(log(wage)~exper, data=wage1)
summary(fit_2)
##
## Call:
## lm(formula = log(wage) ~ exper, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.19701 -0.41580 -0.06912 0.31710 1.54254
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.549043 0.036995 41.872 <2e-16 ***
## exper 0.004362 0.001700 2.565 0.0106 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5287 on 524 degrees of freedom
## Multiple R-squared: 0.0124, Adjusted R-squared: 0.01052
## F-statistic: 6.581 on 1 and 524 DF, p-value: 0.01058
# both
fit_3 <- lm(log(wage)~exper+educ, data=wage1)
summary(fit_3)
##
## Call:
## lm(formula = log(wage) ~ exper + educ, data = wage1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05800 -0.30136 -0.04539 0.30601 1.44425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.216854 0.108595 1.997 0.0464 *
## exper 0.010347 0.001555 6.653 7.24e-11 ***
## educ 0.097936 0.007622 12.848 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4614 on 523 degrees of freedom
## Multiple R-squared: 0.2493, Adjusted R-squared: 0.2465
## F-statistic: 86.86 on 2 and 523 DF, p-value: < 2.2e-16
Fit a curve using built-in geom_smooth() function in ggplot.
ggplot(data = wage1, aes(x = wage1$educ, y = wage1$wage)) +
geom_point() +
geom_smooth() +
labs(title = "wages vs. education", x = "education", y = "wage")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(data = wage1, aes(x = wage1$exper, y = wage1$wage)) +
geom_point() +
geom_smooth() +
labs(title = "wages vs. experience", x = "experience", y = "wage")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
