if(!require(wooldridge)){
  install.packages("wooldridge")
  library(wooldridge)
}

## Loading required package: wooldridge

if(!require(ggplot2)){
  install.packages("ggplot2")
  library(ggplot2)
}

## Loading required package: ggplot2

if(!require(DescTools)){
  install.packages("DescTools")
  library(DescTools)
}

## Loading required package: DescTools

library(timetk)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyquant)

## Loading required package: lubridate

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

## Loading required package: PerformanceAnalytics

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

## Loading required package: quantmod

## Loading required package: TTR

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## 
## Attaching package: 'tidyquant'

## The following objects are masked from 'package:DescTools':
## 
##     IRR, NPV, PMT

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ stringr 1.5.0
## ✔ purrr   1.0.2     ✔ tibble  3.2.1
## ✔ readr   2.1.4     ✔ tidyr   1.3.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ xts::first()    masks dplyr::first()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ xts::last()     masks dplyr::last()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(fivethirtyeight)

## Some larger datasets need to be installed separately, like senators and
## house_district_forecast. To install these, we recommend you install the
## fivethirtyeightdata package by running:
## install.packages('fivethirtyeightdata', repos =
## 'https://fivethirtyeightdata.github.io/drat/', type = 'source')

library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
## 
## Attaching package: 'openintro'
## 
## The following object is masked from 'package:fivethirtyeight':
## 
##     drug_use
## 
## The following object is masked from 'package:DescTools':
## 
##     cards
## 
## The following object is masked from 'package:wooldridge':
## 
##     prison

library(ggridges)
library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(carData)
library(car)

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:openintro':
## 
##     densityPlot
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:DescTools':
## 
##     Recode

library(ggpubr)
library(DescTools)

data('wage1')
summary(wage1)

##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.330   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 4.650   Median :12.00   Median :13.50   Median : 2.000  
##  Mean   : 5.896   Mean   :12.56   Mean   :17.02   Mean   : 5.105  
##  3rd Qu.: 6.880   3rd Qu.:14.00   3rd Qu.:26.00   3rd Qu.: 7.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite          female          married           numdep     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1027   Mean   :0.4791   Mean   :0.6084   Mean   :1.044  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :6.000  
##       smsa           northcen         south             west       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.7224   Mean   :0.251   Mean   :0.3555   Mean   :0.1692  
##  3rd Qu.:1.0000   3rd Qu.:0.750   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman          trcommpu           trade       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.04563   Mean   :0.1141   Mean   :0.04373   Mean   :0.2871  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1008   Mean   :0.2586   Mean   :0.3669   Mean   :0.1673  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.2030   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.5369   Median : 182.5   Median :   4.00  
##  Mean   :0.1407   Mean   : 1.6233   Mean   : 473.4   Mean   :  78.15  
##  3rd Qu.:0.0000   3rd Qu.: 1.9286   3rd Qu.: 676.0   3rd Qu.:  49.00  
##  Max.   :1.0000   Max.   : 3.2181   Max.   :2601.0   Max.   :1936.00

Section 1: learn about the distribution of wage

Ideally, the distribution should be normal.

histogram/density, QQ plots, and hypotheis test

histogram

ggplot(data = wage1, aes(x = wage1$wage)) +
  geom_histogram() +
  labs(title = "wages", x = "wage")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

density

ggplot(data = wage1, aes(x = wage1$wage)) +
  geom_density() +
  labs(title = "wages", x = "wage")

QQ Plots

Which distribution model best describe the distribution of wages?

The density curve looks bell-shaped, so we check for normality.

PlotQQ(wage1$wage, main="QQ Plot for normal distribution")

From the density plot, it seems that exponential distribution is a good fit.

PlotQQ(wage1$wage, function(p) qexp(p, rate=1/10), main="QQ Plot for exponential distribution")

Hypothesis test for normality

shapiro.test(wage1$wage)

## 
##  Shapiro-Wilk normality test
## 
## data:  wage1$wage
## W = 0.80273, p-value < 2.2e-16

Section 2: regression analysis

Learn about the relationship between wages and education/experience. Which is more important for earning a high wage?

data('wage1')
summary(wage1)

##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.330   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 4.650   Median :12.00   Median :13.50   Median : 2.000  
##  Mean   : 5.896   Mean   :12.56   Mean   :17.02   Mean   : 5.105  
##  3rd Qu.: 6.880   3rd Qu.:14.00   3rd Qu.:26.00   3rd Qu.: 7.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite          female          married           numdep     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1027   Mean   :0.4791   Mean   :0.6084   Mean   :1.044  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :6.000  
##       smsa           northcen         south             west       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.7224   Mean   :0.251   Mean   :0.3555   Mean   :0.1692  
##  3rd Qu.:1.0000   3rd Qu.:0.750   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman          trcommpu           trade       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.04563   Mean   :0.1141   Mean   :0.04373   Mean   :0.2871  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1008   Mean   :0.2586   Mean   :0.3669   Mean   :0.1673  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.2030   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.5369   Median : 182.5   Median :   4.00  
##  Mean   :0.1407   Mean   : 1.6233   Mean   : 473.4   Mean   :  78.15  
##  3rd Qu.:0.0000   3rd Qu.: 1.9286   3rd Qu.: 676.0   3rd Qu.:  49.00  
##  Max.   :1.0000   Max.   : 3.2181   Max.   :2601.0   Max.   :1936.00

Linear regression models for wages against education/experience. The estimates of \(\beta_1\) tells about how strong the correlation is.

# education
fit_1 <- lm(log(wage)~educ, data=wage1)
summary(fit_1)

## 
## Call:
## lm(formula = log(wage) ~ educ, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21158 -0.36393 -0.07263  0.29712  1.52339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.583773   0.097336   5.998 3.74e-09 ***
## educ        0.082744   0.007567  10.935  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4801 on 524 degrees of freedom
## Multiple R-squared:  0.1858, Adjusted R-squared:  0.1843 
## F-statistic: 119.6 on 1 and 524 DF,  p-value: < 2.2e-16

# experience
fit_2 <- lm(log(wage)~exper, data=wage1)
summary(fit_2)

## 
## Call:
## lm(formula = log(wage) ~ exper, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.19701 -0.41580 -0.06912  0.31710  1.54254 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.549043   0.036995  41.872   <2e-16 ***
## exper       0.004362   0.001700   2.565   0.0106 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5287 on 524 degrees of freedom
## Multiple R-squared:  0.0124, Adjusted R-squared:  0.01052 
## F-statistic: 6.581 on 1 and 524 DF,  p-value: 0.01058

# both 
fit_3 <- lm(log(wage)~exper+educ, data=wage1)
summary(fit_3)

## 
## Call:
## lm(formula = log(wage) ~ exper + educ, data = wage1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05800 -0.30136 -0.04539  0.30601  1.44425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.216854   0.108595   1.997   0.0464 *  
## exper       0.010347   0.001555   6.653 7.24e-11 ***
## educ        0.097936   0.007622  12.848  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4614 on 523 degrees of freedom
## Multiple R-squared:  0.2493, Adjusted R-squared:  0.2465 
## F-statistic: 86.86 on 2 and 523 DF,  p-value: < 2.2e-16

Fit a curve using built-in geom_smooth() function in ggplot.

ggplot(data = wage1, aes(x = wage1$educ, y = wage1$wage)) +
  geom_point() +
  geom_smooth() +
  labs(title = "wages vs. education", x = "education", y = "wage")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(data = wage1, aes(x = wage1$exper, y = wage1$wage)) +
  geom_point() +
  geom_smooth() +
  labs(title = "wages vs. experience", x = "experience", y = "wage")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

The Income Distribution and Its Influence Factors

Xuanyang Lin

2023-10-29

Section 1: learn about the distribution of wage

Ideally, the distribution should be normal.

histogram/density, QQ plots, and hypotheis test

histogram

density

QQ Plots

Which distribution model best describe the distribution of wages?

The density curve looks bell-shaped, so we check for normality.

From the density plot, it seems that exponential distribution is a good fit.

Hypothesis test for normality

Section 2: regression analysis

Learn about the relationship between wages and education/experience. Which is more important for earning a high wage?

Linear regression models for wages against education/experience. The estimates of \(\beta_1\) tells about how strong the correlation is.

Fit a curve using built-in geom_smooth() function in ggplot.