library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.2
#importing IPUMS data
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")

Preliminary Investigation for Total Incomes versus Earned Income in both Genders, Males, and Females

Reducing the dataset by only taking people in the labor force (labforce==2) and codes (inctot positive and not 9999999) - which might be redundant, but I did it anyway.

Preliminary investigation for total incomes versus earned income:

myworkpums<-ipums%>%
  filter(incwage>0)%>%
  filter(incwage<9999998)%>%
  filter(labforce==2)

myworkfpums<-ipums%>% #females
  filter(incwage>0)%>%
  filter(incwage<9999998)%>%
  filter(labforce==2)%>%
  filter(sex==2)

myworkmpums<-ipums%>% #males
  filter(incwage>0)%>%
  filter(incwage<9999998)%>%
  filter(labforce==2)%>%
  filter(sex==1)

Both Genders

Creating the regression line as shown in class:

myall<-lm(incwage~inctot, data=myworkpums)
coef(myall)
## (Intercept)      inctot 
## 3858.021061    0.847338

The result is a regression line for Wages and Saleries = $3858.02 + .847338 * Total Income.

Plotting the regression line and values:

ggplot(myworkpums, aes(x=incwage, y=inctot))+
  geom_point()+geom_smooth(method = "lm", se = FALSE)+
  ggtitle(label = "Regression Line  Wages and Saleries = $3858.02 + .847338 * Total Income")+
  xlab("Wages and Saleries")+
  ylab("Total Income")

Normality of residuals (which=2):

plot(myall,which=2)

Summary

summary(myall)
## 
## Call:
## lm(formula = incwage ~ inctot, data = myworkpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -569018   -2331     722    4538   96594 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.858e+03  6.629e+01    58.2   <2e-16 ***
## inctot      8.473e-01  7.733e-04  1095.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18920 on 133979 degrees of freedom
## Multiple R-squared:  0.8996, Adjusted R-squared:  0.8996 
## F-statistic: 1.201e+06 on 1 and 133979 DF,  p-value: < 2.2e-16
confint(myall)
##                    2.5 %       97.5 %
## (Intercept) 3728.0876550 3987.9544670
## inctot         0.8458223    0.8488536

Breusch-Pagan Test:

bptest(myall)
## 
##  studentized Breusch-Pagan test
## 
## data:  myall
## BP = 24829, df = 1, p-value < 2.2e-16

Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:

ks.test(x=myworkpums$incwage,y=myworkpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkpums$incwage, y = myworkpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  myworkpums$incwage and myworkpums$inctot
## D = 0.038334, p-value < 2.2e-16
## alternative hypothesis: two-sided

Males

Creating the regression line as shown in class:

mymale<-lm(incwage~inctot, data=myworkmpums)
coef(mymale)
##  (Intercept)       inctot 
## 4994.7239836    0.8374517

The result is a regression line for Wages and Saleries = $4994.72 + .8374517 * Total Income.

Plotting the regression line and values:

ggplot(myworkmpums, aes(x=incwage, y=inctot))+
  geom_point()+geom_smooth(method = "lm", se = FALSE)+
  ggtitle(label = "Regression Line  Wages and Saleries = $4994.72 + .8374517 * Total Income")+
  xlab("Wages and Saleries")+
  ylab("Total Income")

Normality of residuals (which=2):

plot(mymale,which=2)

Summary

summary(mymale)
## 
## Call:
## lm(formula = incwage ~ inctot, data = myworkmpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -562394   -3044     694    5571  101962 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.995e+03  1.102e+02   45.32   <2e-16 ***
## inctot      8.375e-01  1.077e-03  777.24   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22570 on 69540 degrees of freedom
## Multiple R-squared:  0.8968, Adjusted R-squared:  0.8968 
## F-statistic: 6.041e+05 on 1 and 69540 DF,  p-value: < 2.2e-16
confint(mymale)
##                    2.5 %       97.5 %
## (Intercept) 4778.7216647 5210.7263025
## inctot         0.8353399    0.8395635

Breusch-Pagan Test:

bptest(mymale)
## 
##  studentized Breusch-Pagan test
## 
## data:  mymale
## BP = 13966, df = 1, p-value < 2.2e-16

Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:

ks.test(x=myworkmpums$incwage,y=myworkmpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkmpums$incwage, y = myworkmpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  myworkmpums$incwage and myworkmpums$inctot
## D = 0.037848, p-value < 2.2e-16
## alternative hypothesis: two-sided

Females

Creating the regression line as shown in class:

myfemale<-lm(incwage~inctot, data=myworkfpums)
coef(myfemale)
##  (Intercept)       inctot 
## 2268.4354830    0.8722747

The result is a regression line for Wages and Saleries = $2268.44 + .8722747 * Total Income.

Plotting the regression line and values:

ggplot(myworkfpums, aes(x=incwage, y=inctot))+
  geom_point()+geom_smooth(method = "lm", se = FALSE)+
  ggtitle(label = "Regression Line  Wages and Saleries = $2268.44 + .8722747 * Total Income")+
  xlab("Wages and Saleries")+
  ylab("Total Income")

Normality of residuals (which=2):

plot(myfemale,which=2)

Summary

summary(myfemale)
## 
## Call:
## lm(formula = incwage ~ inctot, data = myworkfpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -378935   -1310     925    3735   81775 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 2.268e+03  7.315e+01   31.01   <2e-16 ***
## inctot      8.723e-01  1.158e-03  753.38   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13840 on 64437 degrees of freedom
## Multiple R-squared:  0.898,  Adjusted R-squared:  0.898 
## F-statistic: 5.676e+05 on 1 and 64437 DF,  p-value: < 2.2e-16
confint(myfemale)
##                    2.5 %      97.5 %
## (Intercept) 2125.0683351 2411.802631
## inctot         0.8700053    0.874544

Breusch-Pagan Test:

bptest(myfemale)
## 
##  studentized Breusch-Pagan test
## 
## data:  myfemale
## BP = 9173, df = 1, p-value < 2.2e-16

Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:

ks.test(x=myworkfpums$incwage,y=myworkfpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkfpums$incwage, y = myworkfpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  myworkfpums$incwage and myworkfpums$inctot
## D = 0.041264, p-value < 2.2e-16
## alternative hypothesis: two-sided

Discussion

All three regressions showed significance in every test I threw at it. This comes to no surprise. Total income depends on earned income if we think of it intuitively. Sure, there are ouliers, for example, people that are so rich that they do not work, but have other means of income such as shares, dividents, or real estate. It also does not come to a surprise that females start at significantly lower incomes, shown by a lower Y-intersect, however, their coefficient is higher, meaning that their average difference is shrinking faster. This does not refelct the total income earned though.