library(haven)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.2
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(lmtest)
## Warning: package 'lmtest' was built under R version 3.4.2
#importing IPUMS data
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
myworkpums<-ipums%>%
filter(incwage>0)%>%
filter(incwage<9999998)%>%
filter(labforce==2)
myworkfpums<-ipums%>% #females
filter(incwage>0)%>%
filter(incwage<9999998)%>%
filter(labforce==2)%>%
filter(sex==2)
myworkmpums<-ipums%>% #males
filter(incwage>0)%>%
filter(incwage<9999998)%>%
filter(labforce==2)%>%
filter(sex==1)
myall<-lm(incwage~inctot, data=myworkpums)
coef(myall)
## (Intercept) inctot
## 3858.021061 0.847338
The result is a regression line for Wages and Saleries = $3858.02 + .847338 * Total Income.
ggplot(myworkpums, aes(x=incwage, y=inctot))+
geom_point()+geom_smooth(method = "lm", se = FALSE)+
ggtitle(label = "Regression Line Wages and Saleries = $3858.02 + .847338 * Total Income")+
xlab("Wages and Saleries")+
ylab("Total Income")
Normality of residuals (which=2):
plot(myall,which=2)
summary(myall)
##
## Call:
## lm(formula = incwage ~ inctot, data = myworkpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -569018 -2331 722 4538 96594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.858e+03 6.629e+01 58.2 <2e-16 ***
## inctot 8.473e-01 7.733e-04 1095.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18920 on 133979 degrees of freedom
## Multiple R-squared: 0.8996, Adjusted R-squared: 0.8996
## F-statistic: 1.201e+06 on 1 and 133979 DF, p-value: < 2.2e-16
confint(myall)
## 2.5 % 97.5 %
## (Intercept) 3728.0876550 3987.9544670
## inctot 0.8458223 0.8488536
Breusch-Pagan Test:
bptest(myall)
##
## studentized Breusch-Pagan test
##
## data: myall
## BP = 24829, df = 1, p-value < 2.2e-16
Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:
ks.test(x=myworkpums$incwage,y=myworkpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkpums$incwage, y = myworkpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: myworkpums$incwage and myworkpums$inctot
## D = 0.038334, p-value < 2.2e-16
## alternative hypothesis: two-sided
mymale<-lm(incwage~inctot, data=myworkmpums)
coef(mymale)
## (Intercept) inctot
## 4994.7239836 0.8374517
The result is a regression line for Wages and Saleries = $4994.72 + .8374517 * Total Income.
ggplot(myworkmpums, aes(x=incwage, y=inctot))+
geom_point()+geom_smooth(method = "lm", se = FALSE)+
ggtitle(label = "Regression Line Wages and Saleries = $4994.72 + .8374517 * Total Income")+
xlab("Wages and Saleries")+
ylab("Total Income")
Normality of residuals (which=2):
plot(mymale,which=2)
summary(mymale)
##
## Call:
## lm(formula = incwage ~ inctot, data = myworkmpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -562394 -3044 694 5571 101962
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.995e+03 1.102e+02 45.32 <2e-16 ***
## inctot 8.375e-01 1.077e-03 777.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22570 on 69540 degrees of freedom
## Multiple R-squared: 0.8968, Adjusted R-squared: 0.8968
## F-statistic: 6.041e+05 on 1 and 69540 DF, p-value: < 2.2e-16
confint(mymale)
## 2.5 % 97.5 %
## (Intercept) 4778.7216647 5210.7263025
## inctot 0.8353399 0.8395635
Breusch-Pagan Test:
bptest(mymale)
##
## studentized Breusch-Pagan test
##
## data: mymale
## BP = 13966, df = 1, p-value < 2.2e-16
Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:
ks.test(x=myworkmpums$incwage,y=myworkmpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkmpums$incwage, y = myworkmpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: myworkmpums$incwage and myworkmpums$inctot
## D = 0.037848, p-value < 2.2e-16
## alternative hypothesis: two-sided
myfemale<-lm(incwage~inctot, data=myworkfpums)
coef(myfemale)
## (Intercept) inctot
## 2268.4354830 0.8722747
The result is a regression line for Wages and Saleries = $2268.44 + .8722747 * Total Income.
ggplot(myworkfpums, aes(x=incwage, y=inctot))+
geom_point()+geom_smooth(method = "lm", se = FALSE)+
ggtitle(label = "Regression Line Wages and Saleries = $2268.44 + .8722747 * Total Income")+
xlab("Wages and Saleries")+
ylab("Total Income")
Normality of residuals (which=2):
plot(myfemale,which=2)
summary(myfemale)
##
## Call:
## lm(formula = incwage ~ inctot, data = myworkfpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -378935 -1310 925 3735 81775
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.268e+03 7.315e+01 31.01 <2e-16 ***
## inctot 8.723e-01 1.158e-03 753.38 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13840 on 64437 degrees of freedom
## Multiple R-squared: 0.898, Adjusted R-squared: 0.898
## F-statistic: 5.676e+05 on 1 and 64437 DF, p-value: < 2.2e-16
confint(myfemale)
## 2.5 % 97.5 %
## (Intercept) 2125.0683351 2411.802631
## inctot 0.8700053 0.874544
Breusch-Pagan Test:
bptest(myfemale)
##
## studentized Breusch-Pagan test
##
## data: myfemale
## BP = 9173, df = 1, p-value < 2.2e-16
Shapiro Wilk test does not work for more than 5000 samples. I use the Kolmogorov-Smirnov test only:
ks.test(x=myworkfpums$incwage,y=myworkfpums$inctot,alternative='two.sided')
## Warning in ks.test(x = myworkfpums$incwage, y = myworkfpums$inctot,
## alternative = "two.sided"): p-value will be approximate in the presence of
## ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: myworkfpums$incwage and myworkfpums$inctot
## D = 0.041264, p-value < 2.2e-16
## alternative hypothesis: two-sided
All three regressions showed significance in every test I threw at it. This comes to no surprise. Total income depends on earned income if we think of it intuitively. Sure, there are ouliers, for example, people that are so rich that they do not work, but have other means of income such as shares, dividents, or real estate. It also does not come to a surprise that females start at significantly lower incomes, shown by a lower Y-intersect, however, their coefficient is higher, meaning that their average difference is shrinking faster. This does not refelct the total income earned though.