knitr::opts_chunk$set(echo = TRUE)
library(haven)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(broom)
# Load the IPUMS extract (Stata .dta) directly from GitHub.
ipums <- read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
# read_dta() keeps Stata value labels, so the model variables arrive as
# haven-labelled vectors. That class is what makes ggplot2 warn that it cannot
# pick a scale for "labelled" and makes lm() report "<Labelled double>"
# residuals. Strip the labels from the two variables used in the model so they
# are plain numeric vectors; the underlying values are unchanged.
# NOTE: printed output further down that mentions "labelled" predates this
# change and will disappear on the next knit.
ipums$nchild <- zap_labels(ipums$nchild)
ipums$incwage <- zap_labels(ipums$incwage)
names(ipums) # print the column names
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "ownershp" "ownershpd"
## [11] "bedrooms" "pernum" "perwt" "famsize" "nchild"
## [16] "nchlt5" "eldch" "nsibs" "relate" "related"
## [21] "sex" "age" "marst" "birthyr" "fertyr"
## [26] "race" "raced" "hispan" "hispand" "bpl"
## [31] "bpld" "citizen" "yrsusa1" "language" "languaged"
## [36] "speakeng" "educ" "educd" "empstat" "empstatd"
## [41] "labforce" "occ" "ind" "inctot" "incwage"
## [46] "poverty" "hwsei" "migrate1" "migrate1d" "carpool"
## [51] "trantime"
Construct a continuous outcome variable
# Simple OLS regression of number of children (nchild) on wage and salary
# income (incwage). na.exclude drops incomplete rows when fitting but keeps
# residuals and fitted values padded with NA so they line up with `ipums`.
# NOTE(review): IPUMS extracts commonly store "N/A" income as a large sentinel
# code (e.g. 999999) rather than NA -- confirm against the codebook and recode
# before trusting the slope.
fit <- lm(
  nchild ~ incwage,
  data = ipums,
  na.action = na.exclude
)
# Show the estimated intercept and slope.
coef(fit)
## (Intercept) incwage
## 6.067085e-01 -5.645765e-07
Regression line is as follows:
nchild = 0.6067 - 0.0000005646 * incwage (wage and salary income)
# Scatter plot of children against wage and salary income, with the fitted
# linear-regression line overlaid (se = FALSE hides the confidence band).
ggplot(data = ipums, mapping = aes(x = incwage, y = nchild)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

Estimate the OLS regression model for my outcome (nchild)
# Print the residual quantiles, coefficient table, R-squared, and overall
# F-test for the fitted model.
summary(fit)
##
## Call:
## lm(formula = nchild ~ incwage, data = ipums, na.action = na.exclude)
##
## Residuals:
## <Labelled double>
## Min 1Q Median 3Q Max
## -0.6067 -0.6067 -0.5666 0.3933 8.6282
##
## Labels:
## value label
## 0 0 children present
## 1 1 child present
## 2 2
## 3 3
## 4 4
## 5 5
## 6 6
## 7 7
## 8 8
## 9 9+
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.067e-01 1.911e-03 317.4 <2e-16 ***
## incwage -5.646e-07 4.433e-09 -127.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.921 on 300550 degrees of freedom
## Multiple R-squared: 0.05121, Adjusted R-squared: 0.05121
## F-statistic: 1.622e+04 on 1 and 300550 DF, p-value: < 2.2e-16
Determine confidence intervals for the model
# Confidence intervals for the intercept and slope at confint()'s default
# 95% level (the 2.5 % and 97.5 % bounds).
confint(fit)
## 2.5 % 97.5 %
## (Intercept) 6.029622e-01 6.104547e-01
## incwage -5.732645e-07 -5.558885e-07
Evaluate model assumptions
# First lm diagnostic plot: residuals against fitted values, used to eyeball
# whether the residual spread is constant.
plot(fit, which = 1)

Test for heteroscedasticity (non-constant residual variance) using the Breusch-Pagan test
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Studentized Breusch-Pagan test of the constant-variance assumption;
# a small p-value indicates heteroscedasticity.
bptest(fit)
##
## studentized Breusch-Pagan test
##
## data: fit
## BP = 7352.8, df = 1, p-value < 2.2e-16
Determine normality of residuals
I attempted plot(fit, which=2) (the normal Q-Q plot) but got an error message about atomic vectors.
I attempted the Shapiro-Wilk test, but the sample size is too large for it.
Determine normality of residuals using Kolmogorov-Smirnov test
# One-sample Kolmogorov-Smirnov test of the model residuals against a normal
# distribution.
# Passing pnorm with no parameters compares against the *standard* normal
# N(0, 1), so a residual SD different from 1 is enough to reject even when the
# shape is normal. Supply the residuals' own mean and SD so the test targets
# the shape of the distribution.
# as.numeric() drops any haven label class carried over from the data, and the
# NA padding introduced by na.exclude is removed before computing the moments.
# NOTE: because the mean and SD are estimated from the same sample, the
# reported p-value is approximate (conservative).
# NOTE: the printed output below came from the N(0, 1) comparison; re-knit to
# refresh it.
fit_resid <- as.numeric(resid(fit))
fit_resid <- fit_resid[!is.na(fit_resid)]
ks.test(
  fit_resid,
  "pnorm",
  mean = mean(fit_resid),
  sd = sd(fit_resid)
)
## Warning in ks.test(resid(fit), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: resid(fit)
## D = 0.27202, p-value < 2.2e-16
## alternative hypothesis: two-sided
Discuss the results of the model
Based on my model, wage and salary income is negatively associated with the number of children: each one-unit increase in wage and salary income is associated with about 0.00000056 fewer children. The coefficient is statistically significant at the .05 level. However, the R-squared value of .05121 indicates that wage and salary income accounts for only about 5% of the variation in the number of children one has. The residuals-versus-fitted plot shows that the spread of the residuals changes across the fitted values, indicating heteroscedasticity. The Breusch-Pagan test confirmed that the variance is not constant, and the Kolmogorov-Smirnov test indicated that the residuals are not normally distributed. Because these model assumptions are violated, I might consider adding more predictor variables to my model, such as education or age.