knitr::opts_chunk$set(echo = TRUE)
library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(broom)

# Load the IPUMS USA extract (Stata format) straight from GitHub.
ipums <- read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")

# read_dta() returns the two model variables as haven "labelled" vectors; the
# label metadata leaks into lm() and ggplot2 (e.g. "<Labelled double>"
# residuals and the "type labelled" scale message), so convert them to plain
# numeric vectors before any modelling.
ipums$nchild <- as.numeric(ipums$nchild)
ipums$incwage <- as.numeric(ipums$incwage)

# NOTE(review): IPUMS documents INCWAGE 999999 as "N/A" and 999998 as
# "Missing". They are recoded to NA here so they are not treated as real
# incomes -- confirm against the codebook for this extract.
# Printed results further down were produced before this recoding; re-knit
# the document to refresh them.
ipums$incwage[ipums$incwage %in% c(999998, 999999)] <- NA

names(ipums) # print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "ownershp"  "ownershpd"
## [11] "bedrooms"  "pernum"    "perwt"     "famsize"   "nchild"   
## [16] "nchlt5"    "eldch"     "nsibs"     "relate"    "related"  
## [21] "sex"       "age"       "marst"     "birthyr"   "fertyr"   
## [26] "race"      "raced"     "hispan"    "hispand"   "bpl"      
## [31] "bpld"      "citizen"   "yrsusa1"   "language"  "languaged"
## [36] "speakeng"  "educ"      "educd"     "empstat"   "empstatd" 
## [41] "labforce"  "occ"       "ind"       "inctot"    "incwage"  
## [46] "poverty"   "hwsei"     "migrate1"  "migrate1d" "carpool"  
## [51] "trantime"

Construct a continuous outcome variable

# Simple linear regression of number of children on wage and salary income.
# na.exclude drops incomplete rows when fitting, but pads residuals and
# fitted values with NA so they stay aligned with the rows of `ipums`.
fit <- lm(
  formula = nchild ~ incwage,
  data = ipums,
  na.action = na.exclude
)
coef(fit) # intercept and slope
##   (Intercept)       incwage 
##  6.067085e-01 -5.645765e-07

Regression line is as follows:

nchild = 0.6067 - 0.0000005646 × (wage and salary income)

# Scatterplot of number of children against wage and salary income, with the
# fitted least-squares line overlaid (no confidence band).
ggplot(data = ipums, mapping = aes(x = incwage, y = nchild)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## Don't know how to automatically pick scale for object of type labelled. Defaulting to continuous.

Estimate the OLS regression model for my outcome (nchild)

# Print the residual summary, coefficient table and fit statistics for `fit`.
summary(fit)
## 
## Call:
## lm(formula = nchild ~ incwage, data = ipums, na.action = na.exclude)
## 
## Residuals:
## <Labelled double>
##     Min      1Q  Median      3Q     Max 
## -0.6067 -0.6067 -0.5666  0.3933  8.6282 
## 
## Labels:
##  value              label
##      0 0 children present
##      1    1 child present
##      2                  2
##      3                  3
##      4                  4
##      5                  5
##      6                  6
##      7                  7
##      8                  8
##      9                 9+
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.067e-01  1.911e-03   317.4   <2e-16 ***
## incwage     -5.646e-07  4.433e-09  -127.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.921 on 300550 degrees of freedom
## Multiple R-squared:  0.05121,    Adjusted R-squared:  0.05121 
## F-statistic: 1.622e+04 on 1 and 300550 DF,  p-value: < 2.2e-16

Determine confidence intervals for the model

# Confidence intervals for the coefficients of `fit` (default level: 95%).
confint(fit)
##                     2.5 %        97.5 %
## (Intercept)  6.029622e-01  6.104547e-01
## incwage     -5.732645e-07 -5.558885e-07

Evaluate model assumptions

# First lm diagnostic plot (which = 1): residuals against fitted values.
plot(fit, which=1)

Test for heteroscedasticity using the Breusch-Pagan test

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Studentized Breusch-Pagan test (from lmtest) applied to `fit`.
bptest(fit)
## 
##  studentized Breusch-Pagan test
## 
## data:  fit
## BP = 7352.8, df = 1, p-value < 2.2e-16

Determine normality of residuals

I attempted plot(fit, which=2) (the normal Q-Q plot), but it failed with an error message about atomic vectors.

I also attempted the Shapiro-Wilk test, but the sample is too large (shapiro.test() accepts at most 5,000 observations).

Determine normality of residuals using Kolmogorov-Smirnov test

# One-sample Kolmogorov-Smirnov test of the residuals against a normal
# distribution. With no further arguments pnorm means N(0, 1), so the scale
# has to be supplied explicitly: OLS residuals have mean 0 and are compared
# here with sd = sigma(fit), the residual standard error. Testing against
# N(0, 1) would mix up "wrong scale" with "not normal".
# The NA padding added by na.exclude is dropped by ks.test() itself.
# NOTE: because the sd is estimated from the same data, the p-value is only
# approximate, and ties in the residuals still trigger a warning.
# The D statistic printed below was computed against N(0, 1); re-knit the
# document to refresh it.
ks.test(resid(fit), "pnorm", mean = 0, sd = sigma(fit))
## Warning in ks.test(resid(fit), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  resid(fit)
## D = 0.27202, p-value < 2.2e-16
## alternative hypothesis: two-sided

Discuss the results of the model

Based on my model, each one-unit increase in wage and salary income is associated with a decrease of about 0.00000056 in the expected number of children (a negative association). The slope is statistically significant at the .05 level. However, the R-squared value of .05121 indicates that wage and salary income accounts for only about 5% of the variation in the number of children one has. In the residuals-versus-fitted plot, the spread of the residuals changes across the fitted values, indicating heteroscedasticity. The Breusch-Pagan test confirmed that the error variance is not constant, and the Kolmogorov-Smirnov test indicated that the residuals are not normally distributed. Because these model assumptions are violated, I might consider adding more predictor variables to my model, such as education or age.