library(readr)
library(dplyr) #to manipulate data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) #to visualize data
library(broom) #to make results printable
library(tidyr)
#Using IPUMS Data
library(haven)
# Download the IPUMS extract (Stata .dta format) directly from GitHub.
ipums <- read_dta(
  "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)

# Build the analysis sample:
#   * keep only records with relate == 3 (per the original author's note,
#     persons whose relationship to the household head is "child");
#   * n_child  - predictor: number of own children in the household;
#   * fam_size - outcome: number of own family members in the household.
# as.numeric() turns the imported columns into plain numeric vectors.
newpums <- ipums %>%
  filter(relate == 3) %>%
  mutate(
    n_child = as.numeric(nchild),
    fam_size = as.numeric(famsize)
  )
# Ordinary least squares: regress family size on the number of own children.
fit <- lm(formula = fam_size ~ n_child, data = newpums)

# Show the estimated intercept and slope.
coefficients(fit)
## (Intercept) n_child
## 4.239698 0.655249
# Scatterplot of family size against number of children, overlaid with the
# fitted least-squares line (no confidence band).
ggplot(data = newpums, mapping = aes(x = n_child, y = fam_size)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Family Size by the Number of Children",
    x = "Number of Children",
    y = "Family Size"
  )
- The model assumes that the number of children affects family size linearly: each additional child is associated with the same expected change in family size, no matter how many children are already in the household.
- In other words, the model predicts family size from the number of children plus a random error term.
# Full regression summary: residual quantiles, coefficient table with
# standard errors and t-tests, residual standard error, R-squared and F-test.
summary(fit)
##
## Call:
## lm(formula = fam_size ~ n_child, data = newpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2397 -1.2397 -0.2397 0.7603 15.7603
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.239698 0.005343 793.44 <2e-16 ***
## n_child 0.655249 0.014702 44.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.465 on 77332 degrees of freedom
## Multiple R-squared: 0.02504, Adjusted R-squared: 0.02503
## F-statistic: 1986 on 1 and 77332 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and the n_child slope.
confint(fit, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.229225 4.250171
## n_child 0.626434 0.684064
# Diagnostic plot 1 of plot.lm: residuals vs fitted values, used here to
# eyeball the constant-variance (homoskedasticity) assumption.
plot(fit, which=1)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Studentized Breusch-Pagan test (lmtest) for heteroskedasticity; the null
# hypothesis is that the error variance is constant.
bptest(fit)
##
## studentized Breusch-Pagan test
##
## data: fit
## BP = 1.8033, df = 1, p-value = 0.1793
# Diagnostic plot 2 of plot.lm: normal Q-Q plot of the standardized
# residuals, a visual check of the normality assumption.
plot(fit, which = 2)

# Kolmogorov-Smirnov check of residual normality.
# The reference distribution must match the scale of the residuals: comparing
# raw residuals with pnorm's default N(0, 1) rejects simply because the
# residual standard error is not 1, regardless of the distribution's shape.
# OLS residuals have mean 0 by construction, so compare against
# N(0, sigma(fit)^2) instead.
# Caveats: the p-value is only approximate because sd is estimated from the
# same data, and the residuals contain ties (the outcome is a count).
# NOTE: the recorded output below was produced by the earlier unscaled call;
# re-run to refresh it.
ks.test(resid(fit), "pnorm", mean = 0, sd = sigma(fit))
## Warning in ks.test(resid(fit), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: resid(fit)
## D = 0.23224, p-value < 2.2e-16
## alternative hypothesis: two-sided