Use the IPUMS data from GitHub

Filter the data to keep only household heads, and convert the number of family members to a numeric variable, famsizenew

# Keep only the rows whose `relate` code equals 1 (the household-head records
# per the surrounding notes). which() discards rows where the comparison is
# NA, matching what subset() does.
HHO <- ipums[which(ipums[["relate"]] == 1), ]
# Numeric copy of the family-size column for use in summaries and models.
# NOTE(review): if `famsize` is a factor, as.numeric() returns level codes
# rather than the labelled values -- confirm the column type.
HHO[["famsizenew"]] <- as.numeric(HHO[["famsize"]])

Summarize the number of family members for household heads (minimum, quartiles, mean, and maximum)

# Five-number summary plus mean of family size across all household heads.
summary(HHO[["famsizenew"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.378   3.000  20.000

Create a variable that distinguishes families with foreign-born household heads from families with U.S.-born household heads

# Label each household head by birthplace code: codes above 120 are tagged
# "ForeignHead", everything else "USHead" (NA codes stay NA).
# NOTE(review): presumably bpl codes up to 120 cover U.S. states and
# territories -- verify the cutoff against the IPUMS codebook.
HHO[["USborn"]] <- ifelse(HHO[["bpl"]] > 120, "ForeignHead", "USHead")

Descriptive statistics for USHead and ForeignHead

# Family-size summary for households whose head is labelled "USHead".
summary(with(HHO, famsizenew[USborn == "USHead"]))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    1.00    2.00    2.29    3.00   20.00
# Family-size summary for households whose head is labelled "ForeignHead".
summary(with(HHO, famsizenew[USborn == "ForeignHead"]))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   2.934   4.000  15.000
# Side-by-side boxplots of family size by birthplace group of the household
# head. Columns are referenced by bare name inside aes() so they are resolved
# against the plot's data (HHO); writing HHO$... inside aes() bypasses that
# lookup and is discouraged by ggplot2 (it breaks with facets or when the
# layer data is changed).
ggplot(HHO) +
  geom_boxplot(aes(x = as.factor(USborn), y = famsizenew)) +
  ggtitle(label = "Family Size Boxplot for Foreign HH and U.S. HH") +
  ylab(label = "Family Size") +
  xlab(label = "Foreign Head      vs.      U.S. Head")

Use a linear model to test whether mean family size is equal for households with foreign-born heads and households with U.S.-born heads

library(broom)
# Regress family size on the birthplace group of the household head. USborn
# is a two-level character variable, so the slope is the difference in mean
# family size between the two groups.
HHO_fit <- lm(famsizenew ~ USborn, data = HHO)
# Coefficient table (estimate, std.error, statistic, p.value) as a data frame.
tidy(x = HHO_fit)
##           term   estimate  std.error statistic p.value
## 1  (Intercept)  2.9344448 0.01098588 267.11061       0
## 2 USbornUSHead -0.6441438 0.01181550 -54.51685       0

Analyzing errors in the model

# Normal Q-Q plot of the studentized residuals of HHO_fit with a red
# reference line. The residuals are computed once and kept local so no extra
# object is left in the workspace.
local({
  studentized <- rstudent(HHO_fit)
  qqnorm(studentized, main = "Q-Q Plot for Model Residuals")
  qqline(studentized, col = "red")
})

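The Shapiro-Wilk test is not suitable for a sample this large (R's shapiro.test accepts at most 5,000 observations), so use the Anderson-Darling normality test instead to assess whether the residuals depart from a normal distribution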

# nortest provides ad.test(), the Anderson-Darling test for normality.
library(nortest)
# Test the raw residuals of the untransformed model for normality. The call
# is kept verbatim because its expression is echoed in the printed "data:"
# line of the test output.
ad.test(resid(HHO_fit))
## 
##  Anderson-Darling normality test
## 
## data:  resid(HHO_fit)
## A = 5032.7, p-value < 2.2e-16

Attempt transformations to address non-normality

First transformation is natural log transform

# Refit the group-difference model with the natural log of family size as the
# response, then show its coefficient table.
HHO_log <- lm(log(famsizenew) ~ USborn, data = HHO)
tidy(x = HHO_log)
##           term   estimate   std.error statistic p.value
## 1  (Intercept)  0.9053601 0.004402847 205.63063       0
## 2 USbornUSHead -0.2286906 0.004735337 -48.29448       0

Q-Q plot for HHO_log

# Normal Q-Q plot of the studentized residuals of the log-response model,
# with a red reference line; residuals are computed once inside local().
local({
  studentized <- rstudent(HHO_log)
  qqnorm(studentized, main = "Q-Q Plot for Model Residuals")
  qqline(studentized, col = "red")
})

Using Anderson Darling on HHO_log

# Anderson-Darling normality test on the residuals of the log-response model.
# The call is kept verbatim because its expression is echoed in the printed
# "data:" line of the test output.
ad.test(resid(HHO_log))
## 
##  Anderson-Darling normality test
## 
## data:  resid(HHO_log)
## A = 4283.5, p-value < 2.2e-16

Second transformation using sqrt

# Refit the group-difference model with the square root of family size as the
# response, then show its coefficient table.
HHO_sqrt <- lm(sqrt(famsizenew) ~ USborn, data = HHO)
tidy(x = HHO_sqrt)
##           term   estimate   std.error statistic p.value
## 1  (Intercept)  1.6438702 0.003334192 493.03400       0
## 2 USbornUSHead -0.1870618 0.003585981 -52.16474       0

Q-Q plot for HHO_sqrt

# Normal Q-Q plot of the studentized residuals of the square-root-response
# model, with a red reference line; residuals are computed once inside local().
local({
  studentized <- rstudent(HHO_sqrt)
  qqnorm(studentized, main = "Q-Q Plot for Model Residuals")
  qqline(studentized, col = "red")
})

Using Anderson-Darling on HHO_sqrt

# Anderson-Darling normality test on the residuals of the square-root-response
# model. The call is kept verbatim because its expression is echoed in the
# printed "data:" line of the test output.
ad.test(resid(HHO_sqrt))
## 
##  Anderson-Darling normality test
## 
## data:  resid(HHO_sqrt)
## A = 3880.7, p-value < 2.2e-16

Third transformation using inverse

# Refit the group-difference model with the reciprocal of family size as the
# response. I() makes the formula treat 1 / famsizenew as arithmetic rather
# than as formula operators. Then show the coefficient table.
HHO_inverse <- lm(I(1 / famsizenew) ~ USborn, data = HHO)
tidy(x = HHO_inverse)
##           term   estimate   std.error statistic p.value
## 1  (Intercept) 0.48544443 0.002368842 204.92901       0
## 2 USbornUSHead 0.09986373 0.002547730  39.19714       0

The Q-Q plot for HHO_inverse could not be produced because the call failed with an "atomic vector" error

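Do the transformations help my case? Not really. HHO_sqrt was a little better (its Anderson-Darling statistic, A = 3880.7, is lower than 5032.7 for the untransformed model and 4283.5 for the log model), but not good enough: normality is still strongly rejected (p-value < 2.2e-16).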