library(haven)
# Load the Stata extract (usa_00045.dta) directly from its GitHub URL.
ipums <- read_dta(
  file = "https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true"
)
## `curl` package not installed, falling back to using `url()`
# Print the column names of the extract.
colnames(ipums)
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "pernum"    "perwt"    
## [11] "famsize"   "nchild"    "nchlt5"    "eldch"     "nsibs"    
## [16] "relate"    "related"   "sex"       "age"       "marst"    
## [21] "birthyr"   "fertyr"    "race"      "raced"     "hispan"   
## [26] "hispand"   "bpl"       "bpld"      "citizen"   "yrsusa1"  
## [31] "language"  "languaged" "speakeng"  "educ"      "educd"    
## [36] "empstat"   "empstatd"  "labforce"  "occ"       "ind"      
## [41] "inctot"    "incwage"   "poverty"   "hwsei"     "migrate1" 
## [46] "migrate1d" "carpool"   "trantime"
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Build the analysis sample:
#   * familysize: famsize with the codes 999998/999999 replaced by NA
#     (presumably missing-value codes -- confirm against the codebook);
#   * keep only rows with relate == 1
#     (presumably the household head -- confirm against the codebook);
#   * birthplace: "American" when bpl is in 1-120, otherwise "Foreign"
#     (presumably the US state/territory codes -- confirm against the codebook).
newpums2 <- ipums %>%
  mutate(familysize = ifelse(famsize %in% c(999998, 999999), NA, famsize)) %>%
  filter(relate == 1) %>%
  mutate(birthplace = ifelse(bpl %in% 1:120, "American", "Foreign"))

We compare mean family size for the American-born population vs. the foreign-born population. The results show that American families are, on average, about 0.64 persons smaller (2.29 vs. 2.93). Americans have slightly smaller families when compared to the average across all the foreign populations combined.

 # Mean family size within each birthplace group, ignoring missing values.
 # TRUE is spelled out because T is an ordinary variable that can be reassigned.
 newpums2 %>%
   group_by(birthplace) %>%
   summarise(means = mean(familysize, na.rm = TRUE))
## # A tibble: 2 x 2
##   birthplace    means
##        <chr>    <dbl>
## 1   American 2.290301
## 2    Foreign 2.934445

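In looking at some additional summary statistics, the box plot suggests that the foreign population tends to have larger families than the American population (note that the line inside each box marks the median, not the mean). The Welch Two Sample t-test shows that the difference between the foreign and the American mean family size is statistically significant: t = -46.03 on approximately 19,138 degrees of freedom, with a p-value smaller than 2.2e-16 (i.e., essentially zero, not 2.2). We therefore reject the null hypothesis of equal means; the 95% confidence interval for the difference (American minus foreign) runs from -0.67 to -0.62.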

# Side-by-side box plots of family size for the two birthplace groups.
ggplot(newpums2, aes(x = birthplace, y = familysize)) +
  geom_boxplot()

# Two-sample t-test of mean family size by birthplace; with the default
# arguments t.test() does not assume equal variances (Welch test).
t.test(familysize ~ birthplace, data = newpums2)
## 
##  Welch Two Sample t-test
## 
## data:  familysize by birthplace
## t = -46.032, df = 19138, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.6715719 -0.6167158
## sample estimates:
## mean in group American  mean in group Foreign 
##               2.290301               2.934445

The difference between the American mean of 2.29 and the foreign mean of 2.93 is 0.64, which is exactly the `birthplaceForeign` coefficient reported by the “tidy” function below.

# broom supplies tidy(), which prints the coefficient table as a data frame.
library(broom)

# Regress family size on birthplace. The intercept is the mean of the
# reference group and the birthplaceForeign coefficient is the difference
# in group means.
super1 <- lm(familysize ~ birthplace, data = newpums2)
tidy(super1)
##                term  estimate  std.error statistic p.value
## 1       (Intercept) 2.2903010 0.00434931 526.58946       0
## 2 birthplaceForeign 0.6441438 0.01181550  54.51685       0

After checking the normality of the model residuals with a Q-Q plot, we find some departures from normality.

# Normal Q-Q plot of the studentized residuals from super1, with a red
# reference line. The residuals are computed once and reused for both calls.
super1_rstudent <- rstudent(super1)
qqnorm(super1_rstudent, main = "Q-Q Plot for Model Residuals")
qqline(super1_rstudent, col = "red")

We apply three transformations of the outcome (log, square root, and reciprocal) to address the non-normality. We see that the second graph, “Q-Q Plot for Model Residuals Version 2” (the square-root transformation), does the best job; however, there is still some skewness, possibly from some extreme values in the data set. The rest of the transformations are not helpful.

# Refit the birthplace model with three transformations of the outcome.
super12 <- lm(log(familysize) ~ birthplace, data = newpums2)    # log
super13 <- lm(sqrt(familysize) ~ birthplace, data = newpums2)   # square root
super14 <- lm(I(1 / familysize) ~ birthplace, data = newpums2)  # reciprocal

# Studentized residuals of each refit, computed once per model.
super12_rstudent <- rstudent(super12)
super13_rstudent <- rstudent(super13)
super14_rstudent <- rstudent(super14)

# Version 1: log-transformed outcome.
qqnorm(super12_rstudent, main = "Q-Q Plot for Model Residuals Version 1")
qqline(super12_rstudent, col = "red")

# Version 2: square-root-transformed outcome.
qqnorm(super13_rstudent, main = "Q-Q Plot for Model Residuals Version 2")
qqline(super13_rstudent, col = "red")

# Version 3: reciprocal-transformed outcome.
qqnorm(super14_rstudent, main = "Q-Q Plot for Model Residuals Version 3")

# NOTE(review): the reference line for Version 3 was disabled in the original;
# the reason is not shown here -- confirm before re-enabling.
# qqline(super14_rstudent, col = "red")