This homework exercise sets out several data manipulation tasks and works through them. The tasks make use of some of the more common statistical functions.
# Load the wooldridge package and the bwght data set, then attach a
# descriptive label to every variable of the data frame.
library('wooldridge')
data(bwght)
data <- bwght %>%
  apply_labels(
    faminc   = "1988 family income, $1000s",
    cigtax   = "cig. tax in home state, 1988",
    cigprice = "cig. price in home state, 1988",
    bwght    = "birth weight, ounces",
    fatheduc = "father's yrs of educ",
    motheduc = "mother's yrs of educ",
    parity   = "birth order of child",
    male     = "=1 if male child",
    white    = "=1 if white",
    cigs     = "cigs smked per day while preg",
    lbwght   = "log of bwght",
    bwghtlbs = "birth weight, pounds",
    packs    = "packs smked per day while preg",
    lfaminc  = "log(faminc)"
  )
# Render the variable labels as a table with a single described column.
knitr::kable(
  label(data),
  caption = 'Labeling data variables with full names',
  col.names = 'Full Description'
)
| Full Description | |
|---|---|
| faminc | 1988 family income, $1000s |
| cigtax | cig. tax in home state, 1988 |
| cigprice | cig. price in home state, 1988 |
| bwght | birth weight, ounces |
| fatheduc | father’s yrs of educ |
| motheduc | mother’s yrs of educ |
| parity | birth order of child |
| male | =1 if male child |
| white | =1 if white |
| cigs | cigs smked per day while preg |
| lbwght | log of bwght |
| bwghtlbs | birth weight, pounds |
| packs | packs smked per day while preg |
| lfaminc | log(faminc) |
The variable packs takes only 18 distinct values. Therefore, a summary table of lbwght grouped by packs is given.
# Drop every row that has a missing value in any column.
data <- na.omit(data)

# Count, mean, median and variance of lbwght for each value of packs.
# The summary columns get short internal names; kable() replaces them with
# the display headers below.
data %>%
  group_by(packs) %>%
  summarise(
    n_obs = n(),
    avg = mean(lbwght),
    med = median(lbwght),
    variance = var(lbwght)
  ) %>%
  knitr::kable(
    col.names = c('Packs', '$n_{lbwght}$', '$\\mu_{lbwght}$',
                  '$me_{lbwght}$', '$\\sigma^2_{lbwght}$'),
    caption = "Tabulation of lbwght against packs"
  )
| Packs | \(n_{lbwght}\) | \(\mu_{lbwght}\) | \(me_{lbwght}\) | \(\sigma^2_{lbwght}\) |
|---|---|---|---|---|
| 0.00 | 1030 | 4.778286 | 4.804021 | 0.0350396 |
| 0.05 | 3 | 4.804115 | 4.736198 | 0.0157109 |
| 0.10 | 2 | 4.754592 | 4.754592 | 0.0647148 |
| 0.15 | 5 | 4.747467 | 4.779123 | 0.0375609 |
| 0.20 | 4 | 4.804990 | 4.787492 | 0.0135416 |
| 0.25 | 17 | 4.718279 | 4.779123 | 0.0197346 |
| 0.30 | 6 | 4.799497 | 4.827801 | 0.0345327 |
| 0.35 | 3 | 4.741390 | 4.736198 | 0.0192901 |
| 0.40 | 4 | 4.451185 | 4.578259 | 0.1368672 |
| 0.50 | 45 | 4.717769 | 4.727388 | 0.0292399 |
| 0.60 | 5 | 4.662641 | 4.634729 | 0.0204066 |
| 0.75 | 17 | 4.675230 | 4.709530 | 0.0228788 |
| 1.00 | 42 | 4.680499 | 4.731715 | 0.0352207 |
| 1.50 | 3 | 4.602582 | 4.543295 | 0.0173657 |
| 2.00 | 5 | 4.621004 | 4.564348 | 0.0431191 |
# First quartile, mean, median, third quartile and standard deviation of
# selected variables. gather() stacks the chosen columns into key/value
# pairs so that each variable becomes one row of the summary.
# A named list of functions/lambdas is passed to summarise_all() in place of
# the deprecated funs() helper; the resulting column names (q1, mean, median,
# q3, sd) are unchanged.
data %>%
  select(cigs, faminc, bwght, motheduc, fatheduc) %>%
  gather() %>%
  group_by(key) %>%
  summarise_all(list(
    q1 = ~ quantile(., .25),
    mean = mean,
    median = median,
    q3 = ~ quantile(., .75),
    sd = sd
  )) %>%
  knitr::kable(caption = "Summary of statistical values for variables")
| key | q1 | mean | median | q3 | sd |
|---|---|---|---|---|---|
| bwght | 108.0 | 119.529807 | 120.0 | 132.0 | 20.141239 |
| cigs | 0.0 | 1.769102 | 0.0 | 0.0 | 5.343771 |
| faminc | 18.5 | 32.219144 | 27.5 | 42.5 | 17.956198 |
| fatheduc | 12.0 | 13.191436 | 12.0 | 16.0 | 2.741274 |
| motheduc | 12.0 | 13.125105 | 12.0 | 15.0 | 2.417436 |
# Mean mother-to-father education ratio and mean family income within each
# family-income decile (ntile(faminc, 10)).
# The display headers are listed in the same order as the summarise()
# expressions: ratio first, then income. They were previously swapped, so
# each column was printed under the other column's header.
data %>%
  mutate(quantile = ntile(faminc, 10)) %>%
  group_by(quantile) %>%
  summarise(mean(motheduc/fatheduc), mean(faminc)) %>%
  knitr::kable(
    caption = "Ratio of Mother's years of
education to father's years of education in different family income deciles",
    col.names = c('Income quantile', 'Mean of education years ratio',
                  'Mean of family income')
  )
| Income quantile | Mean of education years ratio | Mean of family income |
|---|---|---|
| 1 | 1.1173627 | 7.333333 |
| 2 | 1.0374821 | 14.348740 |
| 3 | 1.0334289 | 18.743697 |
| 4 | 1.0260207 | 22.500000 |
| 5 | 1.0444308 | 26.701681 |
| 6 | 1.0000899 | 31.281513 |
| 7 | 1.0681879 | 35.693277 |
| 8 | 1.0177144 | 42.710084 |
| 9 | 0.9747711 | 58.088235 |
| 10 | 0.9950973 | 65.000000 |
# Mean and median birth weight in kilograms for each birth order (parity).
# bwght is labelled as ounces; 0.0283495 converts ounces to kilograms.
# The summarise() expressions are left unnamed on purpose: kable() prints
# them verbatim as the column headers.
mutate(data, bwght_kg = bwght * 0.0283495) %>%
  group_by(parity) %>%
  summarise(mean(bwght_kg), median(bwght_kg)) %>%
  knitr::kable()
| parity | mean(bwght_kg) | median(bwght_kg) |
|---|---|---|
| 1 | 3.355020 | 3.373591 |
| 2 | 3.418711 | 3.458639 |
| 3 | 3.467555 | 3.486988 |
| 4 | 3.440684 | 3.501163 |
| 5 | 3.586212 | 3.642911 |
| 6 | 3.345241 | 3.359416 |
# Scatter plot of family income against the father's years of education,
# overlaid with a fitted linear-model line. This plot reads the original
# bwght data frame rather than the NA-filtered `data` copy.
# NOTE(review): span is documented for the loess smoother; with method = lm
# it presumably has no effect -- confirm before removing it.
ggplot(bwght, aes(x = fatheduc, y = faminc)) +
  geom_point(color = '#0c4c8a') +
  geom_smooth(method = lm, formula = 'y ~ x', span = 0.1) +
  labs(
    x = "Father's years of education",
    y = 'Family income',
    title = 'Plot of family income against years of education of father'
  ) +
  theme_minimal()
Plot of family income against years of education of father
From the above plot, we can deduce that, on average, family income tends to increase with the years of the father’s education. This could serve as a building block for formulating a hypothesis regarding the causality between years of education and income.
# Birth weight against family income, one panel per value of `white`,
# drawn with a customised theme (dashed major grid lines, tinted panel
# background, bordered plot background).
# The title deliberately contains a line break inside the string literal.
qplot(faminc, bwght, data = data, facets = . ~ white) +
  theme(
    plot.subtitle = element_text(vjust = 1),
    plot.caption = element_text(vjust = 1),
    panel.grid.major = element_line(
      colour = "olivedrab", linetype = "dashed"
    ),
    panel.background = element_rect(
      fill = "aliceblue", colour = "dodgerblue3", size = 1, linetype = "solid"
    ),
    plot.background = element_rect(
      fill = "white", colour = "aquamarine4", size = 0.9
    )
  ) +
  labs(
    title = "Birth weight against family income
based on race",
    x = "Family income",
    y = "Birth weight"
  )
Birth weight against family income based on race
# Birth weight against family income, one panel per value of `male`.
qplot(x = faminc, y = bwght, data = data, facets = . ~ male)
Birth weight against family income based on gender
Again, family income is divided into quantiles, this time into four groups (quartiles).
# Cigarettes smoked per day against the mother's years of education,
# one panel per family-income quartile, laid out in two rows.
data %>%
  mutate(quantile4 = ntile(faminc, 4)) %>%
  ggplot(aes(x = motheduc, y = cigs)) +
  geom_point() +
  facet_wrap(~quantile4, nrow = 2) +
  theme_light()
Number of cigarettes by years of mother's education