This homework exercise works through a set of data manipulation tasks. The tasks make use of some of the more common functions in statistics.

Labeling Data

Labeling the imported data

# Load the birth-weight data set shipped with the wooldridge package.
library('wooldridge')
data(bwght)

# Attach a human-readable description to every variable; the labelled copy
# is stored in `data`.
data <- apply_labels(
    bwght,
    faminc   = "1988 family income, $1000s",
    cigtax   = "cig. tax in home state, 1988",
    cigprice = "cig. price in home state, 1988",
    bwght    = "birth weight, ounces",
    fatheduc = "father's yrs of educ",
    motheduc = "mother's yrs of educ",
    parity   = "birth order of child",
    male     = "=1 if male child",
    white    = "=1 if white",
    cigs     = "cigs smked per day while preg",
    lbwght   = "log of bwght",
    bwghtlbs = "birth weight, pounds",
    packs    = "packs smked per day while preg",
    lfaminc  = "log(faminc)"
)

# Show the variable names next to their labels.
knitr::kable(
    data %>% label(),
    col.names = 'Full Description',
    caption = 'Labeling data variables with full names'
)
Labeling data variables with full names
Full Description
faminc 1988 family income, $1000s
cigtax cig. tax in home state, 1988
cigprice cig. price in home state, 1988
bwght birth weight, ounces
fatheduc father’s yrs of educ
motheduc mother’s yrs of educ
parity birth order of child
male =1 if male child
white =1 if white
cigs cigs smked per day while preg
lbwght log of bwght
bwghtlbs birth weight, pounds
packs packs smked per day while preg
lfaminc log(faminc)

Tabulate variables

The variable packs takes only 18 distinct values (15 of which appear in the table below, once rows with missing values have been removed). Therefore, a summarized tabulation of lbwght is given against packs.

# Keep complete observations only: na.omit() drops every row that has a
# missing value in any column, and `data` is overwritten with the result,
# so all later steps that use `data` work on this reduced set.
data <- na.omit(data)

# Count, mean, median and variance of lbwght for each value of packs.
data %>%
    group_by(packs) %>%
    summarise(n(), mean(lbwght), median(lbwght), var(lbwght)) %>%
    knitr::kable(
        col.names = c('Packs', '$n_{lbwght}$', '$\\mu_{lbwght}$',
                      '$me_{lbwght}$', '$\\sigma^2_{lbwght}$'),
        caption = "Tabulation of lbwght against packs"
    )
Tabulation of lbwght against packs
Packs \(n_{lbwght}\) \(\mu_{lbwght}\) \(me_{lbwght}\) \(\sigma^2_{lbwght}\)
0.00 1030 4.778286 4.804021 0.0350396
0.05 3 4.804115 4.736198 0.0157109
0.10 2 4.754592 4.754592 0.0647148
0.15 5 4.747467 4.779123 0.0375609
0.20 4 4.804990 4.787492 0.0135416
0.25 17 4.718279 4.779123 0.0197346
0.30 6 4.799497 4.827801 0.0345327
0.35 3 4.741390 4.736198 0.0192901
0.40 4 4.451185 4.578259 0.1368672
0.50 45 4.717769 4.727388 0.0292399
0.60 5 4.662641 4.634729 0.0204066
0.75 17 4.675230 4.709530 0.0228788
1.00 42 4.680499 4.731715 0.0352207
1.50 3 4.602582 4.543295 0.0173657
2.00 5 4.621004 4.564348 0.0431191

Summary of Variables

# Lower quartile, mean, median, upper quartile and standard deviation for a
# handful of variables. The selected columns are stacked into key/value
# pairs so that a single grouped summarise() covers all of them.
select(data, cigs, faminc, bwght, motheduc, fatheduc) %>%
    gather() %>%
    group_by(key) %>%
    # Explicit named summaries replace summarise_all(funs(...)), because
    # funs() is deprecated in dplyr. Output column names and their order
    # (q1, mean, median, q3, sd) are unchanged.
    summarise(q1 = quantile(value, .25),
              mean = mean(value),
              median = median(value),
              q3 = quantile(value, .75),
              sd = sd(value)) %>%
    knitr::kable(caption = "Summary of statistical values for variables")
Summary of statistical values for variables
key q1 mean median q3 sd
bwght 108.0 119.529807 120.0 132.0 20.141239
cigs 0.0 1.769102 0.0 0.0 5.343771
faminc 18.5 32.219144 27.5 42.5 17.956198
fatheduc 12.0 13.191436 12.0 16.0 2.741274
motheduc 12.0 13.125105 12.0 15.0 2.417436

Ratio by Deciles

# Split the observations into ten equally sized family-income groups
# (deciles) and, for each group, report the mean mother/father education
# ratio followed by the mean family income.
data %>% mutate(quantile = ntile(faminc, 10)) %>%
    group_by(quantile) %>% 
    summarise(mean(motheduc/fatheduc), mean(faminc)) %>%
    # col.names must follow the order of the summarise() columns above:
    # the education ratio comes first, family income second. The labels were
    # previously swapped, so each column was printed under the other's name.
    knitr::kable(caption = "Ratio of Mother's years of 
                 education to father's years of education in different family income deciles",
                 col.names = c('Income quantile',
                               'Mean of education years ratio',
                               'Mean of family income'))
Ratio of Mother’s years of education to father’s years of education in different family income deciles
Income quantile Mean of education years ratio Mean of family income
1 1.1173627 7.333333
2 1.0374821 14.348740
3 1.0334289 18.743697
4 1.0260207 22.500000
5 1.0444308 26.701681
6 1.0000899 31.281513
7 1.0681879 35.693277
8 1.0177144 42.710084
9 0.9747711 58.088235
10 0.9950973 65.000000

Birth Weight mean and median

# Birth weight is recorded in ounces; convert it to kilograms
# (1 oz = 0.0283495 kg), then report the mean and median weight for each
# birth order (parity).
data %>%
    mutate(bwght_kg = bwght * 0.0283495) %>%
    group_by(parity) %>%
    summarise(mean(bwght_kg), median(bwght_kg)) %>%
    knitr::kable()
parity mean(bwght_kg) median(bwght_kg)
1 3.355020 3.373591
2 3.418711 3.458639
3 3.467555 3.486988
4 3.440684 3.501163
5 3.586212 3.642911
6 3.345241 3.359416

Scatterplot for income

# Scatterplot of family income against the father's years of education,
# overlaid with a fitted straight line (linear model of y on x).
# NOTE(review): this plots the raw `bwght` data set rather than the labelled,
# NA-filtered `data` used elsewhere - confirm this is intended.
ggplot(data = bwght) +
  aes(x = fatheduc, y = faminc) +
  geom_point(color = '#0c4c8a') +
  # `span` only affects the loess smoother and is ignored for method = lm,
  # so it has been dropped; the formula is passed as a formula object.
  geom_smooth(method = lm, formula = y ~ x) +
  labs(title = 'Plot of family income against years of education of father',
    x = "Father's years of education",
    y = 'Family income') +
  theme_minimal()
Plot of family income against years of education of father

Plot of family income against years of education of father

From the above plot, we can deduce that, on average, family income tends to increase with the years of the father’s education. This could serve as a building block for formulating a hypothesis regarding the causality between years of education and income.

Relation between birth weight and income

Based on race

# Birth weight against family income, one panel per value of `white`
# (labelled "=1 if white"), with a customised theme.
qplot(faminc, bwght, data = data, facets = . ~ white) +
    theme(plot.subtitle = element_text(vjust = 1),
          plot.caption = element_text(vjust = 1),
          panel.grid.major = element_line(colour = "olivedrab",
                                          linetype = "dashed"),
          panel.background = element_rect(fill = "aliceblue",
                                          colour = "dodgerblue3", size = 1,
                                          linetype = "solid"),
          plot.background = element_rect(fill = "white",
                                         colour = "aquamarine4", size = 0.9)) +
    # The title previously ran over a source line break, which put the
    # indentation spaces into the rendered title; use an explicit "\n".
    labs(title = "Birth weight against family income\nbased on race",
         x = "Family income", y = "Birth weight")
Birth weight against family income based on race

Birth weight against family income based on race

Based on gender

# Birth weight against family income, one panel per value of `male`
# (labelled "=1 if male child"). Title and axis labels are set explicitly
# so the figure reads the same way as the plot split by race.
qplot(faminc, bwght, data = data, facets = . ~ male) +
    labs(title = "Birth weight against family income\nbased on gender",
         x = "Family income", y = "Birth weight")
Birth weight against family income based on gender

Birth weight against family income based on gender

Plotting cigs by motheduc

Again, family income is divided into quantiles; this time quartiles (four equally sized groups) are used, with one panel per quartile.

# Cigarettes smoked per day against the mother's years of education, drawn
# in a separate panel for each family-income quartile (2 rows of panels).
data %>%
    mutate(quantile4 = ntile(faminc, 4)) %>%
    ggplot(aes(x = motheduc, y = cigs)) +
    geom_point() +
    facet_wrap(~quantile4, nrow = 2) +
    theme_light()
number of cigs by years of mother education

number of cigs by years of mother education