In this homework exercise, certain data manipulation tasks are given and handled. These tasks include the use of some of the more common functions in statistics.

Labeling Data

Labeling the imported data

# Load the birth-weight data set shipped with the wooldridge package.
library("wooldridge")
data(bwght)

# Attach a human-readable description to each of the variables listed below.
data <- apply_labels(
  bwght,
  faminc   = "1988 family income, $1000s",
  cigtax   = "cig. tax in home state, 1988",
  cigprice = "cig. price in home state, 1988",
  bwght    = "birth weight, ounces",
  fatheduc = "father's yrs of educ",
  motheduc = "mother's yrs of educ",
  parity   = "birth order of child",
  male     = "=1 if male child",
  white    = "=1 if white",
  cigs     = "cigs smked per day while preg",
  lbwght   = "log of bwght",
  bwghtlbs = "birth weight, pounds",
  packs    = "packs smked per day while preg",
  lfaminc  = "log(faminc)"
)

# Show the variable labels as a one-column lookup table.
knitr::kable(
  label(data),
  col.names = "Full Description",
  caption = "Labeling data variables with full names"
)

Labeling data variables with full names
	Full Description
faminc	1988 family income, $1000s
cigtax	cig. tax in home state, 1988
cigprice	cig. price in home state, 1988
bwght	birth weight, ounces
fatheduc	father’s yrs of educ
motheduc	mother’s yrs of educ
parity	birth order of child
male	=1 if male child
white	=1 if white
cigs	cigs smked per day while preg
lbwght	log of bwght
bwghtlbs	birth weight, pounds
packs	packs smked per day while preg
lfaminc	log(faminc)

Tabulate variables

The variable packs takes only a limited number of distinct values (18). Therefore, a summarized tabulation of lbwght is given against packs.

# Drop every row that has a missing value in any column.
data <- na.omit(data)

# For each distinct value of packs, tabulate the number of observations and
# the mean, median and variance of lbwght. The summary column names are
# replaced by the math-formatted headers passed to kable().
data %>%
  group_by(packs) %>%
  summarise(
    n_obs = n(),
    avg = mean(lbwght),
    med = median(lbwght),
    variance = var(lbwght)
  ) %>%
  knitr::kable(
    col.names = c(
      "Packs",
      "$n_{lbwght}$",
      "$\\mu_{lbwght}$",
      "$me_{lbwght}$",
      "$\\sigma^2_{lbwght}$"
    ),
    caption = "Tabulation of lbwght against packs"
  )

Tabulation of lbwght against packs
Packs	$n_{lbwght}$	$\mu_{lbwght}$	$me_{lbwght}$	$\sigma^2_{lbwght}$
0.00	1030	4.778286	4.804021	0.0350396
0.05	3	4.804115	4.736198	0.0157109
0.10	2	4.754592	4.754592	0.0647148
0.15	5	4.747467	4.779123	0.0375609
0.20	4	4.804990	4.787492	0.0135416
0.25	17	4.718279	4.779123	0.0197346
0.30	6	4.799497	4.827801	0.0345327
0.35	3	4.741390	4.736198	0.0192901
0.40	4	4.451185	4.578259	0.1368672
0.50	45	4.717769	4.727388	0.0292399
0.60	5	4.662641	4.634729	0.0204066
0.75	17	4.675230	4.709530	0.0228788
1.00	42	4.680499	4.731715	0.0352207
1.50	3	4.602582	4.543295	0.0173657
2.00	5	4.621004	4.564348	0.0431191

Summary of Variables

# Summary statistics (first quartile, mean, median, third quartile and
# standard deviation) for a handful of variables, one row per variable.
# The selected columns are stacked into key/value pairs so that a single
# grouped summarise() handles all of them.
# Explicit named summaries are used instead of the deprecated funs() helper;
# the resulting column names (q1, mean, median, q3, sd) are the same.
select(data, cigs, faminc, bwght, motheduc, fatheduc) %>%
  gather() %>%
  group_by(key) %>%
  summarise(
    q1 = quantile(value, 0.25),
    mean = mean(value),
    median = median(value),
    q3 = quantile(value, 0.75),
    sd = sd(value)
  ) %>%
  knitr::kable(caption = "Summary of statistical values for variables")

Summary of statistical values for variables
key	q1	mean	median	q3	sd
bwght	108.0	119.529807	120.0	132.0	20.141239
cigs	0.0	1.769102	0.0	0.0	5.343771
faminc	18.5	32.219144	27.5	42.5	17.956198
fatheduc	12.0	13.191436	12.0	16.0	2.741274
motheduc	12.0	13.125105	12.0	15.0	2.417436

Ratio by Deciles

# Split the observations into ten family-income groups (deciles) and, within
# each group, report the mean mother-to-father education ratio and the mean
# family income.
data %>%
  mutate(quantile = ntile(faminc, 10)) %>%
  group_by(quantile) %>%
  summarise(mean(motheduc / fatheduc), mean(faminc)) %>%
  # col.names must follow the column order produced by summarise() above:
  # income group, education ratio, then family income.
  knitr::kable(caption = "Ratio of Mother's years of 
                 education to father's years of education in different family income deciles",
               col.names = c("Income quantile",
                             "Mean of education years ratio",
                             "Mean of family income"))

Ratio of Mother’s years of education to father’s years of education in different family income deciles
Income quantile	Mean of education years ratio	Mean of family income
1	1.1173627	7.333333
2	1.0374821	14.348740
3	1.0334289	18.743697
4	1.0260207	22.500000
5	1.0444308	26.701681
6	1.0000899	31.281513
7	1.0681879	35.693277
8	1.0177144	42.710084
9	0.9747711	58.088235
10	0.9950973	65.000000

Birth Weight mean and median

# Convert birth weight from ounces to kilograms, then tabulate its mean and
# median for each birth order (parity). The summary expressions are left
# unnamed so the table headers read mean(bwght_kg) and median(bwght_kg).
knitr::kable(
  data %>%
    mutate(bwght_kg = bwght * 0.0283495) %>%
    group_by(parity) %>%
    summarise(mean(bwght_kg), median(bwght_kg))
)

parity	mean(bwght_kg)	median(bwght_kg)
1	3.355020	3.373591
2	3.418711	3.458639
3	3.467555	3.486988
4	3.440684	3.501163
5	3.586212	3.642911
6	3.345241	3.359416

Scatterplot for income

# Scatterplot of family income against the father's years of education,
# overlaid with a fitted linear regression line.
# NOTE(review): this chunk plots the raw `bwght` data rather than the
# NA-omitted `data` object -- confirm this is intended.
ggplot(bwght, aes(x = fatheduc, y = faminc)) +
  geom_point(color = "#0c4c8a") +
  # No `span` argument: it only controls loess smoothing and has no effect on
  # an lm fit. The formula is given as a formula object, not a string.
  geom_smooth(method = "lm", formula = y ~ x) +
  labs(
    title = "Plot of family income against years of education of father",
    x = "Father's years of education",
    y = "Family income"
  ) +
  theme_minimal()

Plot of family income against years of education of father

From the above plot, we can deduce that, on average, family income tends to increase with the years of the father’s education. This could serve as a building block for formulating a hypothesis regarding the causality between years of education and income.

Relation between birth weight and income

Based on race

# Birth weight against family income, one panel per value of `white`.
# Built from ggplot() + geom_point() + facet_grid() rather than the
# deprecated qplot() shortcut.
ggplot(data, aes(x = faminc, y = bwght)) +
  geom_point() +
  facet_grid(. ~ white) +
  theme(
    plot.subtitle = element_text(vjust = 1),
    plot.caption = element_text(vjust = 1),
    panel.grid.major = element_line(
      colour = "olivedrab",
      linetype = "dashed"
    ),
    panel.background = element_rect(
      fill = "aliceblue",
      colour = "dodgerblue3",
      size = 1,
      linetype = "solid"
    ),
    plot.background = element_rect(
      fill = "white",
      colour = "aquamarine4",
      size = 0.9
    )
  ) +
  # Keep the title on a single line: a line break inside the string literal
  # would carry the source indentation into the rendered title.
  labs(
    title = "Birth weight against family income based on race",
    x = "Family income",
    y = "Birth weight"
  )

Birth weight against family income based on race

Based on gender

# Birth weight against family income, one panel per value of `male`.
# Built from ggplot() + geom_point() + facet_grid() rather than the
# deprecated qplot() shortcut.
ggplot(data, aes(x = faminc, y = bwght)) +
  geom_point() +
  facet_grid(. ~ male)

Birth weight against family income based on gender

Plotting cigs by motheduc

Again, quantiles (here, quartiles) are used to divide family income into groups.

# Scatter cigs against motheduc, with one panel per family-income quartile
# (ntile with 4 groups), laid out in two rows.
ggplot(
  mutate(data, quantile4 = ntile(faminc, 4)),
  aes(motheduc, cigs)
) +
  geom_point() +
  facet_wrap(~quantile4, nrow = 2) +
  theme_light()

Number of cigarettes by mother’s years of education

STATA HW 1

Mohammad Ali Mohammadi

October 17, 2018