Use the data set gcookbook::heightweight as the ‘population’
# Treat the heightweight data set from the gcookbook package as the population.
pop <- gcookbook::heightweight
1.1) Calculate the count, mean, and standard deviation of the population weightLb
# Population size, plus mean and standard deviation of weight in pounds
# (both rounded to two decimal places).
pop_count <- nrow(pop)
pop_mean_wt <- round(mean(pop[["weightLb"]]), digits = 2)
pop_sd_wt <- round(sd(pop[["weightLb"]]), digits = 2)
There are 236 observations in this data set.
1.2) Plot a density histogram of the population weightLb
# Density histogram of the population weight with the population mean marked.
#
# The mean line and its label are single annotations, so they are supplied as
# fixed values (the `xintercept` parameter and annotate()) instead of being
# mapped inside aes(). Constants mapped in aes() are evaluated against the
# inherited `pop` data, which draws the same line and text once per row and
# leaves the text looking heavy and jagged from overplotting.
pop_hist <- ggplot(data = pop, aes(x = weightLb)) +
  geom_histogram(
    aes(y = ..density..),
    color = "black",
    fill = "white"
  ) +
  labs(
    title = "Density Histogram of Weight (lb)",
    x = "Weight (lb)",
    y = "Density"
  ) +
  # Dashed vertical line at the population mean.
  geom_vline(
    xintercept = pop_mean_wt,
    linetype = "dashed",
    size = 0.75
  ) +
  # Rotated label running alongside the mean line.
  annotate(
    "text",
    x = pop_mean_wt,
    y = 0.032,
    label = paste0("Mean: ", pop_mean_wt),
    angle = 90,
    vjust = -0.7,
    size = 3
  )
pop_hist
2.1) Create a random sample of 15 from the population. Calculate the sample mean and standard deviation of weight, and the 90% and 99% confidence intervals (CI) for the mean weight.
# Establish sample data set of 15 observations, drawn without replacement.
# NOTE: the draw is not seeded, so the sample (and everything derived from it)
# changes on every run; call set.seed() beforehand for reproducible output.
n <- 15
samp <- pop %>% sample_n(n, replace = FALSE)
samp_count <- nrow(samp)

# Rounded summary statistics, kept at two decimals for reporting and labels.
samp_mean_wt <- round(mean(samp$weightLb), 2)
samp_sd_wt <- round(sd(samp$weightLb), 2)

# Unrounded mean and standard error for the interval calculations below, so
# that display rounding does not leak into the confidence bounds.
samp_mean_raw <- mean(samp$weightLb)
samp_se <- sd(samp$weightLb) / sqrt(n)

# Find lower and upper bounds of 90% confidence interval.
# The upper-tail quantile (1 - alpha / 2) gives a positive critical value, so
# the margin of error is positive and the bounds follow the usual
# mean -/+ margin form.
alpha_90 <- 0.1
t_score_90 <- qt(p = 1 - alpha_90 / 2, df = n - 1)
margin_error_90 <- t_score_90 * samp_se
lbound_90 <- round(samp_mean_raw - margin_error_90, 1)
ubound_90 <- round(samp_mean_raw + margin_error_90, 1)

# Find lower and upper bounds of 99% confidence interval.
alpha_99 <- 0.01
t_score_99 <- qt(p = 1 - alpha_99 / 2, df = n - 1)
margin_error_99 <- t_score_99 * samp_se
lbound_99 <- round(samp_mean_raw - margin_error_99, 1)
ubound_99 <- round(samp_mean_raw + margin_error_99, 1)
There are 15 randomly selected observations in this sample data set
2.2) Overlay sample data set calculations over original histogram
I used ggtitle() to create a subtitle label for my plot that includes text and values.
# Subtitle text for the combined plot: observation counts for the population
# and for the sample. labs(subtitle = ) expects text, so this is stored as a
# plain character string rather than wrapped in a ggtitle() labels object.
plot_subtitle <- paste("Population (black):", pop_count, "obs.", "\nSample (red):", samp_count, "obs.")
To create the combined plots, I used the original geom_histogram plot for the overall population, and geom_density for the sample data. Then I added all of the required vlines and labels.
Note: Because the means for both data sets ended up nearly on top of each other, I had to format them differently than I normally would have. I used solid lines for the means because they were easier to see than dashed lines. I then used dashed lines for the upper and lower CI bounds to distinguish them from the means.
# Population density histogram with the sample density overlaid, plus
# reference lines and labels for both means and for the 90% / 99% confidence
# bounds of the sample mean.
#
# Every reference line and label is a single annotation, so each is supplied
# as a fixed value (the `xintercept` parameter and annotate()) instead of
# being mapped inside aes(). Constants mapped in aes() are evaluated against
# the inherited `pop` data, which draws each line and label once per row; that
# overplotting roughens the text and cancels the transparency requested for
# the sample-mean line.
#
# NOTE(review): `..density..` and `size` on line geoms are deprecated in
# recent ggplot2 releases (after_stat(density) / linewidth); they are left
# unchanged here because the ggplot2 version in use is not visible.
combined_plots <- ggplot(data = pop, aes(x = weightLb)) +
  # Population: density-scaled histogram.
  geom_histogram(
    aes(y = ..density..),
    bins = 20,
    color = "white",
    fill = "black"
  ) +
  # Sample: translucent filled density curve with no outline.
  geom_density(
    data = samp,
    aes(x = weightLb),
    color = NA,
    fill = "red3",
    alpha = .25
  ) +
  labs(
    title = "Density Histogram of Weight (lb)",
    subtitle = plot_subtitle,
    x = "Weight (lb)",
    y = "Density"
  ) +
  # Solid lines mark the population and sample means.
  geom_vline(
    xintercept = pop_mean_wt,
    size = 1
  ) +
  geom_vline(
    xintercept = samp_mean_wt,
    color = "red3",
    size = .75,
    alpha = .75
  ) +
  # Dashed lines mark the confidence bounds: blue for 90%, green for 99%.
  geom_vline(
    xintercept = lbound_90,
    color = "blue",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = ubound_90,
    color = "blue",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = lbound_99,
    color = "seagreen",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = ubound_99,
    color = "seagreen",
    linetype = "dashed"
  ) +
  # Mean labels sit to the left of their lines; interval labels sit to the
  # right of the corresponding `ubound_*` line.
  annotate(
    "label",
    x = pop_mean_wt,
    y = .03,
    label = paste0("Pop. mean: ", pop_mean_wt),
    size = 2.5,
    color = "white",
    fill = "black",
    hjust = "right"
  ) +
  annotate(
    "label",
    x = samp_mean_wt,
    y = .028,
    label = paste0("Samp. mean: ", samp_mean_wt),
    size = 2.5,
    color = "red3",
    hjust = "right"
  ) +
  annotate(
    "label",
    x = ubound_90,
    y = .025,
    label = paste0("90%CI: [", lbound_90, ", ", ubound_90, "]"),
    size = 2.5,
    color = "blue",
    hjust = "left"
  ) +
  annotate(
    "label",
    x = ubound_99,
    y = .023,
    label = paste0("99%CI: [", lbound_99, ", ", ubound_99, "]"),
    size = 2.5,
    color = "seagreen",
    hjust = "left"
  ) +
  theme_bw() +
  theme(plot.subtitle = element_text(size = 9, color = "gray31"))
combined_plots