HW 5 - CI of Estimate

1) Histogram and mean of population

Use the data set gcookbook::heightweight as the ‘population’

# Treat the gcookbook heightweight data set as the full population.
pop <- gcookbook::heightweight

1.1) Calculate the count, mean, and standard deviation of the population weightLb

# Population summary for weightLb: number of rows, plus the mean and standard
# deviation, each rounded to two decimals.
pop_count <- nrow(pop)
pop_mean_wt <- round(mean(pop[["weightLb"]]), digits = 2)
pop_sd_wt <- round(sd(pop[["weightLb"]]), digits = 2)

There are 236 observations in this data set:

Mean weight: 101.01
STD: 18.93

1.2) Plot a density histogram of the population weightLb

# Density histogram of the population weights with the population mean marked
# by a dashed vertical line and a rotated text label.
pop_hist <- ggplot(data = pop, aes(x = weightLb)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, # the stat_bin() default, stated explicitly to silence its message
    color = "black",
    fill = "white"
  ) +
  labs(
    title = "Density Histogram of Weight (lb)",
    x = "Weight (lb)",
    y = "Density"
  ) +
  # The mean is a single constant, so it is set directly instead of being
  # mapped through aes(); mapping it would recycle the value over every row of
  # `pop` and draw the same line once per observation.
  geom_vline(
    xintercept = pop_mean_wt,
    linetype = "dashed",
    size = 0.75
  ) +
  # annotate() draws the label exactly once; geom_text() with the plot data
  # would stack one copy of the label per row.
  annotate(
    "text",
    x = pop_mean_wt,
    y = 0.032,
    label = paste0("Mean: ", pop_mean_wt),
    angle = 90,
    vjust = -0.7,
    size = 3
  )
pop_hist

2) Histogram and mean of sample

2.1) Create a random sample of 15 from the population. Calculate the mean, standard deviation, and .90 and .99 CI of the mean predicted sample weights.

# Draw a simple random sample of 15 observations (without replacement).
# NOTE(review): no set.seed() call is visible here, so each run draws a
# different sample and the reported statistics change between runs.
n <- 15
samp <- pop %>% sample_n(n, replace = FALSE)
samp_count <- nrow(samp)

# Full-precision statistics drive the interval arithmetic; the rounded copies
# are kept for reporting and plot labels only.
samp_mean_raw <- mean(samp$weightLb)
samp_sd_raw <- sd(samp$weightLb)
samp_mean_wt <- round(samp_mean_raw, 2)
samp_sd_wt <- round(samp_sd_raw, 2)

# Standard error of the sample mean, used by both intervals below
samp_se <- samp_sd_raw / sqrt(n)

# 90% confidence interval. The upper-tail quantile (1 - alpha / 2) is used so
# the critical value and margin of error are positive and the bounds read
# naturally as mean -/+ margin.
alpha_90 <- 0.1
t_score_90 <- qt(p = 1 - alpha_90 / 2, df = n - 1)
margin_error_90 <- t_score_90 * samp_se
lbound_90 <- round(samp_mean_raw - margin_error_90, 1)
ubound_90 <- round(samp_mean_raw + margin_error_90, 1)

# 99% confidence interval, same construction with a smaller alpha
alpha_99 <- 0.01
t_score_99 <- qt(p = 1 - alpha_99 / 2, df = n - 1)
margin_error_99 <- t_score_99 * samp_se
lbound_99 <- round(samp_mean_raw - margin_error_99, 1)
ubound_99 <- round(samp_mean_raw + margin_error_99, 1)

There are 15 randomly selected observations in this sample data set

Sample mean: 100.7
STD: 19.75
90% confidence interval:
- Upper bounds: 109.7
- Lower bounds: 91.7
99% confidence interval:
- Upper bounds: 115.9
- Lower bounds: 85.5

2.2) Overlay sample data set calculations over original histogram

I built the subtitle text for my plot by pasting together labels and the computed observation counts.

# Subtitle text reporting the observation counts of both data sets. It is kept
# as a plain character string because it is later passed to
# labs(subtitle = ...), which expects text rather than the label object that
# ggtitle() returns.
plot_subtitle <- paste(
  "Population (black):", pop_count, "obs.",
  "\nSample (red):", samp_count, "obs."
)

To create the combined plots, I used the original geom_histogram plot for the overall population, and geom_density for the sample data. Then I added all of the required vlines and labels.

Note: Because the means for both data sets ended up nearly on top of each other, I had to format them differently than I normally would have. I used solid lines for the means because they were easier to see than dashed lines. I then used dashed lines for the upper and lower CI bounds to distinguish them from the means.

# Population density histogram with the sample density overlaid, plus
# reference lines and labels for both means and the 90% / 99% confidence
# bounds of the sample mean.
#
# Every reference value is a single constant, so the lines use a directly set
# `xintercept` and the labels use annotate(). Mapping constants through aes()
# would recycle them over every row of `pop`, drawing each line and label once
# per observation (which also cancels out any alpha transparency).
combined_plots <- ggplot(data = pop, aes(x = weightLb)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20,
    color = "white",
    fill = "black"
  ) +
  # Sample distribution as a translucent filled density with no outline
  geom_density(
    data = samp,
    aes(x = weightLb),
    color = NA,
    fill = "red3",
    alpha = .25
  ) +
  labs(
    title = "Density Histogram of Weight (lb)",
    subtitle = plot_subtitle,
    x = "Weight (lb)",
    y = "Density"
  ) +
  # Means: solid lines (population in black, sample in translucent red)
  geom_vline(xintercept = pop_mean_wt, size = 1) +
  geom_vline(
    xintercept = samp_mean_wt,
    color = "red3",
    size = .75,
    alpha = .75
  ) +
  # Confidence bounds: dashed lines (90% in blue, 99% in green)
  geom_vline(
    xintercept = c(lbound_90, ubound_90),
    color = "blue",
    linetype = "dashed"
  ) +
  geom_vline(
    xintercept = c(lbound_99, ubound_99),
    color = "seagreen",
    linetype = "dashed"
  ) +
  # Mean labels sit to the left of their lines (hjust = 1 is right-aligned)
  annotate(
    "label",
    x = pop_mean_wt,
    y = .03,
    label = paste0("Pop. mean: ", pop_mean_wt),
    size = 2.5,
    color = "white",
    fill = "black",
    hjust = 1
  ) +
  annotate(
    "label",
    x = samp_mean_wt,
    y = .028,
    label = paste0("Samp. mean: ", samp_mean_wt),
    size = 2.5,
    color = "red3",
    hjust = 1
  ) +
  # CI labels sit to the right of the upper bounds (hjust = 0 is left-aligned)
  annotate(
    "label",
    x = ubound_90,
    y = .025,
    label = paste0("90%CI: [", lbound_90, ", ", ubound_90, "]"),
    size = 2.5,
    color = "blue",
    hjust = 0
  ) +
  annotate(
    "label",
    x = ubound_99,
    y = .023,
    label = paste0("99%CI: [", lbound_99, ", ", ubound_99, "]"),
    size = 2.5,
    color = "seagreen",
    hjust = 0
  ) +
  theme_bw() +
  theme(plot.subtitle = element_text(size = 9, color = "gray31"))
combined_plots

HW 5 - CI of Estimate

Steph Bradley

2022-02-21

1) Histogram and mean of population

2) Histogram and mean of sample