pacman::p_load(tidyverse, openintro, knitr)  # an alternative way of loading libraries
                                             # you'll need to install the pacman package
# knitr package has the kable() function which outputs data in nicely formatted tables see exercised 3 & 4

glimpse(arbuthnot)
## Rows: 82
## Columns: 3
## $ year  <int> 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639~
## $ boys  <int> 5218, 4858, 4422, 4994, 5158, 5035, 5106, 4917, 4703, 5359, 5366~
## $ girls <int> 4683, 4457, 4102, 4590, 4839, 4820, 4928, 4605, 4457, 4952, 4784~

Exercise 1 - view the column of girls data in the arbuthnot file

arbuthnot$girls
##  [1] 4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 4784 5332 5200 4910 4617
## [16] 3997 3919 3395 3536 3181 2746 2722 2840 2908 2959 3179 3349 3382 3289 3013
## [31] 2781 3247 4107 4803 4881 5681 4858 4319 5322 5560 5829 5719 6061 6120 5822
## [46] 5738 5717 5847 6203 6033 6041 6299 6533 6744 7158 7127 7246 7119 7214 7101
## [61] 7167 7302 7392 7316 7483 6647 6713 7229 7767 7626 7452 7061 7514 7656 7683
## [76] 5738 7779 7417 7687 7623 7380 7288

Exercise 2 - dot and line graphs for girls data

The graph shows a general upward trend for the number of girls baptized overall. With a severe decline from 1640 to 1660, another severe drop in the mid 1660s, and another in the early 1700s, corresponding with plague epidemics.

G1 <- ggplot(data = arbuthnot)    #use the arbuthnot data assign it to G1 variable

G1 + geom_point(aes(x = year, y = girls),                # dot plot of girls data
                color = "hotpink", size = 2) +           # make the dots pink and larger
   labs(title = "Baptismal trend for girls, dotplot")    # add plot title

G1 + geom_line(aes(x = year, y = girls),                 # line graph of girls data
                color = "hotpink1", size = 1.25) +       # make the line pink and thicker
   labs(title = "Baptismal trend for girls, line graph") # add title

Exercise 3 - proportion of boys baptized

The ratio of boys baptized is slightly greater than 50%, with a downward trend toward 51% in the early 1700’s.

arbuthnot <- arbuthnot %>%
  mutate(total = boys + girls)            # add total column to arbuthnot

arbuthnot <- arbuthnot %>%  
             mutate( boy_ratio = boys / total)    # add boys ration column

G1 <- ggplot(data = arbuthnot)                    # use updated arbuthnot file for G1

G1 + geom_line(aes(x = year, y = boys / total),      # line graph of boys ratio column
               color = "dodgerblue", size = 1.25) +  # make the line blue and thicker
   labs(title = "Baptismal proportion for boys")     # add title

arbuthnot %>%
 summarize(Minimum = min(boys), Maximum = max(boys)) %>%  # present min & max boys data
  kable(caption = "Boys baptized")                        # in a table with a title
Boys baptized
Minimum Maximum
2890 8426

Exercise 4 - describe the present dataset and find it’s date range

There are three columns and 63 rows in the present dataset. The columns are year, boys and girls. The years in the dataset run from 1940 to 2002.

glimpse(present)     # look at the present file
## Rows: 63
## Columns: 3
## $ year  <dbl> 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950~
## $ boys  <dbl> 1211684, 1289734, 1444365, 1508959, 1435301, 1404587, 1691220, 1~
## $ girls <dbl> 1148715, 1223693, 1364631, 1427901, 1359499, 1330869, 1597452, 1~
present %>%                       # find the first & last year in the present data file
  summarize(min = min(year),
            max = max(year)
            ) %>%                              # present min & max boys data
      kable(caption = "present data date range",
            col.names = c("From", "To"))             # in a table with a title
present data date range
From To
1940 2002

Exercise 5 - compare the counts in arbuthnot and present datasets

The birth counts for the present dataset are much larger than the baptismal counts in the arbuthnot dataset. This isn’t surprising, the arbuthnot data is for a single city in the 17th and 18th centuries, while the present data is for the US in the 20th and 21st centuries. The minimal values for present data are more than 400 times greater than those in the arbuthnot file, and the maximum values are more than 250 times greater.

min(present$boys + present$girls) / min(arbuthnot$total)  # proportions of present data to 
## [1] 420.5985
                                                          # arbuthnot data
max(present$boys + present$girls) / max(arbuthnot$total)
## [1] 264.3745

Exercise 6 - graph the proportion of boys data in the present dataset

The proportion of boys born in the US is slightly higher than 51%, showing spikes after WWII and the Viet Nam war. Overall the is a trend toward 50%.

present <- present %>%
  mutate(total = boys + girls)                            # add total column to present data

present <- present %>% mutate( boy_ratio = boys / total)  # add ratio of boys to present data

G1 <- ggplot(data = present)
G1 + geom_line(aes(x = year, y = boy_ratio),              # plot the US boys proportion
               color = "blue", size = 1.25) +             # make the line blue, and give it a size
   labs(title = "Proportion of boys in US Births")        # give the graph a title

Exercise 7 - Sort the present dataset and find the maximum of total births

p1 <- present %>%
  arrange(desc(total))         # sort the present data descending on the total column

  head(p1)                     # look at the beginning of the data to find the max total births
## # A tibble: 6 x 5
##    year    boys   girls   total boy_ratio
##   <dbl>   <dbl>   <dbl>   <dbl>     <dbl>
## 1  1961 2186274 2082052 4268326     0.512
## 2  1960 2179708 2078142 4257850     0.512
## 3  1957 2179960 2074824 4254784     0.512
## 4  1959 2173638 2071158 4244796     0.512
## 5  1958 2152546 2051266 4203812     0.512
## 6  1962 2132466 2034896 4167362     0.512

The greatest number of total births in the US was 4,268,326 in 1961.