Intro to R and RStudio: Lab 1

Install the necessary packages for this lab

# install.packages("tidyverse")
# install.packages("openintro")
# The install lines above are commented out because the packages are already installed

Load the packages using library()

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

Dr. Arbuthnot’s Baptism Records

Load the data

# Copy the Arbuthnot baptism records from the openintro package into the
# workspace. Naming the package explicitly (instead of assigning the object to
# itself) means that re-running this line always restores the original data,
# even after extra columns have been added to the workspace copy.
arbuthnot <- openintro::arbuthnot

Summary of the dataset

# Print a compact overview of the data: dimensions, column names, column
# types, and the first few values of each column.
arbuthnot %>%
  glimpse()
## Rows: 82
## Columns: 3
## $ year  <int> 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639…
## $ boys  <int> 5218, 4858, 4422, 4994, 5158, 5035, 5106, 4917, 4703, 5359, 5366…
## $ girls <int> 4683, 4457, 4102, 4590, 4839, 4820, 4928, 4605, 4457, 4952, 4784…

Some Exploration

# Extract the yearly boys counts from the data frame as a plain vector.
boys <- arbuthnot[["boys"]]

Exercise 1

# Extract the yearly girls counts from the data frame as a plain vector.
girls <- arbuthnot[["girls"]]

# Scatter plot of the girls count (y) against year (x).
arbuthnot %>%
  ggplot(aes(x = year, y = girls)) +
  geom_point()

# The same relationship drawn as a line graph.
arbuthnot %>%
  ggplot(aes(x = year, y = girls)) +
  geom_line()

Exercise 2

# Open the help page for ggplot().
help("ggplot")
# R can be used as a calculator: total number of baptisms in 1629, i.e. the
# first boys count (5218) plus the first girls count (4683) from the data.
5218 + 4683
## [1] 9901
# Vectorised version of the same sum: one total per year in the data set.
with(arbuthnot, boys + girls)
##  [1]  9901  9315  8524  9584  9997  9855 10034  9522  9160 10311 10150 10850
## [13] 10670 10370  9410  8104  7966  7163  7332  6544  5825  5612  6071  6128
## [25]  6155  6620  7004  7050  6685  6170  5990  6971  8855 10019 10292 11722
## [37]  9972  8997 10938 11633 12335 11997 12510 12563 11895 11851 11775 12399
## [49] 12626 12601 12288 12847 13355 13653 14735 14702 14730 14694 14951 14588
## [61] 14771 15211 15054 14918 15159 13632 13976 14861 15829 16052 15363 14639
## [73] 15616 15687 15448 11851 16145 15369 16066 15862 15220 14928
# Store the yearly totals as a permanent `total` column in the data frame so
# that they can be used for plotting.
arbuthnot <- mutate(arbuthnot, total = boys + girls)
# Line plot of the total number of baptisms per year.
arbuthnot %>%
  ggplot(aes(x = year, y = total)) +
  geom_line()

# Boy-to-girl ratio for 1629, calculated by hand.
5218 / 4683
## [1] 1.114243
# The same ratio for every year, stored as a new column.
arbuthnot <- mutate(arbuthnot, boy_to_girl_ratio = boys / girls)
# Proportion of newborns that are boys in 1629, calculated by hand.
5218 / (5218 + 4683)
## [1] 0.5270175
# The same proportion for every year, stored as a new column.
arbuthnot <- mutate(arbuthnot, boy_ratio = boys / total)

Exercise 3

# Line plot of the proportion of boys born over time.
arbuthnot %>%
  ggplot(aes(x = year, y = boys / total)) +
  geom_line()

# New logical column `more_boys`: TRUE for years in which the number of boys
# is greater than the number of girls.
arbuthnot <- mutate(arbuthnot, more_boys = boys > girls)

More Practice

# Smallest and largest value of the boys column.
summarise(
  arbuthnot,
  min = min(boys),
  max = max(boys)
)
## # A tibble: 1 × 2
##     min   max
##   <int> <int>
## 1  2890  8426

Exercise 4

Load ‘present’ data

# Copy the `present` birth records from the openintro package into the
# workspace. Naming the package explicitly (instead of assigning the object to
# itself) means that re-running this line always restores the original data,
# even after extra columns have been added to the workspace copy.
present <- openintro::present
# Number of rows and columns in the data set.
dim(present)
## [1] 63  3

Years 1940–2002 are included in this dataset. Dimensions are 63 x 3. Variable names: year, boys, girls.

Exercise 5

How do these counts compare to Arbuthnot’s? Are they of similar magnitude?

# Yearly totals for `present`, followed by the yearly totals for `arbuthnot`,
# printed one after the other so their magnitudes can be compared.
with(present, boys + girls)
##  [1] 2360399 2513427 2808996 2936860 2794800 2735456 3288672 3699940 3535068
## [10] 3559529 3554149 3750850 3846986 3902120 4017362 4047295 4163090 4254784
## [19] 4203812 4244796 4257850 4268326 4167362 4098020 4027490 3760358 3606274
## [28] 3520959 3501564 3600206 3731386 3555970 3258411 3136965 3159958 3144198
## [37] 3167788 3326632 3333279 3494398 3612258 3629238 3680537 3638933 3669141
## [46] 3760561 3756547 3809394 3909510 4040958 4158212 4110907 4065014 4000240
## [55] 3952767 3899589 3891494 3880894 3941553 3959417 4058814 4025933 4021726
with(arbuthnot, boys + girls)
##  [1]  9901  9315  8524  9584  9997  9855 10034  9522  9160 10311 10150 10850
## [13] 10670 10370  9410  8104  7966  7163  7332  6544  5825  5612  6071  6128
## [25]  6155  6620  7004  7050  6685  6170  5990  6971  8855 10019 10292 11722
## [37]  9972  8997 10938 11633 12335 11997 12510 12563 11895 11851 11775 12399
## [49] 12626 12601 12288 12847 13355 13653 14735 14702 14730 14694 14951 14588
## [61] 14771 15211 15054 14918 15159 13632 13976 14861 15829 16052 15363 14639
## [73] 15616 15687 15448 11851 16145 15369 16066 15862 15220 14928

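The counts in present are much larger (millions of births per year rather than thousands), so they are not of similar magnitude. This is because present captures births for all of the US rather than just London.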

Exercise 6

# Add a `total` column: boys plus girls for each year.
present <- mutate(present, total = boys + girls)
# Scatter plot of the proportion of boys born over time.
present %>%
  ggplot(aes(x = year, y = boys / total)) +
  geom_point()

The proportion of boys born decreases over time.

# New logical column `more_boys`: TRUE for years in which the number of boys
# is greater than the number of girls.
present <- mutate(present, more_boys = boys > girls)

Arbuthnot’s observation that boys outnumber girls holds true in the US data as well.

Exercise 7

# Sort the years from the highest to the lowest total number of births.
arrange(present, desc(total))
## # A tibble: 63 × 5
##     year    boys   girls   total more_boys
##    <dbl>   <dbl>   <dbl>   <dbl> <lgl>    
##  1  1961 2186274 2082052 4268326 TRUE     
##  2  1960 2179708 2078142 4257850 TRUE     
##  3  1957 2179960 2074824 4254784 TRUE     
##  4  1959 2173638 2071158 4244796 TRUE     
##  5  1958 2152546 2051266 4203812 TRUE     
##  6  1962 2132466 2034896 4167362 TRUE     
##  7  1956 2133588 2029502 4163090 TRUE     
##  8  1990 2129495 2028717 4158212 TRUE     
##  9  1991 2101518 2009389 4110907 TRUE     
## 10  1963 2101632 1996388 4098020 TRUE     
## # … with 53 more rows

1961 had the highest number of births (4,268,326).