library(tidyverse)
library(openintro)
Exercise 1
Extract counts of girls baptized
## [1] 4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 4784 5332 5200 4910 4617
## [16] 3997 3919 3395 3536 3181 2746 2722 2840 2908 2959 3179 3349 3382 3289 3013
## [31] 2781 3247 4107 4803 4881 5681 4858 4319 5322 5560 5829 5719 6061 6120 5822
## [46] 5738 5717 5847 6203 6033 6041 6299 6533 6744 7158 7127 7246 7119 7214 7101
## [61] 7167 7302 7392 7316 7483 6647 6713 7229 7767 7626 7452 7061 7514 7656 7683
## [76] 5738 7779 7417 7687 7623 7380 7288
Data Visualization
# Scatter plot: number of girls baptized in each year.
arbuthnot %>%
  ggplot(aes(x = year, y = girls)) +
  geom_point()

Plot with line
# Same data drawn as a line so the year-to-year trend is easier to follow.
arbuthnot %>%
  ggplot(aes(x = year, y = girls)) +
  geom_line()

Exercise 2
Apparent trend in the number of girls baptized
# Add the total number of baptisms (boys plus girls) as a new column.
arbuthnot <- mutate(arbuthnot, total = boys + girls)

# Line plot of total baptisms per year.
arbuthnot %>%
  ggplot(aes(x = year, y = total)) +
  geom_line()

# Derive two columns in a single step: the boy-to-girl ratio and the
# share of boys among all baptisms (uses the existing `total` column).
arbuthnot <- arbuthnot %>%
  mutate(
    boy_to_girl_ratio = boys / girls,
    boy_ratio = boys / total
  )
Exercise 3
Plot proportion of boys over time
# Line plot of the proportion of boys among all baptisms, by year.
arbuthnot %>%
  ggplot(aes(x = year, y = boy_ratio)) +
  geom_line()

Add a column with logical data
# Logical column: TRUE for years in which boys outnumber girls.
arbuthnot <- mutate(arbuthnot, more_boys = boys > girls)
Find min and max values of columns
# Smallest and largest yearly count of boys baptized.
summarise(arbuthnot, min = min(boys), max = max(boys))
## # A tibble: 1 x 2
## min max
## <int> <int>
## 1 2890 8426
Exercise 4
Explore new data frame
# Years covered by the `present` data set.
present[["year"]]
## [1] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954
## [16] 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969
## [31] 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984
## [46] 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999
## [61] 2000 2001 2002
The present data set includes years from 1940 to 2002.
# Dimensions of the data frame (rows, columns). Call the generic `dim()`
# rather than the `dim.data.frame` method directly, so S3 dispatch picks
# the appropriate method for whatever class `present` has.
dim(present)
## [1] 63 3
# Column names of the `present` data set.
names(present)
## [1] "year" "boys" "girls"
Exercise 5
Compare Arbuthnot’s and present data set counts.
# Years covered by the Arbuthnot data set.
arbuthnot[["year"]]
## [1] 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643
## [16] 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658
## [31] 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673
## [46] 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688
## [61] 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703
## [76] 1704 1705 1706 1707 1708 1709 1710
# Dimensions of the Arbuthnot data frame (rows, columns). Call the generic
# `dim()` rather than the `dim.data.frame` method directly, so S3 dispatch
# picks the appropriate method for whatever class `arbuthnot` has.
dim(arbuthnot)
## [1] 82 7
The Arbuthnot data set includes years from 1629 to 1710. Thus, the Arbuthnot data set covers 82 different years, while the present data set covers 63 different years.
They are not of a similar magnitude. The Arbuthnot data set is 19 years longer than the present data set.
Exercise 6
Proportion of boys over time.
# Add the yearly total of births and the share of boys in one step;
# `boy_ratio` uses the `total` column created just before it.
present <- present %>%
  mutate(
    total = boys + girls,
    boy_ratio = boys / total
  )

# Line plot of the proportion of boys born per year.
present %>%
  ggplot(aes(x = year, y = boy_ratio)) +
  geom_line()

The proportion of boys born declines from 1940 to the late 60’s, then increases a bit between the late 60’s and the mid 70’s, then starts decreasing again. Overall, we can say that the proportion of boys born decreases over time.
Boys to girls ratio in US
# Flag each year in which more boys than girls were born, then show the flags.
present <- mutate(present, more_boys = boys > girls)
present[["more_boys"]]
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [46] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [61] TRUE TRUE TRUE
As we can see, that is true. Arbuthnot’s observation about boys being born in greater proportion than girls holds up in the U.S. as well.
# Add the boy-to-girl ratio and plot it over time.
present <- mutate(present, boy_to_girl_ratio = boys / girls)
present %>%
  ggplot(aes(x = year, y = boy_to_girl_ratio)) +
  geom_line()

Exercise 7
Year with the most births in the U.S.
# Sort years from the highest to the lowest total number of births.
arrange(present, desc(total))
## # A tibble: 63 x 7
## year boys girls total boy_ratio more_boys boy_to_girl_ratio
## <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <dbl>
## 1 1961 2186274 2082052 4268326 0.512 TRUE 1.05
## 2 1960 2179708 2078142 4257850 0.512 TRUE 1.05
## 3 1957 2179960 2074824 4254784 0.512 TRUE 1.05
## 4 1959 2173638 2071158 4244796 0.512 TRUE 1.05
## 5 1958 2152546 2051266 4203812 0.512 TRUE 1.05
## 6 1962 2132466 2034896 4167362 0.512 TRUE 1.05
## 7 1956 2133588 2029502 4163090 0.513 TRUE 1.05
## 8 1990 2129495 2028717 4158212 0.512 TRUE 1.05
## 9 1991 2101518 2009389 4110907 0.511 TRUE 1.05
## 10 1963 2101632 1996388 4098020 0.513 TRUE 1.05
## # ... with 53 more rows
As we can see from the above, the highest total number of births in the U.S. was in 1961.
LS0tDQp0aXRsZTogIkxhYiAxOiBJbnRybyB0byBSIg0KYXV0aG9yOiAiSmVyZWQgQXRha3kiDQpkYXRlOiAiOS82LzIwMjAiDQpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydA0KLS0tDQoNCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkob3BlbmludHJvKQ0KYGBgDQoNCiMjIyBFeGVyY2lzZSAxDQpFeHRyYWN0IGNvdW50cyBvZiBnaXJscyBiYXB0aXplZA0KDQpgYGB7ciB2aWV3LWdpcmxzLWNvdW50c30NCmFyYnV0aG5vdCRnaXJscw0KYGBgDQoNCioqRGF0YSBWaXN1YWxpemF0aW9uKioNCg0KYGBge3J9DQoNCmdncGxvdChkYXRhID0gYXJidXRobm90LCBhZXMoeCA9IHllYXIsIHkgPSBnaXJscykpICsgZ2VvbV9wb2ludCgpDQoNCg0KYGBgDQoNCg0KDQoqKipQbG90IHdpdGggbGluZSoqKg0KDQpgYGB7cn0NCg0KZ2dwbG90KGRhdGEgPSBhcmJ1dGhub3QsIGFlcyh4PXllYXIsIHk9Z2lybHMpKSArIGdlb21fbGluZSgpDQoNCmBgYA0KDQoNCg0KIyMjIEV4ZXJjaXNlIDINCg0KQXBwYXJlbnQgdHJlbmQgaW4gdGhlIG51bWJlciBvZiBnaXJscyBiYXB0aXplZA0KDQpgYGB7ciB0cmVuZC1naXJsc30NCg0KIyBBZGRpbmcgbmV3IHZhcmlhYmxlIHRvIHRoZSBkYXRhIGZyYW1lDQoNCmFyYnV0aG5vdCA8LSBhcmJ1dGhub3QgJT4lDQogIG11dGF0ZSh0b3RhbCA9IGJveXMgKyBnaXJscykNCg0KIyBQbG90IGZvciB0aGUgdHJlbmQNCg0KZ2dwbG90KGRhdGEgPSBhcmJ1dGhub3QsIGFlcyh4PXllYXIsIHk9IHRvdGFsKSkgKyBnZW9tX2xpbmUoKQ0KYGBgDQpgYGB7cn0NCiMgQ3JlYXRlIHR3byBuZXcgdmFyaWFibGVzIG9uIGRhdGEgZnJhbWUNCg0KYXJidXRobm90IDwtIGFyYnV0aG5vdCAlPiUNCiAgbXV0YXRlKGJveV90b19naXJsX3JhdGlvID0gYm95cyAvIGdpcmxzKQ0KDQphcmJ1dGhub3QgPC0gYXJidXRobm90ICU+JQ0KICBtdXRhdGUoYm95X3JhdGlvID0gYm95cyAvdG90YWwpDQoNCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSAzDQoNCioqKlBsb3QgcHJvcG9ydGlvbiBvZiBib3lzIG92ZXIgdGltZSoqKg0KDQpgYGB7ciBwbG90LXByb3AtYm95cy1hcmJ1dGhub3R9DQpnZ3Bsb3QoZGF0YSA9IGFyYnV0aG5vdCwgYWVzKHggPSB5ZWFyLCB5ID0gYm95X3JhdGlvKSkgKyBnZW9tX2xpbmUoKQ0KDQpgYGANCg0KKioqQWRkIGEgZGF0YSBmcmFtZSB3aXRoIGxvZ2ljYWwgZGF0YSoqKg0KYGBge3J9DQphcmJ1dGhub3QgPC0gYXJidXRobm90ICU+JQ0KICBtdXRhdGUobW9yZV9ib3lzID0gYm95cyA+IGdpcmxzKQ0KDQpgYGANCg0KKioqRmluZCBtaW4gYW5kIG1heCB2YWx1ZXMgb2YgY29sdW1ucyoqKg0KYGBge3J9DQoNCmFyYnV0aG5vdCAlPiUNCiAgc3VtbWFyaXNlKG1pbiA9IG1pbihib3lzKSwgbWF4ID0gbWF4KGJveXMpKQ0KYGBgDQojIyMgRXhlcmNpc2UgNA0KDQpFeHBsb3JlIG5ldyBkYXRhIGZyYW1lDQoNCmBgYHtyIGRpbS1wcmVzZW50fQ0KIyBTaG93IHll
YXJzIGluY2x1ZGVkIGluIHRoZSBkYXRhIHNldA0KcHJlc2VudCR5ZWFyDQpgYGANClRoZSBwcmVzZW50IGRhdGEgc2V0IGluY2x1ZGVzIHllYXJzIGZyb20gMTk0MCB0byAyMDAyLg0KDQpgYGB7cn0NCiMgRGltZW5zaW9ucyBvZiBkYXRhIGZyYW1lDQpkaW0uZGF0YS5mcmFtZShwcmVzZW50KQ0KDQpgYGANCg0KYGBge3J9DQojIFJldHJpZXZlIGNvbHVtbnMgbmFtZXMNCmNvbG5hbWVzKHByZXNlbnQpDQoNCmBgYA0KIyMjIEV4ZXJjaXNlIDUNCg0KQ29tcGFyZSBBcmJ1dGhub3QncyBhbmQgcHJlc2VudCBkYXRhIHNldCBjb3VudHMuDQoNCmBgYHtyIGNvdW50LWNvbXBhcmV9DQojIEFyYnV0aG5vdCdzIGRpbQ0KDQphcmJ1dGhub3QkeWVhcg0KYGBgDQoNCmBgYHtyfQ0KDQojIEFyYnV0aG5vdCBkaW0NCg0KZGltLmRhdGEuZnJhbWUoYXJidXRobm90KQ0KYGBgDQoNCkFyYnV0aG5vdCBkYXRhIHNldCBpbmNsdWRlcyB5ZWFyIGZyb20gMTYyOSB0byAxNzEwLg0KVGh1cywgQXJidXRobm90IGRhdGEgc2V0IGhhcyA4MSBkaWZmZXJlbnQgeWVhcnMNCndoaWxlIFRoZSBwcmVzZW50IGRhdGEgc2V0IGhhcyA2MiBkaWZmZXJlbnQgeWVhcnMuDQoNClRoZXkgYXJlIG5vdCBvZiBhIHNpbWlsYXIgbWFnbml0dWRlLg0KQXJidXRobm90IGRhdGEgc2V0IGlzIDE5IHllYXJzIGxvbmdlciB0aGFuIHByZXNlbnQgZGF0YSBzZXQuDQoNCg0KIyMjIEV4ZXJjaXNlIDYNCg0KUHJvcG9ydGlvbiBvZiBib3lzIG92ZXIgdGltZS4NCg0KYGBge3IgcGxvdC1wcm9wLWJveXMtcHJlc2VudH0NCiMgQ3JlYXRlIDIgbmV3IHZhcmlhYmxlcyBpbiBwcmVzZW50IGRhdGEgc2V0DQoNCnByZXNlbnQgPC0gcHJlc2VudCAlPiUNCiAgbXV0YXRlKHRvdGFsID0gYm95cyArIGdpcmxzKQ0KDQpwcmVzZW50IDwtIHByZXNlbnQgJT4lDQogIG11dGF0ZShib3lfcmF0aW8gPSBib3lzIC8gdG90YWwpDQoNCiMgUGxvdCBwcm9wb3J0aW9uIG9mIGJveXMgYm9ybiBvdmVyIHRpbWUgZm9yIHByZXNlbnQgZGF0YSBzZXQNCg0KZ2dwbG90KGRhdGEgPSBwcmVzZW50LCBhZXMoeCA9IHllYXIsIHkgPSBib3lfcmF0aW8pKSArIGdlb21fbGluZSgpDQoNCmBgYA0KDQoNClRoZSBwcm9wb3J0aW9uIG9mIGJvcm4gb3ZlciB0aW1lIGRlY2xpbmVzIGZyb20gMTk0MCB0byBsYXRlIDYwJ3MgdGhlbiBpbmNyZWFzZXMNCmEgYml0IGJldHdlZW4gbGF0ZSA2MCdzIHRvIG1pZCA3MCdzLCB0aGVuIHN0YXJ0IGRlY3JlYXNpbmcgYWdhaW4uDQpPdmVyYWxsLCB3ZSBjYW4gc2F5IHRoYXQgdGhlIHByb3BvcnRpb24gb2YgYm95cyBvdmVyIHRpbWUgZGVjcmVhc2VzLg0KDQoNCioqKkJveXMgdG8gZ2lybHMgcmF0aW8gaW4gVVMqKioNCg0KYGBge3IgbW9yZV9ib3lzfQ0KDQpwcmVzZW50IDwtIHByZXNlbnQgJT4lDQogIG11dGF0ZShtb3JlX2JveXMgPSBib3lzID4gZ2lybHMpDQoNCnByZXNlbnQkbW9yZV9ib3lzDQpgYGANCg0KDQpBcyB3ZSBjYW4gc2VlLCB0aGF0IGlzIHRydWUuIEFyYnV0aG5vdCdzIG9i
c2VydmF0aW9uIGFib3V0IGJveXMgYmVpbmcgYm9ybg0KaW4gZ3JlYXRlciBwcm9wb3J0aW9uIHRoYW4gZ2lybHMgaG9sZHMgdXAgaW4gdGhlIFUuUyBhcyB3ZWxsLg0KDQoNCmBgYHtyIHByb3AgYm95cy1naXJscyBwcmVzZW50fQ0KDQpwcmVzZW50IDwtIHByZXNlbnQgJT4lDQogIG11dGF0ZShib3lfdG9fZ2lybF9yYXRpbyA9IGJveXMgLyBnaXJscykNCg0KZ2dwbG90KGRhdGEgPSBwcmVzZW50LCBhZXMoeCA9IHllYXIsIHkgPSBib3lfdG9fZ2lybF9yYXRpbykpICsgZ2VvbV9saW5lKCkNCg0KDQpgYGANCg0KDQoNCiMjIyBFeGVyY2lzZSA3DQoNCk1vc3QgbnVtYmVyIG9mIGJpcnRoIGluIFUuUw0KDQpgYGB7ciBmaW5kLW1heC10b3RhbH0NCnByZXNlbnQgJT4lDQogIGFycmFuZ2UoZGVzYyh0b3RhbCkpDQoNCmBgYA0KDQoNCkFzIHdlIGNhbiBzZWUgZnJvbSB0aGUgYWJvdmUsIHRoZSBtb3N0IHRvdGFsIG51bWJlciBvZiBiaXJ0aHMgaW4gVS5TIHdhcyBpbiAxOTYxLg==