source("more/arbuthnot.R")
source("more/present.R")
library(ggplot2)
#arbuthnot
#dim(arbuthnot)
#names(arbuthnot)
#arbuthnot$boys
# What command would you use to extract just the counts of girls baptized?
# `$` pulls the `girls` column out of the data frame as a vector with one
# count per year.
arbuthnot$girls
# Total number of girls baptised over all years in the data:
sum(arbuthnot$girls)
## [1] 453841
# The trend shows that the number of boys baptised is consistently higher than
# the number of girls in every year.
#
# Both series are drawn as lines over `year`; the constant strings inside
# aes() create the legend entries "Boys" and "Girls".
trend <- ggplot(arbuthnot, aes(x = year)) +
  geom_line(aes(y = girls, colour = "Girls")) +
  geom_line(aes(y = boys, colour = "Boys")) +
  # Colours are named so each series keeps its colour even if a legend label
  # is renamed or the label sort order changes (unnamed values are assigned
  # positionally).
  scale_colour_manual(values = c(Boys = "blue", Girls = "red")) +
  labs(y = "Count", x = "Year", colour = "Legend") +
  ggtitle("Baptism: Boys vs Girls Trend") +
  # NOTE: a numeric legend.position is deprecated as of ggplot2 3.5.0 in
  # favour of legend.position = "inside" plus legend.position.inside; the
  # numeric form is kept here so the script still runs on older ggplot2.
  theme(
    legend.position = c(0.1, 0.8),
    plot.title = element_text(hjust = 0.5)
  )
trend

# Proportions
# Share of boys among all baptisms, one value per year.
Prop_Boys <- arbuthnot$boys / (arbuthnot$boys + arbuthnot$girls)
summary(Prop_Boys)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5027 0.5118 0.5157 0.5170 0.5210 0.5362
# The y aesthetic is computed from the columns of `arbuthnot` itself rather
# than from the free-standing Prop_Boys vector, so the plot stays aligned with
# `year` even if the data frame is later filtered or reordered.
# No colour aesthetic is mapped, so no colour label is set in labs().
# NOTE: `size` for lines is deprecated since ggplot2 3.4.0 in favour of
# `linewidth`; kept so the script still runs on older ggplot2.
ggplot(data = arbuthnot, aes(x = year, y = boys / (boys + girls))) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Proportion of Boys") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Proportion=Boys/(Boys+Girls)", x = "Year")

# The proportion of boys stays between about 0.50 and 0.54 in every year, which is steady and suggests the data are reliable
#present
# 1. What years are included in this data set? What are the dimensions of the
#    data frame and what are the variable or column names?

# Years covered by the `present` data.
present[["year"]]
## [1] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
## [15] 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## [29] 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
## [43] 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
## [57] 1996 1997 1998 1999 2000 2001 2002

# Number of rows and columns in `present`.
dim(present)
## [1] 63 3

# Column names of `present`.
colnames(present)
## [1] "year" "boys" "girls"

# The same look at the Arbuthnot data, for comparison.
arbuthnot[["year"]]
## [1] 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
## [15] 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
## [29] 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670
## [43] 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
## [57] 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698
## [71] 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710
dim(arbuthnot)
## [1] 82 3
# 2. How do these counts compare to Arbuthnot's? Are they on a similar scale?

# Structure of each data frame: column types and the first few values.
utils::str(arbuthnot)
## 'data.frame': 82 obs. of 3 variables:
## $ year : int 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 ...
## $ boys : int 5218 4858 4422 4994 5158 5035 5106 4917 4703 5359 ...
## $ girls: int 4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 ...
utils::str(present)
## 'data.frame': 63 obs. of 3 variables:
## $ year : num 1940 1941 1942 1943 1944 ...
## $ boys : num 1211684 1289734 1444365 1508959 1435301 ...
## $ girls: num 1148715 1223693 1364631 1427901 1359499 ...

# Per-column summaries (min, quartiles, mean, max) to compare magnitudes.
summary(object = arbuthnot)
## year boys girls
## Min. :1629 Min. :2890 Min. :2722
## 1st Qu.:1649 1st Qu.:4759 1st Qu.:4457
## Median :1670 Median :6073 Median :5718
## Mean :1670 Mean :5907 Mean :5535
## 3rd Qu.:1690 3rd Qu.:7576 3rd Qu.:7150
## Max. :1710 Max. :8426 Max. :7779
summary(object = present)
## year boys girls
## Min. :1940 Min. :1211684 Min. :1148715
## 1st Qu.:1956 1st Qu.:1799857 1st Qu.:1711405
## Median :1971 Median :1924868 Median :1831679
## Mean :1971 Mean :1885600 Mean :1793915
## 3rd Qu.:1986 3rd Qu.:2058524 3rd Qu.:1965538
## Max. :2002 Max. :2186274 Max. :2082052
# The arbuthnot data covers 19 more years than the present data (82 vs. 63 rows).
# The summaries also show that the two are not on a similar scale: the present
# data has a far larger number of births (millions per year, versus thousands
# of baptisms per year in the arbuthnot data).
# 3. Make a plot that displays the boy-to-girl ratio for every year in the data
#    set. What do you see? Does Arbuthnot's observation about boys being born
#    in greater proportion than girls hold up in the U.S.? Include the plot in
#    your response.
# NOTE: despite its name, boy2girl_ratio holds boys / (boys + girls), i.e. the
# share of boys among all births, not boys / girls. The name and formula are
# kept so the recorded output below and any code referring to this variable
# stay valid.
boy2girl_ratio <- present$boys / (present$boys + present$girls)
summary(boy2girl_ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5112 0.5121 0.5125 0.5125 0.5130 0.5143
# The y aesthetic is computed from the columns of `present` itself rather than
# from the free-standing boy2girl_ratio vector, so the plot stays aligned with
# `year` even if the data frame is later filtered or reordered.
# No colour aesthetic is mapped, so no colour label is set in labs().
# NOTE: `size` for lines is deprecated since ggplot2 3.4.0 in favour of
# `linewidth`; kept so the script still runs on older ggplot2.
ggplot(data = present, aes(x = year, y = boys / (boys + girls))) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Boy-to-Girl Ratio") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Boy-to-Girl Ratio=Boys/(Boys+Girls)", x = "Year")

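# In the present data the ratio is between 0.511 and 0.514, which is steadier
# than in the Arbuthnot data (0.503 to 0.536). It is above 0.5 in every year,
# so Arbuthnot's observation that boys are born in greater proportion than
# girls also holds for the U.S.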
#4.In what year did we see the most total number of births in the U.S.?
# Add a per-year total of births as a new column of `present`.
present$total <- with(present, boys + girls)
# Sort the years from most to fewest total births; the first row is then the
# year with the most births.
year <- present[order(present$total, decreasing = TRUE), ]
year[1, ]
# 1961 is the year with the highest total number of births in the U.S.