source("more/arbuthnot.R")
source("more/present.R")
library(ggplot2)
#arbuthnot
#dim(arbuthnot)
#names(arbuthnot)
#arbuthnot$boys

# What command would you use to extract just the counts of girls baptized?
# `$` pulls the girls column out of the data frame as a vector, which is what
# the question asks for.
arbuthnot$girls
# Sum of that column over every row of arbuthnot.
sum(arbuthnot$girls)
## [1] 453841
# Line chart of yearly baptism counts, one line for girls and one for boys.
# Author's reading of the plot: the number of boys baptised is steady and is
# higher than the number of girls in every year.
trend <- ggplot(arbuthnot, aes(x = year)) +
  geom_line(aes(y = girls, colour = "Girls")) +
  geom_line(aes(y = boys, colour = "Boys")) +
  # The colour values are unnamed, so they are matched to the legend entries by
  # position; presumably "Boys" -> blue and "Girls" -> red (alphabetical order).
  scale_colour_manual(values = c("blue", "red")) +
  labs(y = "Count", x = "Year", colour = "Legend") +
  # Place the legend inside the panel, near the top-left corner.
  theme(legend.position = c(0.1, 0.8)) +
  ggtitle("Baptism: Boys vs Girls Trend") +
  # Centre the title horizontally.
  theme(plot.title = element_text(hjust = 0.5))
trend

# Proportions

# Share of boys among all baptisms, one value per row (year) of arbuthnot.
Prop_Boys <- arbuthnot$boys / (arbuthnot$boys + arbuthnot$girls)
summary(Prop_Boys)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5027  0.5118  0.5157  0.5170  0.5210  0.5362
# Prop_Boys is not a column of arbuthnot; aes() finds it in the surrounding
# environment, so it must keep the same length and row order as arbuthnot.
# No colour aesthetic is mapped here (the line colour is a fixed constant), so
# no legend title is set.
ggplot(data = arbuthnot, aes(x = year, y = Prop_Boys)) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Proportion of Boys") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Proportion=Boys/(Boys+Girls)", x = "Year")

# The proportion of boys stays between roughly 0.50 and 0.54 and never drops
# below 0.5 (see the summary above), i.e. it is steady across the years.
# present
# 1. What years are included in this data set? What are the dimensions of the
#    data frame and what are the variable or column names?
# Years covered by present.
present[["year"]]
##  [1] 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953
## [15] 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967
## [29] 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981
## [43] 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995
## [57] 1996 1997 1998 1999 2000 2001 2002
# Rows x columns of present.
dim(present)
## [1] 63  3
# Column names of present.
colnames(present)
## [1] "year"  "boys"  "girls"
# The same year listing and dimensions for arbuthnot, for comparison.
arbuthnot[["year"]]
##  [1] 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
## [15] 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656
## [29] 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670
## [43] 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
## [57] 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698
## [71] 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710
dim(arbuthnot)
## [1] 82  3
# 2. How do these counts compare to Arbuthnot's? Are they on a similar scale?
# Structure of each data frame: row count, column names and column types.
str(arbuthnot)
## 'data.frame':    82 obs. of  3 variables:
##  $ year : int  1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 ...
##  $ boys : int  5218 4858 4422 4994 5158 5035 5106 4917 4703 5359 ...
##  $ girls: int  4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 ...
str(present)
## 'data.frame':    63 obs. of  3 variables:
##  $ year : num  1940 1941 1942 1943 1944 ...
##  $ boys : num  1211684 1289734 1444365 1508959 1435301 ...
##  $ girls: num  1148715 1223693 1364631 1427901 1359499 ...
# Per-column five-number summary plus mean, used to compare the magnitudes of
# the counts in the two data sets.
summary(arbuthnot)
##       year           boys          girls     
##  Min.   :1629   Min.   :2890   Min.   :2722  
##  1st Qu.:1649   1st Qu.:4759   1st Qu.:4457  
##  Median :1670   Median :6073   Median :5718  
##  Mean   :1670   Mean   :5907   Mean   :5535  
##  3rd Qu.:1690   3rd Qu.:7576   3rd Qu.:7150  
##  Max.   :1710   Max.   :8426   Max.   :7779
summary(present)
##       year           boys             girls        
##  Min.   :1940   Min.   :1211684   Min.   :1148715  
##  1st Qu.:1956   1st Qu.:1799857   1st Qu.:1711405  
##  Median :1971   Median :1924868   Median :1831679  
##  Mean   :1971   Mean   :1885600   Mean   :1793915  
##  3rd Qu.:1986   3rd Qu.:2058524   3rd Qu.:1965538  
##  Max.   :2002   Max.   :2186274   Max.   :2082052
# arbuthnot covers 19 more years than present (82 rows vs 63). The summaries
# show the two data sets are not on a similar scale: arbuthnot's yearly counts
# are in the thousands, while present's are in the millions.


# 3. Make a plot that displays the boy-to-girl ratio for every year in the data
#    set. What do you see? Does Arbuthnot's observation about boys being born
#    in greater proportion than girls hold up in the U.S.? Include the plot in
#    your response.
# NOTE(review): despite its name, boy2girl_ratio holds the share of boys among
# all births, boys / (boys + girls), not boys / girls. The name and values are
# left as they are so the pasted summary below stays valid; confirm which
# measure the answer is meant to show.
boy2girl_ratio <- present$boys / (present$boys + present$girls)
summary(boy2girl_ratio)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5112  0.5121  0.5125  0.5125  0.5130  0.5143
# boy2girl_ratio is not a column of present; aes() finds it in the surrounding
# environment, so it must keep the same length and row order as present.
# No colour aesthetic is mapped here (the line colour is a fixed constant), so
# no legend title is set.
ggplot(data = present, aes(x = year, y = boy2girl_ratio)) +
  geom_line(color = "blue", size = 1) +
  ggtitle("Boy-to-Girl Ratio") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(y = "Boy-to-Girl Ratio=Boys/(Boys+Girls)", x = "Year")

# In present the value stays between 0.511 and 0.514, a narrower band than the
# 0.503 to 0.536 seen in arbuthnot, and it is above 0.5 in every year.

# 4. In what year did we see the most total number of births in the U.S.?
# Add a per-year total column to present.
present$total <- with(present, boys + girls)
# `year` is the whole present data frame sorted by total, largest first
# (it is a data frame, not a single year value).
year <- present[order(-present$total), ]
# First row of the sorted frame = the year with the highest total.
year[1, ]
# Author's reported result: 1961 has the highest total number of births in
# the U.S. data.