library ("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library ("tidyr")
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(ggplot2)
library(gtable)
US_births_2000-2014_SSA.csv contains U.S. births data for the years 2000 to 2014, as provided by the Social Security Administration.
df<-data.frame(dataset)
head(df,10)
## year month date_of_month day_of_week births
## 1 2000 1 1 6 9083
## 2 2000 1 2 7 8006
## 3 2000 1 3 1 11363
## 4 2000 1 4 2 13032
## 5 2000 1 5 3 12558
## 6 2000 1 6 4 12466
## 7 2000 1 7 5 12516
## 8 2000 1 8 6 8934
## 9 2000 1 9 7 7949
## 10 2000 1 10 1 11668
updated<-df%>%mutate(day_of_week=recode(day_of_week,
"7"="Sunday",
"1"="Monday",
"2"="Tuesday",
"3"="Wednesday",
"4"="Thursday",
"5"="Friday",
"6"="Saturday"),
month=recode(month,
"1"="January",
"2"="February",
"3"="March",
"4"="April",
"5"="May",
"6"="June",
"7"="July",
"8"= "August",
"9"="September",
"10"="October",
"11"="November",
"12"="December"))
head(updated,3)
## year month date_of_month day_of_week births
## 1 2000 January 1 Saturday 9083
## 2 2000 January 2 Sunday 8006
## 3 2000 January 3 Monday 11363
##Find Avg, Min, Max, Total and Proportion for each year
yearlybirths<- updated %>%
group_by(year) %>%
summarise(avg_births = mean(births),
min_births = min(births),
max_births = max(births),
total_births= sum(births))%>%
mutate(total_birth_prop = prop.table(total_births))%>%
arrange(desc(year))
head(yearlybirths,5)
## # A tibble: 5 x 6
## year avg_births min_births max_births total_births total_birth_prop
## <int> <dbl> <int> <int> <int> <dbl>
## 1 2014 10988. 6749 13863 4010532 0.0645
## 2 2013 10886. 6609 13680 3973337 0.0639
## 3 2012 10931. 6325 14667 4000868 0.0643
## 4 2011 10978. 5728 14392 4006908 0.0644
## 5 2010 11112. 6159 14255 4055975 0.0652
total<-yearlybirths %>%
summarise(avg_births_2000_2014 = mean(avg_births),
min_births_2000_2014 = min(min_births),
max_births_2000_2014 = max(max_births),
total_births_2000_2014= sum(total_births))
total
## # A tibble: 1 x 4
## avg_births_2000_2~ min_births_2000_~ max_births_2000_~ total_births_2000~
## <dbl> <int> <int> <int>
## 1 11350. 5728 16081 62187024
ggplot(yearlybirths) +
geom_ribbon(aes(year,ymin=min_births,ymax=max_births),color="yellow",alpha=0.5) +
geom_line(aes(year,avg_births))+ labs( x="Year", y="Birth Average")+ggtitle("Birth Average")
g2<-ggplot(data=yearlybirths, aes(x=year, y=total_birth_prop, group=10)) +
geom_line(arrow = arrow())+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
labs( x="Year", y="Birth Proportion")+ggtitle("The Busiest Birth Years")
g2
As we can see, the peak of deliveries is between 2006-2008 years.
##Find the busiest birth months
monthlybirths<- updated %>%
group_by(month) %>%
summarise(total_births= sum(births))%>%
mutate(total_birth_prop = prop.table(total_births))%>%
arrange(match(month,month.name))
g3<-ggplot(data=monthlybirths, aes(x=month, y=total_birth_prop, group=10)) +
geom_line(arrow = arrow())+
geom_point()+
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
labs( x="Month", y="Birth Proportion")+ggtitle("The busiest Birth Months")
g3
August is the overall most popular month for birthdays, which makes sense, considering a late August birthday means December conception.Cold weather, snow, romantic fires and holiday parties seems to create just the right equation for the beginning of human mating season.February is the least common birth month. That’s also logical, seeing as nine months prior is May which marks longer, sunnier days, warmer temperatures and usually more outside activity.
weeklybirths<- updated %>%
group_by(year,day_of_week) %>%
summarise(total_births= sum(births))%>%
mutate(total_birth_prop = prop.table(total_births))
p <- ggplot(data = weeklybirths, mapping = aes(x = year, y = total_birth_prop))
p + geom_line(color="gray70", aes(group = day_of_week)) +
geom_smooth(size = 2, method = "loess", se = FALSE) +
facet_wrap(~ day_of_week, ncol = 7) +
labs(x = "Year",
y = "Births",
title = "Weekly birth")
As we can see, the least comon days are Saturday and Sunday. Many births are scheduled, either as induced deliveries or cesarean section, that explains why the least comon delivery days are Saturday and Sunday.
Some People Are Too Superstitious To Have A Baby On Friday The 13th. In order to investigate if there are less babies were born on Friday 13, I filtered the data by date of the week - Friday, summarise the births by day of the months and found the proportion of the total for each date of the month.
friday <- updated %>%
filter(day_of_week %like% "Friday")%>%
group_by(date_of_month)%>%
arrange(date_of_month)%>%
group_by(date_of_month) %>%
summarise(total_births= sum(births))%>%
mutate(total_birth_prop = prop.table(total_births))%>%
arrange(total_births)
head(friday,5)
## # A tibble: 5 x 3
## date_of_month total_births total_birth_prop
## <int> <int> <dbl>
## 1 31 179662 0.0182
## 2 29 286554 0.0291
## 3 24 289342 0.0294
## 4 13 298749 0.0303
## 5 26 302931 0.0308
Reference: https://www.unitypoint.org/livewell/article.aspx?id=ad80d6cc-6a53-49f3-b496-8c0f48b47936