Extract and prepare data

library ("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library ("tidyr")
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(ggplot2)
library(gtable)

US_births_2000-2014_SSA.csv contains U.S. births data for the years 2000 to 2014, as provided by the Social Security Administration.

df<-data.frame(dataset)
head(df,10)
##    year month date_of_month day_of_week births
## 1  2000     1             1           6   9083
## 2  2000     1             2           7   8006
## 3  2000     1             3           1  11363
## 4  2000     1             4           2  13032
## 5  2000     1             5           3  12558
## 6  2000     1             6           4  12466
## 7  2000     1             7           5  12516
## 8  2000     1             8           6   8934
## 9  2000     1             9           7   7949
## 10 2000     1            10           1  11668

Rename data

updated<-df%>%mutate(day_of_week=recode(day_of_week, 
       "7"="Sunday",
       "1"="Monday",
       "2"="Tuesday",
       "3"="Wednesday",
       "4"="Thursday",
       "5"="Friday",
       "6"="Saturday"),
month=recode(month, 
       "1"="January",
       "2"="February",
       "3"="March",
       "4"="April",
       "5"="May",
       "6"="June",
       "7"="July",
       "8"= "August",
       "9"="September",
       "10"="October",
       "11"="November",
       "12"="December"))
head(updated,3)
##   year   month date_of_month day_of_week births
## 1 2000 January             1    Saturday   9083
## 2 2000 January             2      Sunday   8006
## 3 2000 January             3      Monday  11363

Analyse total birth by year

##Find Avg, Min, Max, Total and Proportion for each year
yearlybirths<- updated %>%
    group_by(year) %>%
    summarise(avg_births = mean(births), 
              min_births = min(births),
              max_births = max(births),
              total_births= sum(births))%>%
         mutate(total_birth_prop = prop.table(total_births))%>%
  arrange(desc(year))
head(yearlybirths,5)
## # A tibble: 5 x 6
##    year avg_births min_births max_births total_births total_birth_prop
##   <int>      <dbl>      <int>      <int>        <int>            <dbl>
## 1  2014     10988.       6749      13863      4010532           0.0645
## 2  2013     10886.       6609      13680      3973337           0.0639
## 3  2012     10931.       6325      14667      4000868           0.0643
## 4  2011     10978.       5728      14392      4006908           0.0644
## 5  2010     11112.       6159      14255      4055975           0.0652

Avg, Min, Max, Total Births 2000-2014

total<-yearlybirths %>%
  summarise(avg_births_2000_2014 = mean(avg_births),
            min_births_2000_2014 = min(min_births),
              max_births_2000_2014 = max(max_births),
            total_births_2000_2014= sum(total_births))
total
## # A tibble: 1 x 4
##   avg_births_2000_2~ min_births_2000_~ max_births_2000_~ total_births_2000~
##                <dbl>             <int>             <int>              <int>
## 1             11350.              5728             16081           62187024
ggplot(yearlybirths) + 
  geom_ribbon(aes(year,ymin=min_births,ymax=max_births),color="yellow",alpha=0.5) +
  geom_line(aes(year,avg_births))+ labs( x="Year", y="Birth Average")+ggtitle("Birth Average")

The busiest Birth Years

g2<-ggplot(data=yearlybirths, aes(x=year, y=total_birth_prop, group=10)) +
  geom_line(arrow = arrow())+
  geom_point()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  labs( x="Year", y="Birth Proportion")+ggtitle("The Busiest Birth Years")
g2

As we can see, the peak of deliveries is between 2006-2008 years.

Analyse total birth by months

##Find the busiest birth months
monthlybirths<- updated %>%
    group_by(month) %>%
    summarise(total_births= sum(births))%>%
         mutate(total_birth_prop = prop.table(total_births))%>%
  arrange(match(month,month.name))

g3<-ggplot(data=monthlybirths, aes(x=month, y=total_birth_prop, group=10)) +
  geom_line(arrow = arrow())+
  geom_point()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  labs( x="Month", y="Birth Proportion")+ggtitle("The busiest Birth Months")
g3

August is the overall most popular month for birthdays, which makes sense, considering a late August birthday means December conception.Cold weather, snow, romantic fires and holiday parties seems to create just the right equation for the beginning of human mating season.February is the least common birth month. That’s also logical, seeing as nine months prior is May which marks longer, sunnier days, warmer temperatures and usually more outside activity.

The most common day of the week for babies to be born

weeklybirths<- updated %>%
    group_by(year,day_of_week) %>%
    summarise(total_births= sum(births))%>%
         mutate(total_birth_prop = prop.table(total_births))
 
p <- ggplot(data = weeklybirths, mapping = aes(x = year, y = total_birth_prop))
p + geom_line(color="gray70", aes(group = day_of_week)) +
    geom_smooth(size = 2, method = "loess", se = FALSE) +
 facet_wrap(~ day_of_week, ncol = 7) +
    labs(x = "Year",
         y = "Births",
         title = "Weekly birth")

As we can see, the least comon days are Saturday and Sunday. Many births are scheduled, either as induced deliveries or cesarean section, that explains why the least comon delivery days are Saturday and Sunday.

Investigate the Fridays Birth

Some People Are Too Superstitious To Have A Baby On Friday The 13th. In order to investigate if there are less babies were born on Friday 13, I filtered the data by date of the week - Friday, summarise the births by day of the months and found the proportion of the total for each date of the month.

friday <- updated %>% 
        filter(day_of_week %like% "Friday")%>%
  group_by(date_of_month)%>% 
arrange(date_of_month)%>%
    group_by(date_of_month) %>%
    summarise(total_births= sum(births))%>%
         mutate(total_birth_prop = prop.table(total_births))%>%
  arrange(total_births)
      
head(friday,5)
## # A tibble: 5 x 3
##   date_of_month total_births total_birth_prop
##           <int>        <int>            <dbl>
## 1            31       179662           0.0182
## 2            29       286554           0.0291
## 3            24       289342           0.0294
## 4            13       298749           0.0303
## 5            26       302931           0.0308

Reference: https://www.unitypoint.org/livewell/article.aspx?id=ad80d6cc-6a53-49f3-b496-8c0f48b47936