# USbirths2000-2014_SSA.csv` contains U.S. births data for the years 2000 to 2014, as provided by the Social Security Administration.
# 
# Header                    Definition
# ------                    ----------
# 
# year                      Year
# month                     Month
# date_of_month             Day number of the month
# day_of_week   Day of week   where 1 is Monday and 7 is Sunday
# births                      Number of births

Link Kaggle

library(tidyverse)

theme_set(theme_bw())
births <- read_csv("US_births_2000-2014_SSA.csv")

sample_n(births, 5)
# A tibble: 5 x 5
   year month date_of_month day_of_week births
  <dbl> <dbl>         <dbl>       <dbl>  <dbl>
1  2002     3            23           6   8569
2  2008     3            24           1  12352
3  2001     6            11           1  11776
4  2009     5            29           5  13290
5  2004     6            25           5  13074
births <- births %>% 
  mutate(year = as_factor(year)) %>% 
  mutate(month = as_factor(month)) %>% 
  mutate(date_of_month = as_factor(date_of_month)) %>% 
  mutate(day_of_week = as_factor(day_of_week))  

levels(births$month) <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
levels(births$day_of_week) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")

births
# A tibble: 5,479 x 5
   year  month   date_of_month day_of_week births
   <fct> <fct>   <fct>         <fct>        <dbl>
 1 2000  January 1             Saturday      9083
 2 2000  January 2             Sunday        8006
 3 2000  January 3             Monday       11363
 4 2000  January 4             Tuesday      13032
 5 2000  January 5             Wednesday    12558
 6 2000  January 6             Thursday     12466
 7 2000  January 7             Friday       12516
 8 2000  January 8             Saturday      8934
 9 2000  January 9             Sunday        7949
10 2000  January 10            Monday       11668
# ... with 5,469 more rows
ggplot(births, aes(births)) + 
  geom_histogram(bins = sqrt(nrow(births)), color = "midnightblue", fill = "white") +
  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days")

ggplot(births, aes(births, color = year)) + 
  geom_density() +
  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days")

summary(births$births)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   5728    8740   12343   11350   13082   16081 

It seems to be two means in our dataset (bi-modal)

ggplot(births, aes(y=births, x = year, color = year)) + 
  geom_boxplot() +
  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days") + 
  theme(legend.position = "none")

It seems to be an insignificant increase of births in 2006 and 2007

ggplot(births, aes(births, color = year)) + 
  geom_density() +
  facet_wrap(~month)

  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days")
$x
[1] "Number of births"

$y
[1] "Count of days"

$title
[1] "Distribution of births in US"

$subtitle
[1] "2000 - 2014"

attr(,"class")
[1] "labels"

After splitting the dataset by year and month I see the same bi-modal behavior

ggplot(births, aes(births, color = year)) + 
  geom_density() +
  facet_wrap(~day_of_week)

  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days")
$x
[1] "Number of births"

$y
[1] "Count of days"

$title
[1] "Distribution of births in US"

$subtitle
[1] "2000 - 2014"

attr(,"class")
[1] "labels"

After splitting the dataset by day_of_week it looks like on Saturdays and Sundays are less children born

ggplot(births, aes(births, color = year)) + 
  geom_density() +
  facet_grid(day_of_week~month)

  labs(title = "Distribution of births in US",
       subtitle = "2000 - 2014",
       x = "Number of births",
       y = "Count of days")
$x
[1] "Number of births"

$y
[1] "Count of days"

$title
[1] "Distribution of births in US"

$subtitle
[1] "2000 - 2014"

attr(,"class")
[1] "labels"

I see the same behavior if I split the dataset by year, day_of_week and month. There are some peaks around 12500 on Fridays

I want see if this difference between weekends and weekdays is significant. Even less children born on Sundays than on Saturdays.
There are almost 50% of more births during weekdays than during weekends with a significant p-value. I thought I could find a difference among weekdays (most on Thursdays and Fridays) but it was not the case.

births <- births %>% 
  mutate(weekend = if_else(day_of_week == "Sunday" | day_of_week == "Saturday", "Yes", "No"))


births %>% 
  group_by(weekend) %>% 
  summarise(mean = mean(births))
# A tibble: 2 x 2
  weekend   mean
  <chr>    <dbl>
1 No      12675.
2 Yes      8040.
births %>% 
  group_by(day_of_week) %>% 
  summarise(mean = mean(births)) 
# A tibble: 7 x 2
  day_of_week   mean
  <fct>        <dbl>
1 Monday      11898.
2 Tuesday     13122.
3 Wednesday   12911.
4 Thursday    12846.
5 Friday      12596.
6 Saturday     8563.
7 Sunday       7518.
library(infer)

t_test(x = births, 
       formula = births ~ weekend, 
       order = c("Yes", "No"),
       alternative = "two-sided")
# A tibble: 1 x 6
  statistic  t_df p_value alternative lower_ci upper_ci
      <dbl> <dbl>   <dbl> <chr>          <dbl>    <dbl>
1     -186. 4553.       0 two.sided     -4683.   -4585.

Conclusion

Weekends born less children than in weekdays (p-value is significant). Even lees children born on Sundays. Potentially, the physicians want to assist births in weekday than weekend. No significant difference in the days prior to weekend (Thursday and Fridays).