# USbirths2000-2014_SSA.csv` contains U.S. births data for the years 2000 to 2014, as provided by the Social Security Administration.
#
# Header Definition
# ------ ----------
#
# year Year
# month Month
# date_of_month Day number of the month
# day_of_week Day of week where 1 is Monday and 7 is Sunday
# births Number of births
library(tidyverse)
theme_set(theme_bw())
births <- read_csv("US_births_2000-2014_SSA.csv")
sample_n(births, 5)
# A tibble: 5 x 5
year month date_of_month day_of_week births
<dbl> <dbl> <dbl> <dbl> <dbl>
1 2002 3 23 6 8569
2 2008 3 24 1 12352
3 2001 6 11 1 11776
4 2009 5 29 5 13290
5 2004 6 25 5 13074
births <- births %>%
mutate(year = as_factor(year)) %>%
mutate(month = as_factor(month)) %>%
mutate(date_of_month = as_factor(date_of_month)) %>%
mutate(day_of_week = as_factor(day_of_week))
levels(births$month) <- c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December")
levels(births$day_of_week) <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday")
births
# A tibble: 5,479 x 5
year month date_of_month day_of_week births
<fct> <fct> <fct> <fct> <dbl>
1 2000 January 1 Saturday 9083
2 2000 January 2 Sunday 8006
3 2000 January 3 Monday 11363
4 2000 January 4 Tuesday 13032
5 2000 January 5 Wednesday 12558
6 2000 January 6 Thursday 12466
7 2000 January 7 Friday 12516
8 2000 January 8 Saturday 8934
9 2000 January 9 Sunday 7949
10 2000 January 10 Monday 11668
# ... with 5,469 more rows
ggplot(births, aes(births)) +
geom_histogram(bins = sqrt(nrow(births)), color = "midnightblue", fill = "white") +
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days")
ggplot(births, aes(births, color = year)) +
geom_density() +
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days")
summary(births$births)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5728 8740 12343 11350 13082 16081
It seems to be two means in our dataset (bi-modal)
ggplot(births, aes(y=births, x = year, color = year)) +
geom_boxplot() +
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days") +
theme(legend.position = "none")
It seems to be an insignificant increase of births in 2006 and 2007
ggplot(births, aes(births, color = year)) +
geom_density() +
facet_wrap(~month)
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days")
$x
[1] "Number of births"
$y
[1] "Count of days"
$title
[1] "Distribution of births in US"
$subtitle
[1] "2000 - 2014"
attr(,"class")
[1] "labels"
After splitting the dataset by year and month I see the same bi-modal behavior
ggplot(births, aes(births, color = year)) +
geom_density() +
facet_wrap(~day_of_week)
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days")
$x
[1] "Number of births"
$y
[1] "Count of days"
$title
[1] "Distribution of births in US"
$subtitle
[1] "2000 - 2014"
attr(,"class")
[1] "labels"
After splitting the dataset by day_of_week it looks like on Saturdays and Sundays are less children born
ggplot(births, aes(births, color = year)) +
geom_density() +
facet_grid(day_of_week~month)
labs(title = "Distribution of births in US",
subtitle = "2000 - 2014",
x = "Number of births",
y = "Count of days")
$x
[1] "Number of births"
$y
[1] "Count of days"
$title
[1] "Distribution of births in US"
$subtitle
[1] "2000 - 2014"
attr(,"class")
[1] "labels"
I see the same behavior if I split the dataset by year, day_of_week and month. There are some peaks around 12500 on Fridays
I want see if this difference between weekends and weekdays is significant. Even less children born on Sundays than on Saturdays.
There are almost 50% of more births during weekdays than during weekends with a significant p-value. I thought I could find a difference among weekdays (most on Thursdays and Fridays) but it was not the case.
births <- births %>%
mutate(weekend = if_else(day_of_week == "Sunday" | day_of_week == "Saturday", "Yes", "No"))
births %>%
group_by(weekend) %>%
summarise(mean = mean(births))
# A tibble: 2 x 2
weekend mean
<chr> <dbl>
1 No 12675.
2 Yes 8040.
births %>%
group_by(day_of_week) %>%
summarise(mean = mean(births))
# A tibble: 7 x 2
day_of_week mean
<fct> <dbl>
1 Monday 11898.
2 Tuesday 13122.
3 Wednesday 12911.
4 Thursday 12846.
5 Friday 12596.
6 Saturday 8563.
7 Sunday 7518.
library(infer)
t_test(x = births,
formula = births ~ weekend,
order = c("Yes", "No"),
alternative = "two-sided")
# A tibble: 1 x 6
statistic t_df p_value alternative lower_ci upper_ci
<dbl> <dbl> <dbl> <chr> <dbl> <dbl>
1 -186. 4553. 0 two.sided -4683. -4585.
Weekends born less children than in weekdays (p-value is significant). Even lees children born on Sundays. Potentially, the physicians want to assist births in weekday than weekend. No significant difference in the days prior to weekend (Thursday and Fridays).