library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
births <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_1994-2003_CDC_NCHS.csv", header=TRUE)
#Remove the day of the week column
sort_births <- subset(births, select = -day_of_week)
#Spread the data to create separate columns for each day of the month
sort_births <- spread(sort_births,date_of_month,births)
head(sort_births)
## year month 1 2 3 4 5 6 7 8 9 10
## 1 1994 1 8096 7772 10142 11248 11053 11406 11251 8653 7910 10498
## 2 1994 2 11755 11483 11523 11677 8991 8309 10984 12152 11515 11623
## 3 1994 3 12127 11735 11984 12066 9215 8389 10996 12275 11780 11792
## 4 1994 4 10630 8782 7530 10909 11876 11811 11718 11532 8791 8183
## 5 1994 5 8145 11169 12023 11754 11958 11904 8641 8203 10914 11771
## 6 1994 6 12349 12166 11799 9182 8289 11130 12145 11784 11648 12006
## 11 12 13 14 15 16 17 18 19 20 21 22
## 1 11706 11567 11212 11570 8660 8123 10567 11541 11257 11682 11811 8833
## 2 11517 8945 8171 11551 12164 12009 11674 11887 8946 8402 10617 11810
## 3 11939 9087 8248 11092 12298 11865 11976 11799 8944 8243 11140 11964
## 4 11060 12146 11428 11709 11753 8790 7867 11094 11966 11585 11509 11553
## 5 11278 11822 11085 8830 8253 11103 12289 11668 11411 11645 8830 8449
## 6 8618 8171 10692 12074 11954 11852 11744 8907 8302 11337 12182 12213
## 23 24 25 26 27 28 29 30 31
## 1 8310 11125 11981 11514 11702 11666 8988 8096 10765
## 2 11776 11667 11905 8988 8195 11091 NA NA NA
## 3 11637 11904 11568 8957 8189 11051 12154 11540 11782
## 4 8613 8089 10909 12236 11701 11527 11474 8621 NA
## 5 11434 12562 12005 11979 12132 8840 8205 8468 11525
## 6 11939 11979 9047 8306 11309 12211 12245 12157 NA
#Generate a new column that contains the sum of births in that month
sort_births <- sort_births %>%
replace(is.na(.), 0) %>%
mutate(sum = rowSums(.[3:33]))
#Pull just the sum of births for each month and then spread the data so each row is the data for a full year with each month represented in a column
final_births <-
subset(sort_births, select = c(year, month, sum)) %>%
spread(month,sum) %>%
mutate(Total= rowSums(.[2:13]))
#Rename column name month values to strings
names(final_births) <-
c("Year","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec","Total")
#The year with the most births in this set is 2003 as shown below:
final_births[which.max(final_births$Total[1:10]),]
## Year Jan Feb Mar Apr May Jun Jul Aug Sep
## 10 2003 329803 307248 336920 330106 346754 337425 364226 360103 359644
## Oct Nov Dec Total
## 10 354048 320094 343579 4089950
#The average number of births a month between 1994 and 2003 were:
final_births %>%
summarise(mean = mean(Total))
## mean
## 1 3972214
```