DATA607_Project2

As posted by Uliana Plotnikova, I used the data for births in the US. The goal was to sort the data by years and months and identify trends.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
births <- read.csv("https://raw.githubusercontent.com/fivethirtyeight/data/master/births/US_births_1994-2003_CDC_NCHS.csv", header=TRUE)

#Remove the day of the week column
sort_births <- subset(births, select = -day_of_week)
#Spread the data to create separate columns for each day of the month
sort_births <- spread(sort_births,date_of_month,births)
head(sort_births)

##   year month     1     2     3     4     5     6     7     8     9    10
## 1 1994     1  8096  7772 10142 11248 11053 11406 11251  8653  7910 10498
## 2 1994     2 11755 11483 11523 11677  8991  8309 10984 12152 11515 11623
## 3 1994     3 12127 11735 11984 12066  9215  8389 10996 12275 11780 11792
## 4 1994     4 10630  8782  7530 10909 11876 11811 11718 11532  8791  8183
## 5 1994     5  8145 11169 12023 11754 11958 11904  8641  8203 10914 11771
## 6 1994     6 12349 12166 11799  9182  8289 11130 12145 11784 11648 12006
##      11    12    13    14    15    16    17    18    19    20    21    22
## 1 11706 11567 11212 11570  8660  8123 10567 11541 11257 11682 11811  8833
## 2 11517  8945  8171 11551 12164 12009 11674 11887  8946  8402 10617 11810
## 3 11939  9087  8248 11092 12298 11865 11976 11799  8944  8243 11140 11964
## 4 11060 12146 11428 11709 11753  8790  7867 11094 11966 11585 11509 11553
## 5 11278 11822 11085  8830  8253 11103 12289 11668 11411 11645  8830  8449
## 6  8618  8171 10692 12074 11954 11852 11744  8907  8302 11337 12182 12213
##      23    24    25    26    27    28    29    30    31
## 1  8310 11125 11981 11514 11702 11666  8988  8096 10765
## 2 11776 11667 11905  8988  8195 11091    NA    NA    NA
## 3 11637 11904 11568  8957  8189 11051 12154 11540 11782
## 4  8613  8089 10909 12236 11701 11527 11474  8621    NA
## 5 11434 12562 12005 11979 12132  8840  8205  8468 11525
## 6 11939 11979  9047  8306 11309 12211 12245 12157    NA

#Generate a new column that contains the sum of births in that month
sort_births <- sort_births %>%
  replace(is.na(.), 0) %>%
  mutate(sum = rowSums(.[3:33]))

#Pull just the sum of births for each month and then spread the data so each row is the data for a full year with each month represented in a column
final_births <- 
  subset(sort_births, select = c(year, month, sum)) %>%
  spread(month,sum) %>%
  mutate(Total= rowSums(.[2:13]))

#Rename column name month values to strings
names(final_births) <-
  c("Year","Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec","Total") 


#The year with the most births in this set is 2003 as shown below:
final_births[which.max(final_births$Total[1:10]),]

##    Year    Jan    Feb    Mar    Apr    May    Jun    Jul    Aug    Sep
## 10 2003 329803 307248 336920 330106 346754 337425 364226 360103 359644
##       Oct    Nov    Dec   Total
## 10 354048 320094 343579 4089950

#The average number of births a month between 1994 and 2003 were:
final_births %>% 
  summarise(mean = mean(Total))

##      mean
## 1 3972214

```

DATA607_Project2

Misha Kollontai

9/30/2019

As posted by Uliana Plotnikova, I used the data for births in the US. The goal was to sort the data by years and months and identify trends.