Question 1
library(DataComputing)
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lubridate
## Loading required package: tidyr
## Loading required package: stringr
## Loading required package: knitr
## Loading required package: mosaic
## Loading required package: lattice
## Loading required package: car
## Loading required package: mosaicData
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Loading required package: manipulate
## Loading required package: base64enc
## Loading required package: curl
# Question 1: total births for each calendar date.
# tally(births) adds up the `births` column within each date group and
# stores the result in a column named `n`.
daily_births <- Birthdays %>%
  group_by(date) %>%
  tally(births)

ggplot(data = daily_births, mapping = aes(x = date, y = n)) +
  geom_point() +
  labs(y = "total")
Question 2a
# Question 2a: total births for each week of the year.
# The grouping key is computed on the fly with week(date); `daily_births`
# is reused as the name of the summary table.
daily_births <- Birthdays %>%
  group_by(week = week(date)) %>%
  tally(births)

ggplot(data = daily_births, mapping = aes(x = week, y = n)) +
  geom_point() +
  labs(y = "total")
Question 2b
# Question 2b: total births for each month of the year.
# The grouping key is computed on the fly with month(date); `daily_births`
# is reused as the name of the summary table.
daily_births <- Birthdays %>%
  group_by(month = month(date)) %>%
  tally(births)

ggplot(data = daily_births, mapping = aes(x = month, y = n)) +
  geom_point() +
  labs(y = "total")
Question 2c
# Question 2c: total births for each day of the year (1-366).
# This chunk used to repeat the month-of-year summary from Question 2b;
# grouping by yday(date) gives the day-of-year view instead.
# NOTE(review): day of year is assumed to be the intended granularity for
# 2c -- confirm against the assignment text.
daily_births <- Birthdays %>%
  group_by(yday = yday(date)) %>%
  tally(births)

ggplot(daily_births, aes(x = yday, y = n)) +
  geom_point() +
  ylab("total")
Question 3
# Question 3: total births for each day of the week.
# wday(date) is called without label = TRUE, so the grouping key (and the
# x axis) is the numeric weekday index rather than a weekday name.
daily_births <- Birthdays %>%
  group_by(wday = wday(date)) %>%
  tally(births)

ggplot(data = daily_births, mapping = aes(x = wday, y = n)) +
  geom_point() +
  labs(y = "total")
Question 4
# Question 4: daily birth totals for 1983-1984, coloured by weekday.
# Grouping by both date and wday keeps the weekday column in the summary
# so it can be mapped to colour below.
two_year_span <- Birthdays %>%
  filter(year %in% c(1983, 1984)) %>%
  group_by(date, wday) %>%
  tally(births)

ggplot(data = two_year_span, mapping = aes(x = date, y = n, col = wday)) +
  geom_point() +
  labs(y = "total")
The data show that more births occur on weekdays than on weekends.
Question 5
# Question 5: download the US holidays table and parse its `date` column
# (day-month-year text) with lubridate::dmy().
Holidays <- "http://tiny.cc/dcf/US-Holidays.csv" %>%
  read.csv() %>%
  mutate(date = lubridate::dmy(date))
Question 6
# Question 6: mark the 1983-1984 holidays on the daily-births plot.

# Keep only the holidays that fall inside the plotted two-year span.
myHolidays <- Holidays %>%
  filter(year %in% c(1983, 1984))

# Every layer converts its date with as.Date() so that points, vertical
# lines and labels all share one Date x axis (as.Date() leaves a column
# that is already a Date unchanged). The holiday layers read from
# myHolidays, not Holidays, so only holidays within the span are drawn
# and the axis is not stretched by other years.
ggplot(two_year_span, aes(x = as.Date(date), y = n)) +
  geom_point(aes(col = wday)) +
  geom_vline(
    data = myHolidays,
    aes(xintercept = as.Date(date), col = wday(date, label = TRUE))
  ) +
  geom_text(
    data = myHolidays,
    # y stays inside aes() so it overrides the inherited y = n mapping,
    # which myHolidays has no column for.
    aes(x = as.Date(date), y = 9000, label = holiday),
    angle = 65
  ) +
  xlab("date") +
  ylab("total")
Question 7
# Question 7: attach holiday information to the daily totals.
# Both tables get their `date` column coerced to Date first so the join
# keys have the same class.
two_year_span$date <- as.Date(two_year_span$date)
Holidays$date <- as.Date(Holidays$date)

# No `by` is given: left_join() matches on the columns the two tables
# have in common and reports which ones it used.
join_holidays <- two_year_span %>%
  left_join(Holidays)
## Joining by: "date"
Question 8
# Question 8: flag each day as a holiday or not.
# Rows with no match in the holidays table have NA in `holiday`, so a
# non-missing value means the date is a holiday.
join_holidays_mutated <- join_holidays %>%
  mutate(is_holiday = ifelse(!is.na(holiday), "yes", "no"))

join_holidays_mutated
## Source: local data frame [731 x 6]
## Groups: date [731]
##
## date wday n holiday year is_holiday
## (date) (fctr) (int) (fctr) (int) (chr)
## 1 1983-01-01 Sat 8174 New Year's Day 1983 yes
## 2 1983-01-02 Sun 8085 NA NA no
## 3 1983-01-03 Mon 9523 NA NA no
## 4 1983-01-04 Tues 10094 NA NA no
## 5 1983-01-05 Wed 9966 NA NA no
## 6 1983-01-06 Thurs 9990 NA NA no
## 7 1983-01-07 Fri 9947 NA NA no
## 8 1983-01-08 Sat 8525 NA NA no
## 9 1983-01-09 Sun 8287 NA NA no
## 10 1983-01-10 Mon 9930 NA NA no
## .. ... ... ... ... ... ...
Question 9
# Question 9: daily births for 1983-1984 with holidays emphasised.
# Points are coloured by weekday and sized by the is_holiday flag.
# The holiday lines and labels read from myHolidays (the 1983-1984 subset)
# rather than the full Holidays table, so only holidays inside the plotted
# span appear. Their dates are mapped as Date values, matching the Date
# x axis of the point layer, instead of as raw numbers.
ggplot() +
  geom_point(
    data = join_holidays_mutated,
    aes(x = date, y = n, col = wday, size = is_holiday)
  ) +
  geom_vline(
    data = myHolidays,
    aes(xintercept = as.Date(date), col = wday(date, label = TRUE))
  ) +
  geom_text(
    data = myHolidays,
    aes(x = as.Date(date), y = 9000, label = holiday),
    angle = 65
  ) +
  ylab("total")
There are many outliers in the data, but the general trend is actually fairly uniform (I would say Thanksgiving is an exception to this pattern).