Note: I am using the sample data because I am worried about invasion of privacy using real Facebook data.
# Load the sample birthday data. stringsAsFactors = TRUE is spelled out because
# R >= 4.0.0 reads strings as character by default; the factor column is what
# the str() output below shows and what the first plot relies on.
bd <- read.csv(
  "../birthdaysExample.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
str(bd)
## 'data.frame': 1033 obs. of 1 variable:
## $ dates: Factor w/ 348 levels "1/1/14","1/10/14",..: 78 258 322 219 131 241 33 46 287 331 ...
# Quick look at the raw column: one bar per factor level (i.e. per date string).
plot(bd, xlab = "Birthdays", ylab = "Count", main = "Test Birthday Plot")
We start out by just seeing what kind of data is there. A simple plot shows that the dates are not in date order but in string order. Note that the given birthdays are all in the same year, so no age distributions can be computed. The maximum number of birthdays on a given day is 8, and 3 days reach that maximum.
We need to convert the factors to actual dates, and it will be convenient to add month and day columns:
# Keep a date-time copy of the raw date strings. This must run before
# bd$dates is overwritten below. strptime() returns POSIXlt (a list-based
# class), so the result is converted to POSIXct, which is the date-time class
# suited to data frame columns.
bd$ddates <- as.POSIXct(strptime(bd$dates, format = "%m/%d/%y"))
# Replace the original column with real Date values.
bd$dates <- as.Date(bd$dates, format = "%m/%d/%y")
# Numeric month and day-of-month helper columns derived from the Date column.
bd$months <- as.numeric(format(bd$dates, "%m"))
bd$days <- as.numeric(format(bd$dates, "%d"))
head(bd)
## dates ddates months days
## 1 2014-11-25 2014-11-25 11 25
## 2 2014-06-08 2014-06-08 6 8
## 3 2014-09-12 2014-09-12 9 12
## 4 2014-05-26 2014-05-26 5 26
## 5 2014-02-20 2014-02-20 2 20
## 6 2014-06-19 2014-06-19 6 19
summary(bd)
## dates ddates months
## Min. :2014-01-01 Min. :2014-01-01 00:00:00 Min. : 1.000
## 1st Qu.:2014-03-28 1st Qu.:2014-03-28 00:00:00 1st Qu.: 3.000
## Median :2014-07-02 Median :2014-07-02 00:00:00 Median : 7.000
## Mean :2014-06-30 Mean :2014-06-30 10:19:27 Mean : 6.474
## 3rd Qu.:2014-09-28 3rd Qu.:2014-09-28 00:00:00 3rd Qu.: 9.000
## Max. :2014-12-31 Max. :2014-12-31 00:00:00 Max. :12.000
## days
## Min. : 1.0
## 1st Qu.: 8.0
## Median :16.0
## Mean :15.7
## 3rd Qu.:23.0
## Max. :31.0
I don’t want to take up space here with a full table of birthdays; it can be produced with `table(bd$dates)`.
The month with the most birthdays is March (98):
# Count birthdays per month. Tabulating a factor with all twelve levels keeps
# a zero-count cell for any month that has no birthdays, so assigning
# month.name below cannot fail on a length mismatch.
monthTable <- table(factor(bd$months, levels = seq_along(month.name)))
names(monthTable) <- month.name
# Month(s) holding the highest count; ties are all reported.
which(monthTable == max(monthTable))
## March
## 3
table(bd$months)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 89 79 98 81 72 93 86 91 96 89 87 72
# The raw table above is labelled with month numbers, not month names.
monthTable
## January February March April May June July
## 89 79 98 81 72 93 86
## August September October November December
## 91 96 89 87 72
# monthTable has month names. Two plots follow: the simplest one first, then a
# more complex ggplot version.
plot(monthTable)
# Bar chart of birthdays per month. geom_bar() counts rows per discrete x
# value; geom_histogram() bins a continuous x and rejects a factor in current
# ggplot2. The month labels are attached through the factor itself (and unused
# levels are kept) so each bar keeps its correct name even if a month has no
# birthdays.
# NOTE(review): assumes ggplot2 is attached earlier in the document.
ggplot(
  data = bd,
  aes(x = factor(months, levels = seq_along(month.name), labels = month.name))
) +
  geom_bar(colour = "darkgreen", fill = "white") +
  scale_x_discrete(drop = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(title = "Birthday Count by Month") +
  xlab("Month")
We saw from the initial plot that the maximum number of birthdays per day is 8. The dates that reach this maximum are:
# Tabulate birthdays per calendar date, then list every date whose count
# equals the overall maximum (names are the dates, values are their positions
# in the table).
dateTable <- table(bd[["dates"]])
max_per_day <- max(dateTable)
which(dateTable == max_per_day)
## 2014-02-06 2014-05-22 2014-07-16
## 37 135 188
No, not every day of the year has a birthday. The code below finds the days that have none.
# http://stackoverflow.com/questions/1330989/rotating-and-spacing-axis-labels-in-ggplot2
# Convert day of year to date format:
# https://stat.ethz.ch/pipermail/r-help/2012-March/308013.html
# Birthdays per calendar date; only dates with at least one birthday appear.
bt <- table(bd$dates)
length(names(bt))
## [1] 348
# Every calendar day of the (single) year covered by the data. Building the
# sequence from its end points avoids hard-coding the number of days.
all_days <- seq(as.Date("2014-01-01"), as.Date("2014-12-31"), by = "day")
# Number of days on which nobody has a birthday.
missing <- length(all_days) - length(names(bt))
missing
## [1] 17
# Days of the year that do not occur among the tabulated dates. The table
# names are the dates formatted as character, hence the as.character() match.
onlynulldates <- all_days[!(as.character(all_days) %in% names(bt))]
print("There are no birthdays on")
## [1] "There are no birthdays on"
onlynulldates
## [1] "2014-02-08" "2014-02-21" "2014-02-22" "2014-03-06" "2014-04-16"
## [6] "2014-04-21" "2014-05-03" "2014-05-24" "2014-06-26" "2014-08-03"
## [11] "2014-08-06" "2014-08-23" "2014-11-11" "2014-11-13" "2014-12-06"
## [16] "2014-12-13" "2014-12-23"
# Bar chart of birthdays per day of the month. geom_bar() counts rows per
# discrete x value; geom_histogram() bins a continuous x and rejects a factor
# in current ggplot2.
# NOTE(review): assumes ggplot2 is attached earlier in the document.
ggplot(data = bd, aes(x = as.factor(days))) +
  geom_bar(colour = "darkgreen", fill = "white") +
  labs(title = "Birthday Count by Day of Month") +
  xlab("Day")