Load the libraries
library(ggplot2)
library(ggthemes)
library(stringr)
# Make the solarized theme, with a base font size of 20, the default for
# every plot created after this point
theme_set(theme_solarized(base_size = 20))
Load the birthdays CSV data into a data frame
# Read the birthdays CSV into a data frame.
# header = TRUE: the first line of the file holds the variable names.
# strip.white = TRUE: trim leading and trailing white space from unquoted
# fields.
birthdays <- read.csv("birthdays.csv", header = TRUE, strip.white = TRUE)
# Print a per-column overview of what was loaded
summary(birthdays)
## Title Start
## Abhishek Ramesh Keshav's birthday: 1 9/18/2015 0:00 : 3
## Adarsh Harindra nath's birthday : 1 10/26/2015 0:00: 2
## Aishwarya's birthday : 1 12/26/2015 0:00: 2
## Anthony Carfang's birthday : 1 5/18/2015 0:00 : 2
## Arjun GN's birthday : 1 5/28/2015 0:00 : 2
## Baady's birthday : 1 6/21/2015 0:00 : 2
## (Other) :42 (Other) :35
## End Duration
## 9/19/2015 0:00 : 3 24:00:00:48
## 10/27/2015 0:00: 2
## 12/27/2015 0:00: 2
## 5/19/2015 0:00 : 2
## 5/29/2015 0:00 : 2
## 6/22/2015 0:00 : 2
## (Other) :35
Strip the time and get the date
# Parse the leading "month/day/year" part of each timestamp into a Date.
# Only the date fields are named in the format, so the trailing time of
# day is not parsed and is dropped.
# Date is used instead of the POSIXlt object that strptime() returns:
# POSIXlt is a list of broken-down time fields, which is awkward to keep
# as a data frame column, and a plain Date carries no time zone.
birthdays$Start <- as.Date(birthdays$Start, format = "%m/%d/%Y")
birthdays$End <- as.Date(birthdays$End, format = "%m/%d/%Y")
# Format the date to get the day of the month of each birthday
format(birthdays$Start, "%d")
## [1] "20" "29" "02" "15" "26" "05" "06" "29" "17" "22" "29" "06" "16" "18"
## [15] "18" "22" "28" "28" "01" "14" "21" "21" "23" "28" "01" "09" "12" "12"
## [29] "15" "31" "17" "18" "18" "18" "27" "14" "24" "26" "26" "28" "06" "10"
## [43] "01" "05" "26" "26" "05" "06"
# Show the two-digit month number of every birthday
format(birthdays$Start, format = "%m")
## [1] "01" "01" "02" "02" "02" "03" "03" "03" "04" "04" "04" "05" "05" "05"
## [15] "05" "05" "05" "05" "06" "06" "06" "06" "06" "07" "08" "08" "08" "08"
## [29] "08" "08" "09" "09" "09" "09" "09" "10" "10" "10" "10" "10" "11" "11"
## [43] "12" "12" "12" "12" "01" "01"
# Store the month number and the day of the month of each birthday as
# numeric columns so they can be plotted and tabulated
birthdays[["month"]] <- as.numeric(format(birthdays[["Start"]], format = "%m"))
birthdays[["day"]] <- as.numeric(format(birthdays[["Start"]], format = "%d"))
Create histograms
# Histogram of the number of birthdays in each calendar month.
# The column is named bare inside aes() so that ggplot2 looks it up in
# `birthdays` instead of capturing an outside vector with `$`.
# The x limits are padded by half a bin on each side: with binwidth = 1 and
# ggplot2's default bin alignment the first and last bars extend to 0.5 and
# 12.5, and limits of exactly c(1, 12) would censor those two bars.
monthHistogram <- ggplot(birthdays, aes(x = month)) +
  geom_histogram(color = "black", fill = "#48CCDD", binwidth = 1) +
  scale_x_continuous(limits = c(0.5, 12.5), breaks = seq(1, 12, 1)) +
  xlab("Month") +
  ylab("Counts for the Birthday Month")
# Display the plot
monthHistogram

# Save this specific plot rather than whichever plot happens to be the last
ggsave("birthdaymonth.jpg", plot = monthHistogram)
## Saving 7 x 5 in image
# Quartiles, mean and range of the birthday month numbers
summary(birthdays[["month"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 4.000 6.000 6.583 9.000 12.000
# Histogram of the number of birthdays on each day of the month.
# The column is named bare inside aes() so that ggplot2 looks it up in
# `birthdays` instead of capturing an outside vector with `$`.
# The x limits are padded by half a bin on each side: with binwidth = 1 and
# ggplot2's default bin alignment the first and last bars extend to 0.5 and
# 31.5, and limits of exactly c(1, 31) would censor those two bars.
dayHistogram <- ggplot(birthdays, aes(x = day)) +
  geom_histogram(color = "black", fill = "#48CCDD", binwidth = 1) +
  scale_x_continuous(limits = c(0.5, 31.5), breaks = seq(1, 31, 1)) +
  xlab("Day") +
  ylab("Counts for the Day of Birth")
# Display the plot
dayHistogram

# Save this specific plot rather than whichever plot happens to be the last
ggsave("dayofbirthday.jpg", plot = dayHistogram)
## Saving 7 x 5 in image
Which month contains the most birthdays?
# Month of every birthday as a factor whose levels are the twelve calendar
# months in order. The labels come from the built-in English constant
# month.abb, indexed by the numeric month column, so the result does not
# depend on the locale used to format dates.
birthMonthTable <- factor(month.abb[birthdays$month], levels = month.abb)
# Count birthdays per month. All twelve levels are tabulated, including
# months with no birthdays, so a position in this table is always the
# month number.
monthCounts <- table(birthMonthTable)
# Month number(s) with the highest count; ties are all kept
mostCommonMonth <- which(monthCounts == max(monthCounts))
# Print the most common month
month.abb[mostCommonMonth]
## [1] "May"
How many birthdays are in each month?
# Show the number of birthdays in each month, January through December
print(table(birthMonthTable))
## birthMonthTable
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 4 3 3 3 7 5 1 6 5 5 2 4
Which day of the year has the most birthdays?
# Count birthdays per calendar date, keyed as "MM-DD". Both the month and
# the day are needed: counting on the day of the month alone would merge,
# for example, 18 May with 18 September and answer "which day number"
# instead of "which day of the year".
birthDayTable <- table(format(birthdays$Start, "%m-%d"))
# Date(s) with the highest count; ties are all kept
mostCommonDay <- names(birthDayTable)[birthDayTable == max(birthDayTable)]
# Print the most common date(s) in "MM-DD" form
mostCommonDay
## [1] "09-18"
Do you have at least 365 friends who have birthdays on every day of the year?
# NO, my data set was too small as it had only 48 observations