# NOTE(review): hard-coded absolute working directory. The relative paths used
# later in this script ('FBbd.csv', 'BD_num.png') resolve against it, so the
# script only runs unchanged on a machine with this folder layout.
setwd("D:/R/Udacity/EDA_Course_Materials/lesson3")
library(rmarkdown)
# ggplot2 supplies qplot(), the scale_*_continuous() helpers and ggsave().
library(ggplot2)
# dplyr supplies group_by(), summarise() and select().
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Which month contains the most birthdays?
How many birthdays are in each month?
Which day of the year has the most birthdays?
Do I have at least 365 friends whose birthdays fall on every day of the year?
# Load the exported birthday events.
csvdata <- read.csv('FBbd.csv')
# Parse the Start column ("day/month/year hour:minute") into a POSIXlt object,
# whose components (mday, mon, yday, ...) can be read individually.
bdates <- strptime(csvdata$Start, format = "%d/%m/%Y %H:%M")
names(unclass(bdates))
## [1] "sec" "min" "hour" "mday" "mon" "year" "wday"
## [8] "yday" "isdst" "zone" "gmtoff"
# Day of month is already 1-based; POSIXlt stores month and day of year
# 0-based, so shift both by one (adding the double 1 also yields doubles).
days <- bdates$mday
mon <- bdates$mon + 1
yday <- bdates$yday + 1
# One row per birthday; the constant `count` column is summed when grouping.
df <- data.frame(days = days, mon = mon, yday = yday, count = 1)
The histogram shows that most of my friends have birthdays between March and July, with March being the most common month.
# Histogram of birthdays per month (one bin per month number).
# ggplot() + geom_histogram() replaces the deprecated qplot() shorthand, and
# `limits` is spelled out instead of relying on partial matching of `lim`.
month_plot <- ggplot(df, aes(x = mon)) +
  geom_histogram(binwidth = 1, colour = 'black', fill = '#A41470') +
  ggtitle('Birthdays per month') +
  xlab('Months') +
  ylab('Number of birthdays in month') +
  scale_x_continuous(breaks = seq(1, 12, 1), limits = c(1, 13)) +
  scale_y_continuous(breaks = seq(0, 30, 2), limits = c(0, 30))
# The plot is now stored in a variable, so it must be printed explicitly.
print(month_plot)
# Pass the plot explicitly so the saved image does not depend on whichever
# plot happened to be created or displayed last.
ggsave('BD_num.png', plot = month_plot)
## Saving 7 x 5 in image
# Group the birthdays by month number, then total the per-row counter to get
# the number of birthdays in each month (printed as a 12-row table).
by_mon <- df %>% group_by(mon)
by_mon %>% summarise(sum(count))
## Source: local data frame [12 x 2]
##
## mon sum(count)
## 1 1 20
## 2 2 15
## 3 3 29
## 4 4 25
## 5 5 27
## 6 6 26
## 7 7 25
## 8 8 20
## 9 9 14
## 10 10 14
## 11 11 21
## 12 12 16
Among my friends, the most common birthday is 1 May.
# Count birthdays per calendar day and label each day in "day-month" format.
# `by_yday` (grouped by day of year) is kept because the day-of-year histogram
# further down uses it as its data source.
by_yday <- group_by(df, yday)
# Build the "dd-mm" label from the stored day-of-month and month instead of
# round-tripping the day of year through strptime(format = "%j"): that call
# fills in the *current* year, so every label after February shifts by one day
# whenever the run year and the data year differ in leap-year status.
df_yday <- ungroup(summarise(group_by(df, mon, days), total = sum(count)))
df_yday$yday <- sprintf("%02d-%02d", df_yday$days, df_yday$mon)
# Keep the same two columns the rest of the script expects.
df_yday <- df_yday[, c("yday", "total")]
# Most frequent birthdays first.
ordered <- df_yday[order(-df_yday$total), ]
I sorted my friends' birthdays by date. The final table has only 252 observations, fewer than the 365 days in a year, so my friends' birthdays cannot fall on every day of the year.
# Birthdays as (day, month) pairs, sorted by month and then by day of month.
bd <- select(df, days, mon)
bd_ordered <- bd[order(bd$mon, bd$days), ]
# Renumber the rows from the actual row count; the previous hard-coded 1:252
# raised an error as soon as the input file had a different number of rows.
row.names(bd_ordered) <- seq_len(nrow(bd_ordered))
bd_ordered
## days mon
## 1 1 1
## 2 1 1
## 3 1 1
## 4 1 1
## 5 6 1
## 6 6 1
## 7 11 1
## 8 11 1
## 9 16 1
## 10 17 1
## 11 19 1
## 12 22 1
## 13 22 1
## 14 23 1
## 15 23 1
## 16 24 1
## 17 27 1
## 18 29 1
## 19 30 1
## 20 31 1
## 21 2 2
## 22 5 2
## 23 8 2
## 24 10 2
## 25 15 2
## 26 19 2
## 27 19 2
## 28 19 2
## 29 19 2
## 30 20 2
## 31 20 2
## 32 21 2
## 33 22 2
## 34 22 2
## 35 24 2
## 36 2 3
## 37 5 3
## 38 5 3
## 39 6 3
## 40 7 3
## 41 8 3
## 42 9 3
## 43 10 3
## 44 11 3
## 45 11 3
## 46 11 3
## 47 12 3
## 48 13 3
## 49 15 3
## 50 15 3
## 51 15 3
## 52 17 3
## 53 17 3
## 54 17 3
## 55 18 3
## 56 20 3
## 57 21 3
## 58 21 3
## 59 23 3
## 60 24 3
## 61 24 3
## 62 27 3
## 63 28 3
## 64 29 3
## 65 1 4
## 66 2 4
## 67 4 4
## 68 4 4
## 69 7 4
## 70 8 4
## 71 12 4
## 72 12 4
## 73 15 4
## 74 16 4
## 75 16 4
## 76 16 4
## 77 16 4
## 78 18 4
## 79 19 4
## 80 21 4
## 81 21 4
## 82 22 4
## 83 22 4
## 84 23 4
## 85 24 4
## 86 26 4
## 87 29 4
## 88 29 4
## 89 29 4
## 90 1 5
## 91 1 5
## 92 1 5
## 93 1 5
## 94 1 5
## 95 2 5
## 96 2 5
## 97 8 5
## 98 8 5
## 99 8 5
## 100 9 5
## 101 9 5
## 102 10 5
## 103 11 5
## 104 11 5
## 105 17 5
## 106 19 5
## 107 19 5
## 108 20 5
## 109 20 5
## 110 22 5
## 111 24 5
## 112 25 5
## 113 25 5
## 114 25 5
## 115 31 5
## 116 31 5
## 117 1 6
## 118 1 6
## 119 2 6
## 120 2 6
## 121 3 6
## 122 3 6
## 123 4 6
## 124 5 6
## 125 7 6
## 126 12 6
## 127 12 6
## 128 13 6
## 129 13 6
## 130 13 6
## 131 14 6
## 132 15 6
## 133 16 6
## 134 21 6
## 135 22 6
## 136 24 6
## 137 25 6
## 138 25 6
## 139 26 6
## 140 27 6
## 141 29 6
## 142 29 6
## 143 3 7
## 144 3 7
## 145 5 7
## 146 6 7
## 147 7 7
## 148 8 7
## 149 10 7
## 150 11 7
## 151 12 7
## 152 15 7
## 153 16 7
## 154 16 7
## 155 17 7
## 156 17 7
## 157 17 7
## 158 19 7
## 159 20 7
## 160 20 7
## 161 22 7
## 162 23 7
## 163 24 7
## 164 26 7
## 165 28 7
## 166 28 7
## 167 30 7
## 168 1 8
## 169 4 8
## 170 5 8
## 171 6 8
## 172 8 8
## 173 10 8
## 174 10 8
## 175 11 8
## 176 17 8
## 177 18 8
## 178 18 8
## 179 18 8
## 180 19 8
## 181 23 8
## 182 24 8
## 183 26 8
## 184 28 8
## 185 28 8
## 186 28 8
## 187 31 8
## 188 10 9
## 189 12 9
## 190 15 9
## 191 17 9
## 192 17 9
## 193 19 9
## 194 20 9
## 195 20 9
## 196 20 9
## 197 22 9
## 198 23 9
## 199 25 9
## 200 26 9
## 201 27 9
## 202 2 10
## 203 4 10
## 204 5 10
## 205 9 10
## 206 9 10
## 207 12 10
## 208 13 10
## 209 15 10
## 210 15 10
## 211 17 10
## 212 17 10
## 213 18 10
## 214 18 10
## 215 21 10
## 216 3 11
## 217 3 11
## 218 4 11
## 219 4 11
## 220 5 11
## 221 5 11
## 222 5 11
## 223 5 11
## 224 6 11
## 225 7 11
## 226 10 11
## 227 11 11
## 228 12 11
## 229 13 11
## 230 13 11
## 231 17 11
## 232 20 11
## 233 22 11
## 234 28 11
## 235 28 11
## 236 30 11
## 237 6 12
## 238 6 12
## 239 8 12
## 240 10 12
## 241 13 12
## 242 15 12
## 243 17 12
## 244 21 12
## 245 21 12
## 246 23 12
## 247 23 12
## 248 25 12
## 249 27 12
## 250 28 12
## 251 29 12
## 252 31 12
# Histogram of birthdays by day of year (one bin per day).
# ggplot() + geom_histogram() replaces the deprecated qplot() shorthand, and
# `limits` is spelled out instead of relying on partial matching of `lim`.
ggplot(by_yday, aes(x = yday)) +
  geom_histogram(binwidth = 1, fill = '#A41470') +
  ggtitle('Birthdays during one year') +
  xlab('Days of year') +
  ylab('Number of birthday in that day') +
  scale_x_continuous(breaks = seq(0, 365, 30), limits = c(1, 366))