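The Births data frame has 7305 observations on the following 8 columns: `date`, `births`, `wday`, `year`, `month`, `day_of_year`, `day_of_month`, and `day_of_week`. (The CSV loaded below also carries a row-index column, `X`.)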
library(psych)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
I will address the questions below.
# source URL of the births CSV
theLink <- "https://raw.githubusercontent.com/amit-kapoor/r/master/data/births.csv"
# read the comma-separated file; its first row supplies the column names
df_births <- read.csv(theLink, sep = ",", header = TRUE)
# preview the first six rows
head(df_births, n = 6L)
## X date births wday year month day_of_year day_of_month day_of_week
## 1 1 1969-01-01 8486 Wed 1969 1 1 1 3
## 2 2 1969-01-02 9002 Thu 1969 1 2 2 4
## 3 3 1969-01-03 9542 Fri 1969 1 3 3 5
## 4 4 1969-01-04 8960 Sat 1969 1 4 4 6
## 5 5 1969-01-05 8390 Sun 1969 1 5 5 7
## 6 6 1969-01-06 9560 Mon 1969 1 6 6 1
# per-column summary of the births data frame
summary(object = df_births)
## X date births wday
## Min. : 1 1969-01-01: 1 Min. : 6675 Fri:1044
## 1st Qu.:1827 1969-01-02: 1 1st Qu.: 8792 Mon:1043
## Median :3653 1969-01-03: 1 Median : 9622 Sat:1044
## Mean :3653 1969-01-04: 1 Mean : 9649 Sun:1043
## 3rd Qu.:5479 1969-01-05: 1 3rd Qu.:10510 Thu:1044
## Max. :7305 1969-01-06: 1 Max. :12851 Tue:1043
## (Other) :7299 Wed:1044
## year month day_of_year day_of_month
## Min. :1969 Min. : 1.000 Min. : 1.0 Min. : 1.00
## 1st Qu.:1974 1st Qu.: 4.000 1st Qu.: 93.0 1st Qu.: 8.00
## Median :1979 Median : 7.000 Median :184.0 Median :16.00
## Mean :1979 Mean : 6.523 Mean :183.8 Mean :15.73
## 3rd Qu.:1984 3rd Qu.:10.000 3rd Qu.:275.0 3rd Qu.:23.00
## Max. :1988 Max. :12.000 Max. :366.0 Max. :31.00
##
## day_of_week
## Min. :1
## 1st Qu.:2
## Median :4
## Mean :4
## 3rd Qu.:6
## Max. :7
##
# descriptive statistics per column (n, mean, sd, skew, ...) via psych
psych::describe(df_births)
## vars n mean sd median trimmed mad min max
## X 1 7305 3653.00 2108.92 3653 3653.00 2707.23 1 7305
## date* 2 7305 3653.00 2108.92 3653 3653.00 2707.23 1 7305
## births 3 7305 9648.94 1127.32 9622 9648.80 1275.04 6675 12851
## wday* 4 7305 4.00 2.00 4 4.00 2.97 1 7
## year 5 7305 1978.50 5.77 1979 1978.50 7.41 1969 1988
## month 6 7305 6.52 3.45 7 6.53 4.45 1 12
## day_of_year 7 7305 183.75 105.62 184 183.82 134.92 1 366
## day_of_month 8 7305 15.73 8.80 16 15.72 11.86 1 31
## day_of_week 9 7305 4.00 2.00 4 4.00 2.97 1 7
## range skew kurtosis se
## X 7304 0.00 -1.20 24.67
## date* 7304 0.00 -1.20 24.67
## births 6176 0.02 -0.67 13.19
## wday* 6 0.00 -1.25 0.02
## year 19 0.00 -1.21 0.07
## month 11 -0.01 -1.21 0.04
## day_of_year 365 0.00 -1.20 1.24
## day_of_month 30 0.01 -1.19 0.10
## day_of_week 6 0.00 -1.25 0.02
# distinct values currently stored in the month column
unique(df_births[["month"]])
## [1] 1 2 3 4 5 6 7 8 9 10 11 12
# Replace each numeric month (1-12) with its three-letter English abbreviation.
# `month.abb` is base R's built-in c("Jan", ..., "Dec"), so indexing it by the
# month number recodes the whole column in one vectorized step and leaves it
# as a character vector.
# The guard keeps the step safe to re-run: once the column holds names it is
# no longer numeric and is left untouched.
if (is.numeric(df_births$month)) {
  df_births$month <- month.abb[df_births$month]
}
head(df_births)
## X date births wday year month day_of_year day_of_month day_of_week
## 1 1 1969-01-01 8486 Wed 1969 Jan 1 1 3
## 2 2 1969-01-02 9002 Thu 1969 Jan 2 2 4
## 3 3 1969-01-03 9542 Fri 1969 Jan 3 3 5
## 4 4 1969-01-04 8960 Sat 1969 Jan 4 4 6
## 5 5 1969-01-05 8390 Sun 1969 Jan 5 5 7
## 6 6 1969-01-06 9560 Mon 1969 Jan 6 6 1
# distinct month values once the names have been substituted
unique(df_births[["month"]])
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
## [12] "Dec"
# largest single-row value in the births column
max(df_births[["births"]])
## [1] 12851
# smallest single-row value in the births column
min(df_births[["births"]])
## [1] 6675
# total births per year: sum the births column within each year.
# The result has one row per year with columns `Category` (the year) and
# `x` (the summed births).
df_tot_birth_year <- aggregate(
  x = df_births$births,
  by = list(Category = df_births$year),
  FUN = sum
)
df_tot_birth_year
## Category x
## 1 1969 3598578
## 2 1970 3734606
## 3 1971 3563074
## 4 1972 3265863
## 5 1973 3145670
## 6 1974 3170306
## 7 1975 3153366
## 8 1976 3176266
## 9 1977 3332048
## 10 1978 3338197
## 11 1979 3499737
## 12 1980 3617948
## 13 1981 3635480
## 14 1982 3685374
## 15 1983 3642731
## 16 1984 3673531
## 17 1985 3765054
## 18 1986 3760682
## 19 1987 3813211
## 20 1988 3913786
# year(s) whose total births equal the largest yearly total
with(df_tot_birth_year, Category[x == max(x)])
## [1] 1988
# year(s) whose total births equal the smallest yearly total
with(df_tot_birth_year, Category[x == min(x)])
## [1] 1973
# total births per month: sum the births column within each month value.
# The result has one row per month with columns `Category` (the month) and
# `x` (the summed births).
df_tot_birth_month <- aggregate(
  x = df_births$births,
  by = list(Category = df_births$month),
  FUN = sum
)
df_tot_birth_month
## Category x
## 1 Apr 5560775
## 2 Aug 6309764
## 3 Dec 5904376
## 4 Feb 5362585
## 5 Jan 5759165
## 6 Jul 6220295
## 7 Jun 5758571
## 8 Mar 5868140
## 9 May 5785220
## 10 Nov 5695273
## 11 Oct 6055372
## 12 Sep 6205972
# month(s) whose total births equal the largest monthly total
with(df_tot_birth_month, Category[x == max(x)])
## [1] "Aug"
# month(s) whose total births equal the smallest monthly total
with(df_tot_birth_month, Category[x == min(x)])
## [1] "Feb"
# scatter plot of births (y axis) against year (x axis), so the orientation
# matches the "Births vs Year" title and the ggplot of the same data below
plot(
  x = df_births$year, y = df_births$births,
  xlab = "Year", ylab = "Births",
  main = "Births vs Year", col = "red"
)
# births against year with ggplot2; aes() takes bare column names that are
# looked up in the data frame passed to ggplot(), so the `df_births$` prefix
# is dropped (it is discouraged and leaks into the axis labels)
ggplot(df_births, aes(x = year, y = births)) +
  geom_point()
# histogram of births with counts on the y axis
hist(x = df_births$births, col = "green", freq = TRUE)
# histogram of births with densities on the y axis
hist(x = df_births$births, col = "blue", freq = FALSE)
# distribution of births within each year, one box per year
boxplot(
  df_births$births ~ df_births$year,
  col = "orange"
)
# distribution of births within each month, one box per month.
# The month column holds abbreviations as plain text, which boxplot() would
# order alphabetically (Apr, Aug, Dec, ...); converting to a factor with
# levels = month.abb draws the boxes in calendar order instead.
boxplot(
  df_births$births ~ factor(df_births$month, levels = month.abb),
  col = "yellow",
  xlab = "Month",
  ylab = "Births"
)
I have examined the births data across the years and months available in the data set. The distribution of births shown above is very close to normal and does not appear to be much skewed (the skew reported by `describe` is 0.02).