Introduction

#Load libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(readr)
library(magrittr)
#Load and view dataset
df<- read_csv("pop_dataset_0002.txt")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   region = col_character(),
##   age = col_double(),
##   gender = col_character(),
##   population = col_double()
## )
head(df)
## # A tibble: 6 x 4
##   region     age gender population
##   <chr>    <dbl> <chr>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120

#1 Using the correct code, check the data by completing the steps below. Then, answer each question.

1.4 What is the minimum age bin?

min(df$age)
## [1] 0

1.5 What is the maximum age bin?

max(df$age)
## [1] 55

1.6 What is the bin size for the age field?

count(distinct(df,age))
## # A tibble: 1 x 1
##       n
##   <int>
## 1    56

2 Perform the following data analysis by looking at some descriptive statistics on the complete dataset:

#Create new dataframe
df1<-df
head(df1)
## # A tibble: 6 x 4
##   region     age gender population
##   <fct>    <dbl> <fct>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120

2.1 Use the expected value for the age to find the mean age for the whole data sample.

#Create new variable: agepop to multiply age by population
df1$agepop <-(df1$age * df1$population)
#Divide total sum of ages (agepop) by the total pop to find the mean (rounded to 3)
mean<-sum(df1$agepop)/sum(df1$population)
print(paste('Mean age of total population:', mean))
## [1] "Mean age of total population: 27.8002663266396"
mean<-round(mean, digits = 3)
print(paste('Mean age of total population (rounded):', mean))
## [1] "Mean age of total population (rounded): 27.8"

2.2 Provide the standard deviation for the whole data sample.

#Subtract the mean from every number and square the result
df1$num_minus_mean <-df1$age-mean
df1$sqr_result <-df1$num_minus_mean*df1$num_minus_mean
#Sum of Squared Differences divided by the Count
variance<-sum(df1$sqr_result)/sum(df1$population-1)
#Find the square root of the variance
print(paste('SD for the whole data sample:', sqrt(variance)))
## [1] "SD for the whole data sample: 4.44709795287088"
print(paste('SD for the whole data sample (rounded):', round(sqrt(variance), digits=3)))
## [1] "SD for the whole data sample (rounded): 4.447"

3 Record the following statistics on the means from each region:

3.1 mean

df2<- df
head(df2)
## # A tibble: 6 x 4
##   region     age gender population
##   <fct>    <dbl> <fct>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120
#Create variable "agepop" to multiply age by population
df2$agepop<-df2$age * df2$population
head(df2)
## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC21184     0 M             114      0
## 2 SSC21184     0 F              95      0
## 3 SSC21184     1 M              88     88
## 4 SSC21184     1 F             107    107
## 5 SSC21184     2 M             122    244
## 6 SSC21184     2 F             120    240
#Group region by total population and by age 
regionpop <-aggregate(population~region,df2,sum)
head(regionpop[order(regionpop$population),])
##      region population
## 10 SSC20099          3
## 14 SSC20127          3
## 17 SSC20151          3
## 51 SSC20346          3
## 62 SSC20383          3
## 66 SSC20398          3
regionagepop <-aggregate(agepop~region,df2,sum)
head(regionagepop[order(regionagepop$agepop),])
##       region agepop
## 384 SSC22193      6
## 164 SSC21026      9
## 51  SSC20346     18
## 17  SSC20151     21
## 71  SSC20422     27
## 83  SSC20503     33
#Join both created tables and divide agepop by pop to get mean by region (rounded to 2)
joined<-full_join(regionpop, regionagepop, by='region')
joined$mean<-joined$agepop/joined$population
joined$mean<-round(joined$mean, digits=1)
head(joined[order(joined$mean),])
##       region population agepop mean
## 384 SSC22193          3      6  2.0
## 164 SSC21026          3      9  3.0
## 201 SSC21211          6     33  5.5
## 282 SSC21701          6     33  5.5
## 51  SSC20346          3     18  6.0
## 17  SSC20151          3     21  7.0
#Find the mean of the region means
round(mean(joined$mean), digits=3)
## [1] 30.609
#Summary statistics
summary(joined$mean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   27.40   29.20   30.61   33.33   55.00

3.2 Standard deviation of the region means

round(sd(joined$mean), digits=3)
## [1] 7.997

3.3 Minimum of the region means

min (joined$mean)
## [1] 2

3.4 First quartile of the region means

quantile(joined$mean, 0.25)
##  25% 
## 27.4

3.5 median

median(joined$mean)
## [1] 29.2

3.6 third quartile

quantile(joined$mean, 0.75)
##    75% 
## 33.325

3.7 maximum

max(joined$mean)
## [1] 55

3.8 interquartile range

IQR(joined$mean)
## [1] 5.925

3.9 histogram plot of the distribution of means from each region.

gg<-ggplot(data=joined, aes(x=mean)) + geom_histogram(binwidth=1,
   color="black", 
   fill="dodgerblue3",
   bins=56) +
   ggtitle("Mean Age Distribution by Region") +
   xlab("Mean Age") +
   geom_vline(aes(xintercept=mean(mean, na.rm=T)),                                         color="red", linetype="dashed", size=1)
gg

#4 Consider the region with the smallest population: ## 4.1 Identify the region that has the least people and how many it has. If there is more than one such region, please select one.

#sum each population and filter down to min population (3)
regionpop <-aggregate(population~region,df2,sum)
minpop<-min(regionpop$population)
head(regionpop%>%filter(population==minpop))
##     region population
## 1 SSC20099          3
## 2 SSC20127          3
## 3 SSC20151          3
## 4 SSC20346          3
## 5 SSC20383          3
## 6 SSC20398          3

5 Consider the region with the largest population:

5.1 Plot the distribution of ages for the region with the most people.

#Find region with largest population
regionpop <-aggregate(population~region,df2,sum)
maxpop<-max(regionpop$population)
maxpopregion<-regionpop%>%filter(population==maxpop)
maxpopregion
##     region population
## 1 SSC22015      37948
#Filter to region SSC22015
filtered<-df2%>%
  filter(region=="SSC22015")
head(filtered)
## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC22015     0 M             455      0
## 2 SSC22015     0 F             423      0
## 3 SSC22015     1 M             492    492
## 4 SSC22015     1 F             479    479
## 5 SSC22015     2 M             465    930
## 6 SSC22015     2 F             453    906
# Sum population
sumagepop <-aggregate(population~age,filtered,sum)
head(sumagepop)
##   age population
## 1   0        878
## 2   1        971
## 3   2        918
## 4   3        975
## 5   4        965
## 6   5        847
# plot region
plot(sumagepop)

5.2 Plot cumulative distribution for the region with the most people.

sumagepop$cdf <- cumsum(sumagepop$population)
plot(sumagepop$age, sumagepop$cdf)

5.3 Plot the cumulative distribution for males and females on the same plot.

sumgenderpop<-aggregate(population~gender+age, filtered, sum)
sumgenderpop$cdf <- cumsum(sumgenderpop$population)
qplot(sumgenderpop$age, sumgenderpop$cdf)

8 Imagine you have enough resources for two events to launch a new product:

8.1 Select a gender and age group which spans 3 to 5 years. This will be the primary customers for your hypothetical product.

8.2 Which two regions would you start with and why?

I would choose SSC22015 and SSC22569 as they have the highest population of boys in this age group. Both regions also have a higher ratio of boys to girls.

#Filter to boys and girls 2 years and under
baby_boy<-df2%>%group_by(region)%>%filter(gender=="M", age<=3)%>%summarise(male=sum(population))
baby_boy<-baby_boy[order(-baby_boy$male),]

baby_girl<-df2%>%group_by(region)%>%filter(gender=="F", age<=3)%>%summarise(female=sum(population))
#join the tables, exclude any regions with 0 for either gender, sort descending

babies<-left_join(baby_boy, baby_girl, by='region')
babies$totalpop<-babies$male+babies$female
babies$ratio<-round(babies$male/babies$female, digits=3)
babies<-babies%>%filter(male>0 & female>0)
babies<-babies[order(-babies$male),]
head(babies,2)
## # A tibble: 2 x 5
##   region    male female totalpop ratio
##   <fct>    <dbl>  <dbl>    <dbl> <dbl>
## 1 SSC22015  1890   1852     3742  1.02
## 2 SSC22569  1165   1113     2278  1.05

9 Demonstrate the following:

9.1 The central limit theorem by selecting k = 100 samples of size n age values each from the cumulative distribution for the region with the most people (Q5.1).

9.2 Repeat this process for n = 1, 10, 100 and 1000 observations.

9.3 Plot the resulting distribution along with a normal distribution, using the mean and standard deviation for each sample size given in Q9.2

#Filter to region SSC22015
filtered<-df2%>%
  filter(region=="SSC22015")
head(filtered)
## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC22015     0 M             455      0
## 2 SSC22015     0 F             423      0
## 3 SSC22015     1 M             492    492
## 4 SSC22015     1 F             479    479
## 5 SSC22015     2 M             465    930
## 6 SSC22015     2 F             453    906
#create a list of the (repeated in list by the population for each age) 
list<-rep(filtered$age, times = filtered$agepop)

# set sample size
n_sample <- 1000

# set number of times the samples will be summed
n_sum <- 100


# set the cumulative list to zero then loop and attach CLT results
clist<-NULL
for (i in seq(1, n_sum)){
  result<-mean(sample(list, n_sum))
  clist<-c(clist, result)
}
clist
##   [1] 35.90 36.87 35.16 35.70 35.45 33.23 34.43 36.25 35.23 35.72 34.31 35.43
##  [13] 32.72 35.47 36.32 34.71 33.04 35.69 34.90 36.84 32.36 34.11 33.96 34.92
##  [25] 32.93 34.74 36.35 35.49 33.63 35.59 35.74 37.13 34.64 34.98 35.12 36.15
##  [37] 35.26 34.99 33.64 34.81 34.70 35.25 33.71 34.90 33.24 35.62 35.53 36.61
##  [49] 35.25 34.39 37.40 35.72 35.00 33.28 35.28 35.90 35.31 35.53 33.69 35.60
##  [61] 35.74 37.52 33.68 36.18 38.25 35.22 34.99 34.90 36.60 35.54 34.06 37.76
##  [73] 36.14 35.54 36.50 34.50 35.11 35.68 32.83 35.75 35.45 34.28 33.78 35.89
##  [85] 36.46 36.75 36.74 36.44 33.76 33.97 33.45 34.90 33.62 33.89 35.96 34.64
##  [97] 35.65 36.15 34.67 35.52

add a normal distribution curve for comparison

mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  

title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Age', y = 'Frequency', title = title_txt)
gg