Introduction

#Load libraries
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(readr)
library(magrittr)

#Load and view dataset
df<- read_csv("pop_dataset_0002.txt")

## 
## -- Column specification --------------------------------------------------------
## cols(
##   region = col_character(),
##   age = col_double(),
##   gender = col_character(),
##   population = col_double()
## )

head(df)

## # A tibble: 6 x 4
##   region     age gender population
##   <chr>    <dbl> <chr>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120

#1 Using the correct code, check the data by completing the steps below. Then, answer each question.

1.1 Print out the dimensions of the data frame.

dim(df)

## [1] 56000     4

print(paste('Number of rows:', nrow(df)))

## [1] "Number of rows: 56000"

print(paste('Number of columns:', ncol(df)))

## [1] "Number of columns: 4"

1.2 Print out the names and type of each of the data frame’s columns.

sapply(df, class)

##      region         age      gender  population 
## "character"   "numeric" "character"   "numeric"

#Reassign characters as factors
df$region<-as.factor(df$region)
df$gender<-as.factor(df$gender)
sapply(df, class)

##     region        age     gender population 
##   "factor"  "numeric"   "factor"  "numeric"

1.3 Print out the number of unique regions in the dataset.

print(paste('Number of unique regions:', length(unique(df$region))))

## [1] "Number of unique regions: 500"

1.4 What is the minimum age bin?

min(df$age)

## [1] 0

1.5 What is the maximum age bin?

max(df$age)

## [1] 55

1.6 What is the bin size for the age field?

count(distinct(df,age))

## # A tibble: 1 x 1
##       n
##   <int>
## 1    56

2 Perform the following data analysis by looking at some descriptive statistics on the complete dataset:

#Create new dataframe
df1<-df
head(df1)

## # A tibble: 6 x 4
##   region     age gender population
##   <fct>    <dbl> <fct>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120

2.1 Use the expected value for the age to find the mean age for the whole data sample.

#Create new variable: agepop to multiply age by population
df1$agepop <-(df1$age * df1$population)

#Divide total sum of ages (agepop) by the total pop to find the mean (rounded to 3)
mean<-sum(df1$agepop)/sum(df1$population)
print(paste('Mean age of total population:', mean))

## [1] "Mean age of total population: 27.8002663266396"

mean<-round(mean, digits = 3)
print(paste('Mean age of total population (rounded):', mean))

## [1] "Mean age of total population (rounded): 27.8"

2.2 Provide the standard deviation for the whole data sample.

#Subtract the mean from every number and square the result
df1$num_minus_mean <-df1$age-mean
df1$sqr_result <-df1$num_minus_mean*df1$num_minus_mean

#Sum of Squared Differences divided by the Count
variance<-sum(df1$sqr_result)/sum(df1$population-1)

#Find the square root of the variance
print(paste('SD for the whole data sample:', sqrt(variance)))

## [1] "SD for the whole data sample: 4.44709795287088"

print(paste('SD for the whole data sample (rounded):', round(sqrt(variance), digits=3)))

## [1] "SD for the whole data sample (rounded): 4.447"

3 Record the following statistics on the means from each region:

3.1 mean

df2<- df
head(df2)

## # A tibble: 6 x 4
##   region     age gender population
##   <fct>    <dbl> <fct>       <dbl>
## 1 SSC21184     0 M             114
## 2 SSC21184     0 F              95
## 3 SSC21184     1 M              88
## 4 SSC21184     1 F             107
## 5 SSC21184     2 M             122
## 6 SSC21184     2 F             120

#Create variable "agepop" to multiply age by population
df2$agepop<-df2$age * df2$population
head(df2)

## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC21184     0 M             114      0
## 2 SSC21184     0 F              95      0
## 3 SSC21184     1 M              88     88
## 4 SSC21184     1 F             107    107
## 5 SSC21184     2 M             122    244
## 6 SSC21184     2 F             120    240

#Group region by total population and by age 
regionpop <-aggregate(population~region,df2,sum)
head(regionpop[order(regionpop$population),])

##      region population
## 10 SSC20099          3
## 14 SSC20127          3
## 17 SSC20151          3
## 51 SSC20346          3
## 62 SSC20383          3
## 66 SSC20398          3

regionagepop <-aggregate(agepop~region,df2,sum)
head(regionagepop[order(regionagepop$agepop),])

##       region agepop
## 384 SSC22193      6
## 164 SSC21026      9
## 51  SSC20346     18
## 17  SSC20151     21
## 71  SSC20422     27
## 83  SSC20503     33

#Join both created tables and divide agepop by pop to get mean by region (rounded to 2)
joined<-full_join(regionpop, regionagepop, by='region')
joined$mean<-joined$agepop/joined$population
joined$mean<-round(joined$mean, digits=1)
head(joined[order(joined$mean),])

##       region population agepop mean
## 384 SSC22193          3      6  2.0
## 164 SSC21026          3      9  3.0
## 201 SSC21211          6     33  5.5
## 282 SSC21701          6     33  5.5
## 51  SSC20346          3     18  6.0
## 17  SSC20151          3     21  7.0

#Find the mean of the region means
round(mean(joined$mean), digits=3)

## [1] 30.609

#Summary statistics
summary(joined$mean)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   27.40   29.20   30.61   33.33   55.00

3.2 Standard deviation of the region means

round(sd(joined$mean), digits=3)

## [1] 7.997

3.3 Minimum of the region means

min (joined$mean)

## [1] 2

3.4 First quartile of the region means

quantile(joined$mean, 0.25)

##  25% 
## 27.4

3.5 median

median(joined$mean)

## [1] 29.2

3.6 third quartile

quantile(joined$mean, 0.75)

##    75% 
## 33.325

3.7 maximum

max(joined$mean)

## [1] 55

3.8 interquartile range

IQR(joined$mean)

## [1] 5.925

3.9 histogram plot of the distribution of means from each region.

gg<-ggplot(data=joined, aes(x=mean)) + geom_histogram(binwidth=1,
   color="black", 
   fill="dodgerblue3",
   bins=56) +
   ggtitle("Mean Age Distribution by Region") +
   xlab("Mean Age") +
   geom_vline(aes(xintercept=mean(mean, na.rm=T)),                                         color="red", linetype="dashed", size=1)
gg

#4 Consider the region with the smallest population: ## 4.1 Identify the region that has the least people and how many it has. If there is more than one such region, please select one.

#sum each population and filter down to min population (3)
regionpop <-aggregate(population~region,df2,sum)
minpop<-min(regionpop$population)
head(regionpop%>%filter(population==minpop))

##     region population
## 1 SSC20099          3
## 2 SSC20127          3
## 3 SSC20151          3
## 4 SSC20346          3
## 5 SSC20383          3
## 6 SSC20398          3

5 Consider the region with the largest population:

5.1 Plot the distribution of ages for the region with the most people.

#Find region with largest population
regionpop <-aggregate(population~region,df2,sum)
maxpop<-max(regionpop$population)
maxpopregion<-regionpop%>%filter(population==maxpop)
maxpopregion

##     region population
## 1 SSC22015      37948

#Filter to region SSC22015
filtered<-df2%>%
  filter(region=="SSC22015")
head(filtered)

## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC22015     0 M             455      0
## 2 SSC22015     0 F             423      0
## 3 SSC22015     1 M             492    492
## 4 SSC22015     1 F             479    479
## 5 SSC22015     2 M             465    930
## 6 SSC22015     2 F             453    906

# Sum population
sumagepop <-aggregate(population~age,filtered,sum)
head(sumagepop)

##   age population
## 1   0        878
## 2   1        971
## 3   2        918
## 4   3        975
## 5   4        965
## 6   5        847

# plot region
plot(sumagepop)

5.2 Plot cumulative distribution for the region with the most people.

sumagepop$cdf <- cumsum(sumagepop$population)
plot(sumagepop$age, sumagepop$cdf)

5.3 Plot the cumulative distribution for males and females on the same plot.

sumgenderpop<-aggregate(population~gender+age, filtered, sum)
sumgenderpop$cdf <- cumsum(sumgenderpop$population)
qplot(sumgenderpop$age, sumgenderpop$cdf)

6 Provide an analysis of trends in age against region population:

6.1 Plot the ratio of old to young people using 40 years old as a cut off, i.e., ‘young’ is defined as age < 40 and ‘old’ as ≥ 40.

#create variables for old and young populations
region_old<-df2%>%group_by(region)%>%filter(age>=40)%>%summarise(old=sum(population))
head(region_old)

## # A tibble: 6 x 2
##   region     old
##   <fct>    <dbl>
## 1 SSC20005    17
## 2 SSC20012   178
## 3 SSC20018    50
## 4 SSC20027   371
## 5 SSC20029    12
## 6 SSC20048   316

region_young<-df2%>%group_by(region)%>%filter(age<40)%>%summarise(young=sum(population))
head(region_young)

## # A tibble: 6 x 2
##   region   young
##   <fct>    <dbl>
## 1 SSC20005    16
## 2 SSC20012   247
## 3 SSC20018    50
## 4 SSC20027   766
## 5 SSC20029    39
## 6 SSC20048   608

#concatenate the tables & work out the ratio
oldyoungjoin<-full_join(region_old, region_young, by='region')
oldyoungjoin$totalpop<-oldyoungjoin$old+oldyoungjoin$young
oldyoungjoin$ratio<-round(oldyoungjoin$old/oldyoungjoin$young, digits=3)
head(oldyoungjoin)

## # A tibble: 6 x 5
##   region     old young totalpop ratio
##   <fct>    <dbl> <dbl>    <dbl> <dbl>
## 1 SSC20005    17    16       33 1.06 
## 2 SSC20012   178   247      425 0.721
## 3 SSC20018    50    50      100 1    
## 4 SSC20027   371   766     1137 0.484
## 5 SSC20029    12    39       51 0.308
## 6 SSC20048   316   608      924 0.52

#plot the ratios by total pop
ggplot(oldyoungjoin, aes(x = totalpop, y = ratio)) +
  geom_point() +
  labs(
    title = "Ratio of Old to Young people by region",
    x = "Population",
    y = "Ratio")

6.2 Comment on any trends you see in the data. What could cause such trends?

#create variables for small old and young populations

smallregion_old<-df2%>%group_by(region)%>%filter(age>=40)%>%summarise(old=sum(population))
head(smallregion_old)

## # A tibble: 6 x 2
##   region     old
##   <fct>    <dbl>
## 1 SSC20005    17
## 2 SSC20012   178
## 3 SSC20018    50
## 4 SSC20027   371
## 5 SSC20029    12
## 6 SSC20048   316

smallregion_young<-df2%>%group_by(region)%>%filter(age<40)%>%summarise(young=sum(population))
head(smallregion_young)

## # A tibble: 6 x 2
##   region   young
##   <fct>    <dbl>
## 1 SSC20005    16
## 2 SSC20012   247
## 3 SSC20018    50
## 4 SSC20027   766
## 5 SSC20029    39
## 6 SSC20048   608

#concatenate the tables & work out the ratio
smallregionjoin<-full_join(smallregion_old, smallregion_young, by='region')
smallregionjoin$totalpop<-smallregionjoin$old+smallregionjoin$young

smallregiontotalyoung<-smallregionjoin%>%filter(totalpop<=50)%>%pull(young)%>%sum()
smallregiontotalold<-smallregionjoin%>%filter(totalpop<=50)%>%pull(old)%>%sum()
print(paste('The total population for regions with populations of 50 or less:', smallregiontotalyoung+smallregiontotalold))

## [1] "The total population for regions with populations of 50 or less: 3518"

#work out ratio
smallregionoldtoyoungratio<-round(smallregiontotalold/smallregiontotalyoung, digits=3)
print(paste('The total Old/Young ratio for regions with populations of 50 or less:', smallregionoldtoyoungratio))

## [1] "The total Old/Young ratio for regions with populations of 50 or less: 0.882"

7 Provide an analysis of trends in gender ratio against region population:

7.1 Plot the gender ratio as a function of the population of the region.

#create variables for male and female populations
region_male<-df2%>%group_by(region)%>%filter(gender=="M")%>%summarise(male=sum(population))
head(region_male)

## # A tibble: 6 x 2
##   region    male
##   <fct>    <dbl>
## 1 SSC20005    13
## 2 SSC20012   217
## 3 SSC20018    60
## 4 SSC20027   599
## 5 SSC20029    30
## 6 SSC20048   476

region_female<-df2%>%group_by(region)%>%filter(gender=="F")%>%summarise(female=sum(population))
head(region_female)

## # A tibble: 6 x 2
##   region   female
##   <fct>     <dbl>
## 1 SSC20005     20
## 2 SSC20012    208
## 3 SSC20018     40
## 4 SSC20027    538
## 5 SSC20029     21
## 6 SSC20048    448

#concatenate the tables & work out the ratio
MFjoin<-full_join(region_male, region_female, by='region')
MFjoin$totalpop<-MFjoin$male+MFjoin$female
MFjoin$ratio<-round(MFjoin$male/MFjoin$female, digits=3)
MFjoin<-MFjoin%>%filter(male>0 & female>0)
head(MFjoin)

## # A tibble: 6 x 5
##   region    male female totalpop ratio
##   <fct>    <dbl>  <dbl>    <dbl> <dbl>
## 1 SSC20005    13     20       33  0.65
## 2 SSC20012   217    208      425  1.04
## 3 SSC20018    60     40      100  1.5 
## 4 SSC20027   599    538     1137  1.11
## 5 SSC20029    30     21       51  1.43
## 6 SSC20048   476    448      924  1.06

#plot the ratios by totalpop
ggplot(MFjoin, aes(x = totalpop, y = ratio)) +
  geom_point() +
  labs(
    title = "Ratio of Males to Females by region",
    x = "Population",
    y = "Ratio")

7.2 Comment on any trends you see in the data. What could cause such trends?

#create variables for small male and female populations

smallregion_male<-df2%>%group_by(region)%>%filter(gender=="M")%>%summarise(male=sum(population))
smallregion_female<-df2%>%group_by(region)%>%filter(gender=="F")%>%summarise(female=sum(population))

#concatenate the tables and filter to small populations
smallregiongenderjoin<-full_join(smallregion_male, smallregion_female, by='region')
smallregiongenderjoin$totalpop<-smallregiongenderjoin$male+smallregiongenderjoin$female
smallregiontotalmale<-smallregiongenderjoin%>%filter(totalpop<=50)%>%pull(male)%>%sum()
smallregiontotalfemale<-smallregiongenderjoin%>%filter(totalpop<=50)%>%pull(female)%>%sum()
print(paste('The total population for regions with populations of 50 or less:', smallregiontotalmale+smallregiontotalfemale))

## [1] "The total population for regions with populations of 50 or less: 3518"

#work out ratio
smallregionmaletofemaleratio<-round(smallregiontotalmale/smallregiontotalfemale, digits=3)
print(paste('The total Male/Female ratio for regions with populations of 50 or less:', smallregionmaletofemaleratio))

## [1] "The total Male/Female ratio for regions with populations of 50 or less: 1.013"

8 Imagine you have enough resources for two events to launch a new product:

8.1 Select a gender and age group which spans 3 to 5 years. This will be the primary customers for your hypothetical product.

8.2 Which two regions would you start with and why?

I would choose SSC22015 and SSC22569 as they have the highest population of boys in this age group. Both regions also have a higher ratio of boys to girls.

#Filter to boys and girls 2 years and under
baby_boy<-df2%>%group_by(region)%>%filter(gender=="M", age<=3)%>%summarise(male=sum(population))
baby_boy<-baby_boy[order(-baby_boy$male),]

baby_girl<-df2%>%group_by(region)%>%filter(gender=="F", age<=3)%>%summarise(female=sum(population))

#join the tables, exclude any regions with 0 for either gender, sort descending

babies<-left_join(baby_boy, baby_girl, by='region')
babies$totalpop<-babies$male+babies$female
babies$ratio<-round(babies$male/babies$female, digits=3)
babies<-babies%>%filter(male>0 & female>0)
babies<-babies[order(-babies$male),]
head(babies,2)

## # A tibble: 2 x 5
##   region    male female totalpop ratio
##   <fct>    <dbl>  <dbl>    <dbl> <dbl>
## 1 SSC22015  1890   1852     3742  1.02
## 2 SSC22569  1165   1113     2278  1.05

9 Demonstrate the following:

9.1 The central limit theorem by selecting k = 100 samples of size n age values each from the cumulative distribution for the region with the most people (Q5.1).

9.2 Repeat this process for n = 1, 10, 100 and 1000 observations.

9.3 Plot the resulting distribution along with a normal distribution, using the mean and standard deviation for each sample size given in Q9.2

#Filter to region SSC22015
filtered<-df2%>%
  filter(region=="SSC22015")
head(filtered)

## # A tibble: 6 x 5
##   region     age gender population agepop
##   <fct>    <dbl> <fct>       <dbl>  <dbl>
## 1 SSC22015     0 M             455      0
## 2 SSC22015     0 F             423      0
## 3 SSC22015     1 M             492    492
## 4 SSC22015     1 F             479    479
## 5 SSC22015     2 M             465    930
## 6 SSC22015     2 F             453    906

#create a list of the (repeated in list by the population for each age) 
list<-rep(filtered$age, times = filtered$agepop)

# set sample size
n_sample <- 1000

# set number of times the samples will be summed
n_sum <- 100


# set the cumulative list to zero then loop and attach CLT results
clist<-NULL
for (i in seq(1, n_sum)){
  result<-mean(sample(list, n_sum))
  clist<-c(clist, result)
}
clist

##   [1] 35.90 36.87 35.16 35.70 35.45 33.23 34.43 36.25 35.23 35.72 34.31 35.43
##  [13] 32.72 35.47 36.32 34.71 33.04 35.69 34.90 36.84 32.36 34.11 33.96 34.92
##  [25] 32.93 34.74 36.35 35.49 33.63 35.59 35.74 37.13 34.64 34.98 35.12 36.15
##  [37] 35.26 34.99 33.64 34.81 34.70 35.25 33.71 34.90 33.24 35.62 35.53 36.61
##  [49] 35.25 34.39 37.40 35.72 35.00 33.28 35.28 35.90 35.31 35.53 33.69 35.60
##  [61] 35.74 37.52 33.68 36.18 38.25 35.22 34.99 34.90 36.60 35.54 34.06 37.76
##  [73] 36.14 35.54 36.50 34.50 35.11 35.68 32.83 35.75 35.45 34.28 33.78 35.89
##  [85] 36.46 36.75 36.74 36.44 33.76 33.97 33.45 34.90 33.62 33.89 35.96 34.64
##  [97] 35.65 36.15 34.67 35.52

add a normal distribution curve for comparison

mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  

title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Age', y = 'Frequency', title = title_txt)
gg

MATH2406 Assessment 1

R Glover

16/05/2021