Introduction

# Include all the libraries here
library(ggplot2) # for visualisations
library(dplyr) # for data wrangling including piping

Question 1

# read the data 
pop_data <- read.csv("pop_dataset_0002.txt")

# 1.1 Print out the dimensions of the data frame. 
print(dim(pop_data))

## [1] 56000     4

# 1.2 Print out the names and type of each of the data frame’s columns. 
print(str(pop_data))

## 'data.frame':    56000 obs. of  4 variables:
##  $ region    : chr  "SSC21184" "SSC21184" "SSC21184" "SSC21184" ...
##  $ age       : int  0 0 1 1 2 2 3 3 4 4 ...
##  $ gender    : chr  "M" "F" "M" "F" ...
##  $ population: int  114 95 88 107 122 120 123 125 114 117 ...
## NULL

# 1.3 Print out the number of unique regions in the dataset. 
print(length(unique(pop_data[["region"]])))

## [1] 500

# 1.4 What is the minimum age bin? 
min(pop_data$age) # min Age bin is 0-1 years

## [1] 0

# 1.5 What is the maximum age bin? 
max(pop_data$age) #max Age bin is 55-56 years

## [1] 55

# 1.6 What is the bin size for the age field? 
# bin size is 1

Question 2.

# 2.1 Use the expected value for the age to find the mean age for the whole data sample. 
# first create new variables for population and age
pop_data$aaa <- (pop_data$population/sum(pop_data$population)) 
pop_data$bbb <- (pop_data$aaa*pop_data$age)
# calculate mean by using the sum of the 2nd new column
mean1 <- sum(pop_data$bbb)
print(paste('Mean age using expected value =', mean1))

## [1] "Mean age using expected value = 27.8002663266396"

# 2.2 Provide the standard deviation for the whole data sample. 
# first create new variables in line with the sd formula
pop_data$ccc <- (pop_data$age-mean1)
pop_data$ddd <- ((pop_data$ccc)^2)
# Calc SD
var1 <- sum(pop_data$aaa * (pop_data$ddd))
sd1 <- sqrt(var1)
print(paste('Standard deviation of data set is', sd1))

## [1] "Standard deviation of data set is 15.7780380945597"

Question 3.

## 3.1 mean 
#Filter the dataset to specific regions
region_unique <- unique(pop_data$region)
# Calculate the means for each region in a new list
region_means <- rep(0,500)
for(x in 1:length(region_unique))
{
region_x<- filter(pop_data, region == region_unique[x])
region_x$p1 <- (region_x$population/sum(region_x$population))
region_x$p2 <- (region_x$p1*region_x$age)
region_x_mu <- sum(region_x$p2)
region_means[x] <- region_x_mu
}
# Produce summary stats
summary(region_means)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   27.43   29.23   30.61   33.35   55.00

## 3.2 standard deviation 
print(paste('Standard deviation of region means =',sd(region_means)))

## [1] "Standard deviation of region means = 7.99617869157122"

## 3.3 minimum 
min <- 2
## 3.4 first quartile 
Q1 <- 27.43
## 3.5 median
median <- 29.23
## 3.6 third quartile 
Q3 <- 33.35
## 3.7 maximum 
max <- 55.00
## 3.8 interquartile range 
IQR <- Q3 - Q1 # = 33.35 - 27.43 = 5.92
print(paste('Interquartile range =',IQR))

## [1] "Interquartile range = 5.92"

#3.9 histogram plot of the distribution of means from each region 
hist(region_means, main="Region Means", xlab = 'Age', )

Question 4:

#4.1 Show which region has the least people and how many it has. 
#assuming this means regions with 0 people are not included
#first sum population for each age group
min_pop_df <- aggregate(pop_data$population, by=list(Category=pop_data$region), FUN=sum)
# use min function to find smallest region
min_pop_df[which.min(min_pop_df$x),]

##    Category x
## 10 SSC20099 3

Question 5:

# 5.1 Plot the distribution of ages for the region with the most people.
# first find largest region
popregion_size <- pop_data %>%group_by(region)%>% summarise(pop=sum(population))
popregion_size %>% arrange(desc(pop))

## # A tibble: 500 x 2
##    region     pop
##    <chr>    <int>
##  1 SSC22015 37948
##  2 SSC21671 22979
##  3 SSC21125 20939
##  4 SSC20911 19340
##  5 SSC22569 19274
##  6 SSC21143 19180
##  7 SSC20773 18540
##  8 SSC22556 17928
##  9 SSC20865 17809
## 10 SSC20660 17442
## # ... with 490 more rows

# Filter for SSC22015
most_region <- filter(pop_data, region =="SSC22015")
# Group by age and sum population
most_region_plot <- most_region %>% group_by(age) %>% summarise(pop=sum(population))
# Create distribution with ggplot
gg <- ggplot(data=most_region_plot)
gg <- gg + geom_point(aes(x=age,y=pop))
gg <- gg + labs(x='Age in Region SSC22015', y='Population density')
gg

# 5.2 Plot cumulative distribution for the regions with the most people.
most_region_cum <- data.frame(x=sort(most_region$population))
most_region_cum$counts <- 1
most_region_cum$Fx <- cumsum(most_region_cum$counts)/sum(most_region_cum$counts)
gg <- ggplot()
gg <- gg + geom_point(aes(x=most_region_cum$x, y=most_region_cum$Fx))
gg <- gg + labs(x='Population', y='Cumulative distribution function')
gg

# 5.3 Plot the cumulative distribution for males and females on the same plot.
# Filter for males
most_region_m <- filter(most_region, gender=='M')
most_region_m$cumpop_m <- cumsum(most_region_m$population)
# filter for females
most_region_f <- filter(most_region, gender=='F')
most_region_f$cumpop_f <- cumsum(most_region_f$population)
# Join to create new table
most_region_genders <- inner_join(most_region_m, most_region_f, by='age')
# Plot using ggplot
gg <- ggplot()
gg <- gg + geom_line(aes(x=most_region_genders$age, y=most_region_genders$cumpop_m, color='blue')) +
  geom_line(aes(x=most_region_genders$age, y=most_region_genders$cumpop_f, color='red')) + xlab('Age') + ylab('Cumulative Population') + scale_color_manual(labels = c("Male","Female"), values=c('blue','red'))
gg

# Question 6:

# 6.1 Plot the ratio of old to young people using 40 years old as a cut off, i.e. young is defined as age < 40 and old as >= 40. 
# filter data according to age cut offs for old and young
old_pop <- filter(pop_data, age >=40)
young_pop <- filter(pop_data, age <40)
# group by region and sum populations together
old_pop_region <- old_pop %>% group_by(region) %>% summarise(popo=sum(population))
young_pop_region <- young_pop %>% group_by(region) %>% summarise(popy=sum(population))
# join new lists
old_young_ratio <- inner_join(old_pop_region, young_pop_region, by='region')
# create ratio variables
old_young_ratio$ratio <- (old_young_ratio$popo / old_young_ratio$popy)
old_young_ratio$total <- (old_young_ratio$popo + old_young_ratio$popy)
# create plot
old_young_plot <- ggplot(old_young_ratio, aes(x=total)) +
  geom_point(aes(y=ratio)) + xlab('Region Population')+ylab('Old to Young Ratio')
old_young_plot

6.2 Comments: From the chart above: It shows that regions with a lower population tend to have a higher old to young population ratio. In other words, when examining a region with a smaller population, this region is more likely to have more people over the age of 40 than young people under the age of 40. It also makes sense then that larger populated regions have more younger people as population growth to achieve such size would have to be a lot greater resulting in more young people.

Question 7:

# 7.1 Plot the gender ratio as a function of the population of the region. 
# Filter by gender
male_pop <- filter(pop_data, gender=='M')
female_pop <- filter(pop_data, gender =='F')
# sum each gender by region
male_pop_sum <- male_pop %>% group_by(region) %>% summarise(popm=sum(population))
female_pop_sum <- female_pop %>% group_by(region) %>% summarise(popf=sum(population))
# Join new tables
male_female_pop <- inner_join(male_pop_sum,female_pop_sum,by='region')
# Calc ratio and total for plotting
male_female_pop$ratio <- (male_female_pop$popm / male_female_pop$popf)
male_female_pop$total <- (male_female_pop$popm + male_female_pop$popf)
male_female_plot <- ggplot(male_female_pop, aes(x=total)) + geom_point(aes(y=ratio)) + xlab("Region Population") + ylab("Male:Female Ratio") + ggtitle("Gender ratio as a function of population by region")
male_female_plot

7.2 Comments From the graphic above, it appears that there is a higher concentration of males to females on average, and that this effect is more prominent in regions with smaller population sizes given the density of points above 1 on the lower end of the x axis.

Question 8:

Gender = Male Age group = 31 to 34

# Filter for males age 31 to 34
launch_product <- filter(pop_data, age<35)
launch_product <- filter(launch_product, age>30)
launch_product <- filter(launch_product,gender=='M')

# Calc which regions have highest males from 31 to 34
# Clean this table a bit by subsetting
launch_product <- subset(launch_product, select=c(region,age,gender,population))

# Group by region
launch_product_region <-launch_product %>% group_by(region) %>% summarise(pop=sum(population))

# Sort with largest region at top
arrange(launch_product_region, .by_group=TRUE, desc(pop))

## # A tibble: 500 x 2
##    region     pop
##    <chr>    <int>
##  1 SSC22015  1648
##  2 SSC22569  1497
##  3 SSC21143  1024
##  4 SSC21040  1014
##  5 SSC22106   847
##  6 SSC21125   843
##  7 SSC20773   805
##  8 SSC20660   782
##  9 SSC21671   781
## 10 SSC20911   730
## # ... with 490 more rows

# tibble shows these two regions at the top SSC22015 (1,648) and SSC22569 (1,497)

8.2 I launch a product aimed at Males from 31 to 34 years old and I target regions with the highest number of people in this targeted demographic. These regions are SSC22015 (1,648) and SSC22569 (1,497).

Question 9.

# 9.1 The central limit theorem by selecting k = 100 samples of size n age values each from the cumulative distribution for the region with the most people.
# filter to region with most people
most_region <- filter(pop_data, region =="SSC22015")
#n=1000
n_sample <- 1000
n_sum <- 100
# set the cumulative list to zero then loop and attach CLT results
clist <- NULL
for(i in seq(1,n_sum)){
  result <- mean(sample(most_region$population, n_sum))
  clist<-c(clist,result)
}
# add a normal distribution curve for comparison
mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)
title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Population', y = 'Age', title = title_txt)
gg

# 9.2 Repeat this process for n = 1, 10, 100 and 1000 observations.
#n=1
n_sample <- 1
n_sum <- 100
# set the cumulative list to zero then loop and attach CLT results
clist <- NULL
for(i in seq(1,n_sum)){
  result <- mean(sample(most_region$population, n_sum))
  clist<-c(clist,result)
}
# add a normal distribution curve for comparison
mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)
title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Population', y = 'Age', title = title_txt)
gg

#n=10
n_sample <- 10
n_sum <- 100
# set the cumulative list to zero then loop and attach CLT results
clist <- NULL
for(i in seq(1,n_sum)){
  result <- mean(sample(most_region$population, n_sum))
  clist<-c(clist,result)
}
# add a normal distribution curve for comparison
mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)
title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Population', y = 'Age', title = title_txt)
gg

#n=100
n_sample <- 100
n_sum <- 100
# set the cumulative list to zero then loop and attach CLT results
clist <- NULL
for(i in seq(1,n_sum)){
  result <- mean(sample(most_region$population, n_sum))
  clist<-c(clist,result)
}
# add a normal distribution curve for comparison
mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)

title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)

gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth, 
                         color = "darkred", size = 1)
gg <- gg + labs(x = 'Population', y = 'Age', title = title_txt)
gg

Assessment 1

Heinzy Johan Bernt s3923824