# Include all the libraries here
library(readr)
library(knitr)
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(ggplot2)
Data Inspection
# read the data
data <- read.table("pop_dataset_0002.txt", sep = ",", header = TRUE)
head(data)
## region age gender population
## 1 SSC21184 0 M 114
## 2 SSC21184 0 F 95
## 3 SSC21184 1 M 88
## 4 SSC21184 1 F 107
## 5 SSC21184 2 M 122
## 6 SSC21184 2 F 120
# 1.1 Dimensions of the data frame.
print(paste0("Dimensions of the data frame: ", nrow(data), " X ",ncol(data)))
## [1] "Dimensions of the data frame: 56000 X 4"
# 1.2 Names and type of each of the data frame’s columns.
print(paste0("Name and type of the 1st column: ", colnames(data)[1], " - ",typeof(data$region)))
## [1] "Name and type of the 1st column: region - character"
print(paste0("Name and type of the 1st column: ", colnames(data)[2], " - ",typeof(data$age)))
## [1] "Name and type of the 1st column: age - integer"
print(paste0("Name and type of the 1st column: ", colnames(data)[3], " - ",typeof(data$gender)))
## [1] "Name and type of the 1st column: gender - character"
print(paste0("Name and type of the 1st column: ", colnames(data)[4], " - ",typeof(data$population)))
## [1] "Name and type of the 1st column: population - integer"
# 1.3 The number of unique regions in the dataset.
print(paste0("The number of unique regions: ", length(unique(data$region))))
## [1] "The number of unique regions: 500"
# 1.4 The minimum age bin.
print(paste0("The minimum age bin: ", min(data$age)))
## [1] "The minimum age bin: 0"
# 1.5 The maximum age bin.
print(paste0("The minimum age bin: ", max(data$age)))
## [1] "The minimum age bin: 55"
# 1.6 The bin size for the age field.
print(paste0("The bin size of the age column: ", length(unique(data$age))))
## [1] "The bin size of the age column: 56"
Descriptive Statistics Analysis
# 2.1 Use the expected value for the age to find the mean age for the whole data sample.
data_q2 <- data %>% group_by(age) %>% summarise(collective_pop = sum(population))
data_q2_prob <- data_q2 %>% mutate(p_age = collective_pop / sum(collective_pop))
mu_age <- sum(data_q2_prob$age *data_q2_prob$p_age)
print(paste0("The mean age for the entire dataset: ", mu_age))
## [1] "The mean age for the entire dataset: 27.8002663266396"
# 2.2 Provide the standard deviation for the whole data sample.
var <- sum((data_q2_prob$p_age) * ((data_q2_prob$age - mu_age)^2))
st_dev <- sqrt(var)
print(paste0("The standard deviation for the entire dataset: ", st_dev))
## [1] "The standard deviation for the entire dataset: 15.7780380945597"
Statistics on the Regions’ Means
data_q3 <- data %>% group_by(region, age) %>% summarise(collective_pop = sum(population))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
data_q3_prob <- data_q3 %>% mutate(p_age = collective_pop / sum(collective_pop))
data_q3_mu <- data_q3_prob %>% group_by(region) %>% summarise(mean_age_region = sum(age * p_age))
## 3.1 mean
mean_mean_age_reg <- mean(data_q3_mu$mean_age_region)
print(paste0("The mean of the means from each region: ", mean_mean_age_reg))
## [1] "The mean of the means from each region: 30.6084587602067"
## 3.2 standard deviation
st_dev_mean_age_reg <- sd(data_q3_mu$mean_age_region)
print(paste0("The standard deviation of the means from each region: ", st_dev_mean_age_reg))
## [1] "The standard deviation of the means from each region: 7.99617869157122"
## 3.3 minimum
min_mean_age_reg <- min(data_q3_mu$mean_age_region)
print(paste0("The minimum of the means from each region: ", min_mean_age_reg))
## [1] "The minimum of the means from each region: 2"
## 3.4 first quartile
first_quart_mean_age_reg <- quantile(data_q3_mu$mean_age_region, 0.25)
print(paste0("The 1st quartile of the means from each region: ", first_quart_mean_age_reg))
## [1] "The 1st quartile of the means from each region: 27.4257826660491"
## 3.5 median
median_mean_age_reg_b <- quantile(data_q3_mu$mean_age_region, 0.5)
print(paste0("The median of the means from each region: ", median_mean_age_reg_b))
## [1] "The median of the means from each region: 29.231583768481"
## 3.6 third quartile
third_quart_mean_age_reg <- quantile(data_q3_mu$mean_age_region, 0.75)
print(paste0("The 3rd quartile of the means from each region: ", third_quart_mean_age_reg))
## [1] "The 3rd quartile of the means from each region: 33.3501326259947"
## 3.7 maximum
max_mean_age_reg <- max(data_q3_mu$mean_age_region)
print(paste0("The maximum of the means from each region: ", max_mean_age_reg))
## [1] "The maximum of the means from each region: 55"
## 3.8 interquartile range
iqr_mean_age_reg = IQR(data_q3_mu$mean_age_region)
print(paste0("The interquartile range of the means from each region: ", iqr_mean_age_reg))
## [1] "The interquartile range of the means from each region: 5.92434995994564"
## 3.9 histogram plot of the distribution of means from each region
qplot(data_q3_mu$mean_age_region,
geom="histogram",
binwidth=2,
fill=I("blue"),
col=I("red"),
main="Histogram for Mean of Age per Region",
xlab="Mean of Age",
ylab="Frequency")
Consider the region with the smallest population:
# 4.1 Show which region has the least people and how many it has.
data_q4 <- data %>% group_by(region) %>% summarise(tot_pop = sum(population))
min_tot_pop <- min(data_q4$tot_pop)
lowest_pop_reg <- data_q4 %>% filter(tot_pop == min_tot_pop)
lowest_pop_reg[1,1] # region with the least population
## # A tibble: 1 × 1
## region
## <chr>
## 1 SSC20099
lowest_pop_reg[1,2] # corresponding population
## # A tibble: 1 × 1
## tot_pop
## <int>
## 1 3
print(paste0("The region with the least population: ", lowest_pop_reg[1,1], " With the population of: ", lowest_pop_reg[1,2]))
## [1] "The region with the least population: SSC20099 With the population of: 3"
Consider the region with the largest population:
max_tot_pop <- max(data_q4$tot_pop)
highest_pop_reg <- data_q4 %>% filter(tot_pop == max_tot_pop)
highest_pop_reg[1,1] # region with the most population
## # A tibble: 1 × 1
## region
## <chr>
## 1 SSC22015
highest_pop_reg[1,2] # corresponding population
## # A tibble: 1 × 1
## tot_pop
## <int>
## 1 37948
print(paste0("The region with the highest population: ", highest_pop_reg[1,1], " With the population of: ", highest_pop_reg[1,2]))
## [1] "The region with the highest population: SSC22015 With the population of: 37948"
# 5.1 Plot the distribution of ages for the region with the most people.
data_q5 <- data %>% filter(region == "SSC22015")
data_q5_age_dist <- data_q5 %>% group_by(region, age) %>% summarise(collec_pop = sum(population))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
data_q5_age_prob <- data_q5_age_dist %>% mutate(age_prob = collec_pop / sum(collec_pop))
ggplot(data_q5_age_prob, aes(x = age, y = collec_pop)) +
labs(title = "Distribution of Age for the Most Populated Region",
x = "Age", y = "Population") +
geom_point()
ggplot(data_q5_age_prob, aes(x = age, y = age_prob)) +
labs(title = "Distribution of Age for the Most Populated Region",
x = "Age", y = "Probability") +
geom_line()
# 5.2 Plot cumulative distribution for the regions with the most people.
data_q5_age_cdf <- data_q5_age_prob %>% mutate(age_cdf = cumsum(age_prob))
ggplot(data_q5_age_cdf, aes(x = age, y = age_cdf)) +
labs(title = "Cumulative Distribution of Age for the Most Populated Region",
x = "Age", y = "CDF Probability") +
geom_point()
ggplot(data_q5_age_cdf, aes(x = age, y = age_cdf)) +
labs(title = "Cumulative Distribution of Age for the Most Populated Region",
x = "Age", y = "CDF Probability") +
geom_line()
# 5.3 Plot the cumulative distribution for males and females on the same plot.
data_q5_male <- data %>% filter(region == "SSC22015" & gender == "M") %>%
mutate(age_prob = population / sum(population), age_cdf_male = cumsum(age_prob))
data_q5_female <- data %>% filter(region == "SSC22015" & gender == "F") %>%
mutate(age_prob = population / sum(population), age_cdf_female = cumsum(age_prob))
data_q5_male_mod <- data_q5_male %>% select(2,5)
data_q5_female_mod <- data_q5_female %>% select(2,5)
data_q5_combine <- data_q5_male_mod %>% inner_join(data_q5_female_mod, by = "age", suffix = c("_male", "_female"))
data_q5_age_combine_cdf <- data_q5_combine %>%
mutate(age_male_cdf = cumsum(age_prob_male), age_female_cdf = cumsum(age_prob_female))
data_q5_combine_pivot <- data_q5_age_combine_cdf %>% select(-(2:3)) %>% rename(male = age_male_cdf, female = age_female_cdf) %>%
pivot_longer(cols = 2:3, names_to = "gender", values_to = "cdf")
ggplot(data_q5_combine_pivot, aes(x = age, y = cdf)) +
geom_line(aes(color = gender, linetype = gender)) +
scale_color_manual(values = c("darkred", "steelblue"))
An Analysis of Trends in Age Against Regional Population:
# 6.1 Plot the ratio of old to young people using 40 years old as a cut off, i.e. young is defined as age < 40 and old as >= 40.
data_old <- data %>% group_by(region) %>% filter(age >= 40)
data_old_mod <- data_old %>% group_by(region) %>% summarise(collec_old_pop = sum(population))
data_young <- data %>% group_by(region) %>% filter(age < 40)
data_young_mod <- data_young %>% group_by(region) %>% summarise(collec_young_pop = sum(population))
data_old_young <- data_old_mod %>% inner_join(data_young_mod, by = "region")
data_age_ratio <- data_old_young %>% mutate(tot_pop = collec_old_pop + collec_young_pop,
age_ratio = collec_old_pop / collec_young_pop) # gives "Inf"
### Omit observations with "Inf"
data_age_ratio_NoInf <- data_age_ratio[!is.infinite(data_age_ratio$age_ratio), ]
data_age_ratio_NoInf %<>% filter(tot_pop >= 100)
vis_age_ratio <- ggplot(data_age_ratio_NoInf, aes(x = age_ratio, y = tot_pop)) +
labs(title = "Distribution of Age Ratio against Region Population",
x = "Old to Young Age Ratio", y = "Region Population") +
geom_point()
vis_age_ratio + scale_x_continuous(trans='log10') +
scale_y_continuous(trans='log10')
What could cause such trends?
[6.2: By dividing the population spectrum into two areas including 100 to 1000 people and 1000 to 10000 people, trends displayed in Figure 5 become more easily identifiable. As population increases from 100 to 1000, the age ratio drastically decreases. This can be due to the fact that vast majority of the regions with less 1000 residents are small localities that are mostly suitable for older age lifestyles. As a result, as expected most of younger people would emigrate to more populated regions with better infrastructure and facilities. On the other hand, with growth of population from 1000 to 10000, age ratio declines less drastically. This gradual drop in age ratio can be caused by the somewhat balanced mixture of the youth and the elderly that is seen in more populated and modern areas nowadays. The age ratio and population seem to be conversely related. This inverse relationship becomes clearer by analysing the limits of spectrum which shows that maximum age ratios exist at the very least populated areas, and the minimum age ratios occur in the most populated areas.]
An Analysis of Trends in Gender Ratio Against Region Population:
# 7.1 Plot the gender ratio as a function of the population of the region.
data_male <- data %>% group_by(region) %>% filter(gender == "M")
data_male_mod <- data_male %>% group_by(region) %>% summarise(collec_male_pop = sum(population))
data_female <- data %>% group_by(region) %>% filter(gender == "F")
data_female_mod <- data_female %>% group_by(region) %>% summarise(collec_female_pop = sum(population))
data_male_female <- data_male_mod %>% inner_join(data_female_mod, by = "region")
data_gender_ratio <- data_male_female %>% mutate(tot_pop = collec_male_pop + collec_female_pop,
gender_ratio = collec_male_pop / collec_female_pop) # gives "Inf"
### Omit observations with "Inf"
data_gender_ratio_Inf <- data_gender_ratio[!is.infinite(data_gender_ratio$gender_ratio), ]
data_gender_ratio_Inf %<>% filter(tot_pop >= 100)
vis_gender_ratio <- ggplot(data_gender_ratio_Inf, aes(x = gender_ratio, y = tot_pop)) +
labs(title = "Distribution of Gender Ratio against Region Population",
x = "Male to Female Age Ratio", y = "Region Population") +
geom_point()
vis_gender_ratio + scale_x_continuous(trans='log10') +
scale_y_continuous(trans='log10')
What could cause such trends?
[7.2: The plot in Figure 6 is displaying a perfect bell curve (normal distribution) on how gender ratio and region population are correlated. To deeper analyse the plot, population is split into two zones, lower than 1000 people (bottom area) and higher than 1000 people (top area). In the top area which relates to the more populated regions, gender ratio appears to stay very close to 1. A gender ratio of 1 implies a 50-50 split of males and females in the region. This is expected to a certain extent as when size of the sample grows, it becomes more diverse in terms of items that in contains. However, as the population decreases, gender ratio deviates from a perfect split of male and female. This is also, not unexpected because when the are less people in the population, each one of them take a bigger part of the sample and be it male or female leave a greater impact on the value of gender ratio.]
Imagine you have enough resources for two events to launch a new product:
8.1 Select a gender and age group which spans 3 to 5 years. This will be the primary customers for your hypothetical product.
[8.1: The product I plan to launch is an after shave cream. And the target group for this product are 31 to 35 years old men.]
8.2 Which two regions would you start with and why?
# 8.1 & 8.2
data_q8 <- data %>% group_by(region) %>% summarise(tot_pop = sum(population)) %>%
filter(tot_pop >= 10000)
data_q8_pop_age_ratio <- left_join(data_q8, data_age_ratio_NoInf, by = "region") %>%
select(-(3:5))
data_q8_pop_age_gender_ratio <- left_join(data_q8_pop_age_ratio, data_gender_ratio_Inf,
by = "region") %>% select(-(4:6)) %>%
rename(tot_pop = tot_pop.x)
data_q8_age_bottom10 <- data_q8_pop_age_gender_ratio %>%
arrange((data_q8_pop_age_gender_ratio$age_ratio)) %>%
slice(1:10)
data_q8_gender_top2 <- data_q8_age_bottom10 %>% arrange(desc(data_q8_age_bottom10$gender_ratio)) %>%
slice(1:2)
print(paste0("The 1st selected region: ", data_q8_gender_top2[1,1], " And the 2nd selected region: ", data_q8_gender_top2[2,1]))
## [1] "The 1st selected region: SSC22569 And the 2nd selected region: SSC21040"
[8.2: Since the product is developed for young men, higher population, lower age ratio, and higher gender ratio are the factors that can form the logic behind our selection of regions. Only populations higher than 10,000 are considered as eligible for the product launch due to marketing, promotion, and logistics purposes. In relation to the age and gender ratios, selection will be based on comparison of values. In order to achieve this, regions with populations less than 10000 were excluded from the data. The resulting data frame then was joined with age ratio data set and gender ratio data set from the previous sections to create a comprehensive data set that contains data on region, population, age ratio, and gender ratio at the same time. As the second criterion refers to lower age ratios (to ensure the dominance of the youth in the population), 10 regions with the least amounts of age ratios are selected. Finally, from those 10 regions, only 2 with the highest values of gender ratios are chosen. The first and the second selected regions respectively are SSC22569 and SSC21040.]
In this report, data on population demographic of Australian regions was wrangled with and analysed. Our analysis began with general analytical concepts such as expected value, weight and probability, and analysis of means of regions. After that, emphasis was placed on the least and the most populated regions in the data which led to interesting conclusions with regards to gender distribution among various age groups. Then the correlations of age ratio and gender ratio with regional populations were investigated. Next, target customer group for a new product launch was selected based on the analysis of population demographic. Eventually, central limit theorem of cumulative distribution function of age was calculated and demonstrated for the data set.
Data Novia n.d., How to Create a GGplot with Multiple Lines, Data Novia, viewed 30 January 2022, < https://www.datanovia.com/en/blog/how-to-create-a-ggplot-with-multiple-lines/>.