# Include all the libraries here
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.2 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## Warning: package 'readr' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr)
Using the correct code, check the data by completing the following steps and answer each question:
# read the data
pop_dataset <- read_csv("C:/Users/61423/Downloads/pop_dataset.csv",col_types = cols(gender = col_factor(levels = c("M","F"))))
df <- pop_dataset
# 1.1 Print out the dimensions of the data frame.
q1.1 <- dim(df)
print(q1.1)
## [1] 56000 4
# 1.2 Print out the names and type of each of the data frame’s columns.
q1.2 <- colnames(df)
print(q1.2)
## [1] "region" "age" "gender" "population"
# 1.3 Print out the number of unique regions in the dataset.
q1.3 <- length(unique(df$region))
print(q1.3)
## [1] 500
# 1.4 What is the minimum age bin?
q1.4 <- min(df$age)
print(q1.4)
## [1] 0
# 1.5 What is the maximum age bin?
q1.5 <- max(df$age)
print(q1.5)
## [1] 55
# 1.6 What is the bin size for the age field?
The bin size for the age field is 1. Each bin represents 1 age bin
Perform the following data analysis by looking at some descriptive statistics on the complete data set and answer both questions:
# 2.1 Use the expected value for the age to find the mean age for the whole data sample.
df$pxi <- (df$population/sum(df$population))
df$pxixi <- (df$pxi*df$age)
q2.1 <- sum(df$pxixi)
print(q2.1)
## [1] 27.80027
# 2.2 Provide the standard deviation for the whole data sample.
df$ximinusmu <- (df$age-q2.1)
df$ximinusmusqrd <- ((df$ximinusmu)^2)
q2.2var <- var1_from_mean <- sum(df$pxi * (df$ximinusmusqrd))
q2.2std <- sqrt(q2.2var)
print(q2.2var)
## [1] 248.9465
print(q2.2std)
## [1] 15.77804
Record the following statistics on the means from each region:
region_list <- unique(df$region)
regions_vector <- rep(0,500)
for(x in 1:length(region_list))
{
region_x<- filter(df, region == region_list[x])
region_x$pxi <- (region_x$population/sum(region_x$population))
region_x$pxixi <- (region_x$pxi*region_x$age)
region_x_mu <- sum(region_x$pxixi)
regions_vector[x] <- region_x_mu
}
## 3.1 mean
q3.1 <- mean(regions_vector)
print(q3.1)
## [1] 30.60846
## 3.2 standard deviation
q3.2 <- sd(regions_vector)
print(q3.2)
## [1] 7.996179
## 3.3 minimum
q3.3 <- min(regions_vector)
print(q3.3)
## [1] 2
## 3.4 first quartile
q3.4 <- quantile(regions_vector,0.25)
print(q3.4)
## 25%
## 27.42578
## 3.5 median
q3.5 <- median(regions_vector)
print(q3.5)
## [1] 29.23158
## 3.6 third quartile
q3.6 <- quantile(regions_vector,0.75)
print(q3.6)
## 75%
## 33.35013
## 3.7 maximum
q3.7 <- max(regions_vector)
print(q3.7)
## [1] 55
## 3.8 interquartile range
q3.8 <- IQR(regions_vector)
print(q3.8)
## [1] 5.92435
## 3.9 histogram plot of the distribution of means from each region
# histogram with added parameters
print(hist(regions_vector,
breaks=20,
main="Region Means",
xlab="Age",
xlim=c(0,60),
ylim=c(0,160),
))
## $breaks
## [1] 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50
## [26] 52 54 56
##
## $counts
## [1] 2 3 3 2 3 2 3 5 3 11 15 32 79 131 55 39 25 18 14
## [20] 12 14 3 6 4 5 6 5
##
## $density
## [1] 0.002 0.003 0.003 0.002 0.003 0.002 0.003 0.005 0.003 0.011 0.015 0.032
## [13] 0.079 0.131 0.055 0.039 0.025 0.018 0.014 0.012 0.014 0.003 0.006 0.004
## [25] 0.005 0.006 0.005
##
## $mids
## [1] 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51
## [26] 53 55
##
## $xname
## [1] "regions_vector"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
Consider the region with the smallest population:
#4.1 Show which region has the least people and how many it has.
popsize_region_df <- df %>%group_by(region)%>%
summarise(pop=sum(population))
q4.1 <- length(which(popsize_region_df$pop == 3))
print(q4.1)
## [1] 35
Consider the region with the largest population:
# 5.2 Plot cumulative distribution for the regions with the most people.
popsize_region_df <- df %>%
group_by(region) %>%
summarise(pop=sum(population)) %>%
arrange(desc(pop))
biggest_region_plot_df <- filter(df, region == "SSC22015")
biggest_region_subset_df <- subset(biggest_region_plot_df, select = -c(pxi,pxixi,ximinusmu,ximinusmusqrd))
q5.2 <- biggest_region_subset_df %>% group_by(age) %>% select(population) %>% summarise_each(funs(sum))
## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## Adding missing grouping variables: `age`
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
q5.2$cumulativepop <- cumsum(q5.2)
print(plot(q5.2$cumulativepop))
## NULL
# 5.3 Plot the cumulative distribution for males and females on the same plot.
biggest_male_df <- filter(biggest_region_subset_df, gender == "M")
biggest_male_df <- subset(biggest_male_df , select = -c(gender,region))
biggest_male_df$cumulativepop_male <- cumsum(biggest_male_df$population)
biggest_male_df <- rename(biggest_male_df, c("male_population"="population"))
biggest_female_df <- filter(biggest_region_subset_df, gender == "F")
biggest_female_df <- subset(biggest_female_df , select = -c(gender,region))
biggest_female_df$cumulativepop_female <- cumsum(biggest_female_df$population)
biggest_female_df <- rename(biggest_female_df, c("female_population"="population"))
biggest_region_gender_df <- inner_join(biggest_female_df,biggest_male_df,by="age")
biggest_region_cumpop_df <- subset(biggest_region_gender_df, select = -c(male_population,female_population))
q5.3 <- ggplot(biggest_region_cumpop_df,aes(x=age)) +
geom_line(aes(x=age,y = cumulativepop_female), color = "red") +
geom_line(aes(x=age,y = cumulativepop_male), color = "blue") +
xlab("Age") +
ylab("Cumulative Population")+
ggtitle("Male and Female Cumulative Populations")+
scale_x_continuous(breaks = c(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56))+
theme(legend.position = "bottom")
print(q5.3)
Provide an analysis of trends in age against region population:
# 6.1 Plot the ratio of old to young people using 40 years old as a cut off, i.e. young is defined as age < 40 and old as >= 40.
old_df <- filter(pop_dataset, age >= 40)
young_df <- filter(pop_dataset, age < 40)
oldsummary_df <- old_df %>%group_by(region)%>% summarise(old_population=sum(population))
youngsummary_df <- young_df %>%group_by(region)%>% summarise(young_population=sum(population))
oldYoung_ratio_df <- inner_join(oldsummary_df,youngsummary_df,by="region")
oldYoung_ratio_df$old_young_ratio <- (oldYoung_ratio_df$old_population/oldYoung_ratio_df$young_population)
oldYoung_ratio_df$total_pop <- (oldYoung_ratio_df$old_population+oldYoung_ratio_df$young_population)
oldYoung_ratio_byregion_df <- subset(oldYoung_ratio_df , select = -c(old_population,young_population))
q6.1 <- ggplot(oldYoung_ratio_byregion_df,aes(x=total_pop)) +
geom_point(aes(y = old_young_ratio), color = "pink") +
xlab("Total Population") +
ylab("Old/Young ratio")+
ggtitle("Trends in age against region population")+
coord_cartesian(xlim = c(0, 1000), ylim = c(0, 6))
print(q6.1)
6.2 Comment on any trends you see in the data. What could cause such trends?
One of the trends represented in the graph is that the higher a regions population the lower the ratio of young to old people is. This would indicate that the regions with a large population will have more old people than young.
Provide an analysis of trends in gender ratio against region population:
# 7.1 Plot the gender ratio as a function of the population of the region.
female_df <- filter(pop_dataset, gender == "F")
male_df <- filter(pop_dataset, gender == "M")
female_sum <- female_df %>%group_by(region)%>%summarise(female_population=sum(population))
male_sum <- male_df %>%group_by(region)%>% summarise(male_population=sum(population))
femaleMale_ratio_df <- inner_join(female_sum,male_sum,by="region")
femaleMale_ratio_df$female_male_ratio <- (female_sum$female_population/male_sum $male_population)
femaleMale_ratio_df$total_pop <- (female_sum$female_population+male_sum$male_population)
q7.1 <- ggplot(femaleMale_ratio_df,aes(x=total_pop)) +
geom_point(aes(y = female_male_ratio), color = "blue") +
xlab("Total Population") +
ylab("Female/Male Ratio")+
ggtitle("Trends in gender ratio against region population")+
coord_cartesian(xlim = c(0, 800), ylim = c(0,3))
print(q7.1)
7.2 Comment on any trends you see in the data. What could cause such trends?
This graph indicates that there is a lower ratio between females and males in regions with population over 250. However, unlike the previous graph, the data tends to be spaced out more as the region size decreases.
Imagine you have enough resources for two events to launch a new product:
8.1 Select a gender and age group which spans 3 to 5 years. This will be the primary customers for your hypothetical product.
The hypothetical product will be for older men. I have chosen an age span of 3 years from 50-53.
8.2 Which two regions would you start with and why?
The two regions I would choose are SSC22015 and SSC22752 because they have they highest population with the target audience.
marketing_df <- filter(df, age == c(50,51,53))
## Warning in age == c(50, 51, 53): longer object length is not a multiple of
## shorter object length
male_marketing_df <- filter(marketing_df,gender=="M")
male_marketing_df <- subset(male_marketing_df , select = -c(age,gender,pxi,pxixi,ximinusmu,ximinusmusqrd))
maleover50_df <- male_marketing_df %>%group_by(region)%>% summarise(pop=sum(population))
attach(maleover50_df)
sorted_maleover50_df <- maleover50_df[order(-pop),]
head(sorted_maleover50_df )
## # A tibble: 6 x 2
## region pop
## <chr> <dbl>
## 1 SSC22015 449
## 2 SSC22752 288
## 3 SSC21743 269
## 4 SSC21040 241
## 5 SSC21178 223
## 6 SSC22030 183