Introduction

# Include all the libraries here
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.2     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1

## Warning: package 'readr' was built under R version 4.1.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readr)

Question 1

Using the correct code, check the data by completing the following steps and answer each question:

# read the data 
pop_dataset <- read_csv("C:/Users/61423/Downloads/pop_dataset.csv",col_types = cols(gender = col_factor(levels = c("M","F"))))

df <- pop_dataset

# 1.1 Print out the dimensions of the data frame.

q1.1 <- dim(df)

print(q1.1)

## [1] 56000     4

# 1.2 Print out the names and type of each of the data frame’s columns.
q1.2 <- colnames(df)

print(q1.2)

## [1] "region"     "age"        "gender"     "population"

# 1.3 Print out the number of unique regions in the dataset. 

q1.3 <- length(unique(df$region))

print(q1.3)

## [1] 500

# 1.4 What is the minimum age bin?

q1.4 <- min(df$age)

print(q1.4)

## [1] 0

# 1.5 What is the maximum age bin? 

q1.5 <- max(df$age)

print(q1.5)

## [1] 55

# 1.6 What is the bin size for the age field?

1.6 What is the bin size for the age field?

The bin size for the age field is 1. Each bin represents 1 age bin

Question 2.

Perform the following data analysis by looking at some descriptive statistics on the complete data set and answer both questions:

# 2.1 Use the expected value for the age to find the mean age for the whole data sample. 

df$pxi <- (df$population/sum(df$population))
df$pxixi <- (df$pxi*df$age)

q2.1 <- sum(df$pxixi)

print(q2.1)

## [1] 27.80027

# 2.2 Provide the standard deviation for the whole data sample. 

df$ximinusmu <- (df$age-q2.1)
df$ximinusmusqrd <- ((df$ximinusmu)^2)

q2.2var <- var1_from_mean <- sum(df$pxi * (df$ximinusmusqrd))

q2.2std <- sqrt(q2.2var)

print(q2.2var)

## [1] 248.9465

print(q2.2std)

## [1] 15.77804

Question 3.

Record the following statistics on the means from each region:

region_list <- unique(df$region)

regions_vector <- rep(0,500)

for(x in 1:length(region_list))
{
region_x<- filter(df, region == region_list[x])
region_x$pxi <- (region_x$population/sum(region_x$population))
region_x$pxixi <- (region_x$pxi*region_x$age)
region_x_mu <- sum(region_x$pxixi)
regions_vector[x] <- region_x_mu
}
## 3.1 mean 

q3.1 <- mean(regions_vector)

print(q3.1)

## [1] 30.60846

## 3.2 standard deviation 

q3.2 <- sd(regions_vector)

print(q3.2)

## [1] 7.996179

## 3.3 minimum 

q3.3 <- min(regions_vector)

print(q3.3)

## [1] 2

## 3.4 first quartile 

q3.4 <- quantile(regions_vector,0.25)

print(q3.4)

##      25% 
## 27.42578

## 3.5 median

q3.5 <- median(regions_vector)

print(q3.5)

## [1] 29.23158

## 3.6 third quartile 

q3.6 <- quantile(regions_vector,0.75)

print(q3.6)

##      75% 
## 33.35013

## 3.7 maximum 

q3.7 <- max(regions_vector)

print(q3.7)

## [1] 55

## 3.8 interquartile range 

q3.8 <- IQR(regions_vector)

print(q3.8)

## [1] 5.92435

## 3.9 histogram plot of the distribution of means from each region 

# histogram with added parameters
print(hist(regions_vector,
breaks=20,
main="Region Means",
xlab="Age",
xlim=c(0,60),
ylim=c(0,160),
))

## $breaks
##  [1]  2  4  6  8 10 12 14 16 18 20 22 24 26 28 30 32 34 36 38 40 42 44 46 48 50
## [26] 52 54 56
## 
## $counts
##  [1]   2   3   3   2   3   2   3   5   3  11  15  32  79 131  55  39  25  18  14
## [20]  12  14   3   6   4   5   6   5
## 
## $density
##  [1] 0.002 0.003 0.003 0.002 0.003 0.002 0.003 0.005 0.003 0.011 0.015 0.032
## [13] 0.079 0.131 0.055 0.039 0.025 0.018 0.014 0.012 0.014 0.003 0.006 0.004
## [25] 0.005 0.006 0.005
## 
## $mids
##  [1]  3  5  7  9 11 13 15 17 19 21 23 25 27 29 31 33 35 37 39 41 43 45 47 49 51
## [26] 53 55
## 
## $xname
## [1] "regions_vector"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

Question 4:

Consider the region with the smallest population:

#4.1 Show which region has the least people and how many it has. 

popsize_region_df <- df %>%group_by(region)%>% 
  summarise(pop=sum(population))

q4.1 <- length(which(popsize_region_df$pop == 3))

print(q4.1)

## [1] 35

Question 5:

Consider the region with the largest population:

# 5.2 Plot cumulative distribution for the regions with the most people.
popsize_region_df <- df %>%
  group_by(region) %>% 
  summarise(pop=sum(population)) %>% 
  arrange(desc(pop))

biggest_region_plot_df <- filter(df, region == "SSC22015")
biggest_region_subset_df <- subset(biggest_region_plot_df, select = -c(pxi,pxixi,ximinusmu,ximinusmusqrd))
q5.2 <- biggest_region_subset_df  %>% group_by(age) %>% select(population) %>% summarise_each(funs(sum))

## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.

## Adding missing grouping variables: `age`

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

q5.2$cumulativepop <- cumsum(q5.2)

print(plot(q5.2$cumulativepop))

## NULL

# 5.3 Plot the cumulative distribution for males and females on the same plot.

biggest_male_df <- filter(biggest_region_subset_df, gender == "M")
biggest_male_df <- subset(biggest_male_df , select = -c(gender,region))
biggest_male_df$cumulativepop_male <- cumsum(biggest_male_df$population)
biggest_male_df <- rename(biggest_male_df, c("male_population"="population"))

biggest_female_df <- filter(biggest_region_subset_df, gender == "F")
biggest_female_df <- subset(biggest_female_df , select = -c(gender,region))
biggest_female_df$cumulativepop_female <- cumsum(biggest_female_df$population)
biggest_female_df  <-  rename(biggest_female_df, c("female_population"="population"))

biggest_region_gender_df <- inner_join(biggest_female_df,biggest_male_df,by="age")

biggest_region_cumpop_df <- subset(biggest_region_gender_df, select = -c(male_population,female_population))

q5.3 <- ggplot(biggest_region_cumpop_df,aes(x=age)) + 
  geom_line(aes(x=age,y = cumulativepop_female), color = "red") +
  geom_line(aes(x=age,y = cumulativepop_male), color = "blue") +
  xlab("Age") +
  ylab("Cumulative Population")+
  ggtitle("Male and Female Cumulative Populations")+
  scale_x_continuous(breaks = c(0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32,34,36,38,40,42,44,46,48,50,52,54,56))+
  theme(legend.position = "bottom")

print(q5.3)

Question 6:

Provide an analysis of trends in age against region population:

# 6.1 Plot the ratio of old to young people using 40 years old as a cut off, i.e. young is defined as age < 40 and old as >= 40.

old_df <- filter(pop_dataset, age >= 40)
young_df <- filter(pop_dataset, age < 40)

oldsummary_df <- old_df %>%group_by(region)%>% summarise(old_population=sum(population))
youngsummary_df <- young_df %>%group_by(region)%>% summarise(young_population=sum(population))


oldYoung_ratio_df <- inner_join(oldsummary_df,youngsummary_df,by="region")
oldYoung_ratio_df$old_young_ratio <- (oldYoung_ratio_df$old_population/oldYoung_ratio_df$young_population)
oldYoung_ratio_df$total_pop <- (oldYoung_ratio_df$old_population+oldYoung_ratio_df$young_population)

oldYoung_ratio_byregion_df <- subset(oldYoung_ratio_df , select = -c(old_population,young_population))

q6.1 <- ggplot(oldYoung_ratio_byregion_df,aes(x=total_pop)) + 
  geom_point(aes(y = old_young_ratio), color = "pink") +
  xlab("Total Population") +
  ylab("Old/Young ratio")+
  ggtitle("Trends in age against region population")+
  coord_cartesian(xlim = c(0, 1000), ylim = c(0, 6))

print(q6.1)

6.2 Comment on any trends you see in the data. What could cause such trends?

One of the trends represented in the graph is that the higher a regions population the lower the ratio of young to old people is. This would indicate that the regions with a large population will have more old people than young.

Question 7:

Provide an analysis of trends in gender ratio against region population:

# 7.1 Plot the gender ratio as a function of the population of the region. 
female_df <- filter(pop_dataset, gender == "F")
male_df <- filter(pop_dataset, gender == "M")

female_sum <- female_df %>%group_by(region)%>%summarise(female_population=sum(population))

male_sum <- male_df %>%group_by(region)%>% summarise(male_population=sum(population))



femaleMale_ratio_df <- inner_join(female_sum,male_sum,by="region")
femaleMale_ratio_df$female_male_ratio <- (female_sum$female_population/male_sum $male_population)
femaleMale_ratio_df$total_pop <- (female_sum$female_population+male_sum$male_population)

q7.1 <- ggplot(femaleMale_ratio_df,aes(x=total_pop)) + 
  geom_point(aes(y = female_male_ratio), color = "blue") +
  xlab("Total Population") +
  ylab("Female/Male Ratio")+
  ggtitle("Trends in gender ratio against region population")+
  coord_cartesian(xlim = c(0, 800), ylim = c(0,3))

print(q7.1)

7.2 Comment on any trends you see in the data. What could cause such trends?

This graph indicates that there is a lower ratio between females and males in regions with population over 250. However, unlike the previous graph, the data tends to be spaced out more as the region size decreases.

Question 8:

Imagine you have enough resources for two events to launch a new product:

8.1 Select a gender and age group which spans 3 to 5 years. This will be the primary customers for your hypothetical product.

The hypothetical product will be for older men. I have chosen an age span of 3 years from 50-53.

8.2 Which two regions would you start with and why?

The two regions I would choose are SSC22015 and SSC22752 because they have they highest population with the target audience.

marketing_df <- filter(df, age == c(50,51,53))

## Warning in age == c(50, 51, 53): longer object length is not a multiple of
## shorter object length

male_marketing_df <- filter(marketing_df,gender=="M")
male_marketing_df <- subset(male_marketing_df , select = -c(age,gender,pxi,pxixi,ximinusmu,ximinusmusqrd))
maleover50_df <- male_marketing_df %>%group_by(region)%>% summarise(pop=sum(population))

attach(maleover50_df)
sorted_maleover50_df <- maleover50_df[order(-pop),]

head(sorted_maleover50_df )

## # A tibble: 6 x 2
##   region     pop
##   <chr>    <dbl>
## 1 SSC22015   449
## 2 SSC22752   288
## 3 SSC21743   269
## 4 SSC21040   241
## 5 SSC21178   223
## 6 SSC22030   183

Assessment 1

Tyson Aviles s3920384

30/01/2022