A127122CODESdplyr4.knit

CODES

library(ggplot2)
library(dplyr)
library(readxl)

## Question 1.1

A1a <- read_excel("pop_dataset.xlsx", sheet = 1)

# sort original regions into numerical order (using dplyr)
A1 <- arrange(A1a, region) 

dim (A1)

## Question 1.2

glimpse(A1) #dplyr function

## Question 1.3

# To create a dataframe with region numbered from 1 to 500

# Aggregate regions into groups by dplyr
df1 <- A1 %>%
  group_by(region) %>%
  tally()   

glimpse(df1)

## Question 2.1

# Total population
pop_tot <- sum(A1$population) 

# Aggregate Population for each age for all regions (dplyr)
df2 <- A1 %>%
  group_by(age) %>%
  tally(population)  


# Probability for each age, n = population
prob=df2$n/pop_tot 

# Ages from 0 to 55
age=df2$age 

# Sum of prob x age
ExpV_age <-sum(prob * age) 

# 2.1 Expected value for the age (mean age) for the whole population
ExpV_age

## Question 2.2 

agesq <- df2$age^2

ExpV_age1 <-sum(prob * agesq)

ExpV_age2 <- ExpV_age^2

# Sample Variance

SampleVar <-ExpV_age1 - ExpV_age2
#SampleVar
SampleSDAge <- SampleVar^0.5
SampleSDAge

# Population Variance

# N = number of observations of Sample
N <- 56000 

Pop_Var <- N * SampleVar/(N - 1)

# Standard Deviation for the age for the whole sample
PopSDAge <- Pop_Var^0.5
PopSDAge

## Question 3

options(digits=5)

# Range of ages 0 to 55, this is the same for all regions
Age <-A1[c(1:112), 2] 

# To calculate the weighted mean of the ages for each region

i <- 1

Region <- 1

format(round(Region, 0), nsmall = 0)


# Create empty list to store weighted means for each region
WMStore <- list()               

while (i < 56000) {

j <- i + 111

# Population for each region, 112 observations per region
Pop <- A1[c(i:j), 4]           

# weighted mean for each region
WM <- weighted.mean(Age, Pop)   


# WMStore contains the weighted mean for all the 500 regions 
WMStore[[Region]] <- WM               
Region <- Region + 1

i <- i+112

if (i ==56000){
break
}
}

# This contains the weighted mean for each of the regions
WMS <- unlist(WMStore) 

# This is a dataframe of the weighted mean in a column of 500 rows
df_WMS <- as.data.frame((as.data.frame(WMS))) 

# Summary statistics of the weighted means 

df_WMS %>%
  summarise(mean = mean(WMS),
                    sd = sd(WMS),
                    min = min(WMS),
                    Q1 = quantile(WMS, 0.25), 
                    Median = median(WMS, 0.5),
                    Q3 = quantile(WMS, 0.75),
                    max = max(WMS), 
                    IQR = IQR(WMS))

## Question 3.9

p1a<- ggplot(data=df_WMS, aes(x=WMS)) + 
  geom_histogram(breaks=seq(0, 60, by=2), 
                 col="black", 
                 fill="green", 
                 alpha = .2) + 
  labs(title="3.9 Distribution of weighted means", x="Weighted Mean of Age", y="Frequency") + 
  xlim(c(0,60)) + 
  ylim(c(0,150))

p1a

## Question 4


# population for each region
df3S <- A1 %>%
  group_by(region) %>%
  tally(population)   

# Find all regions with smallest population
df3Smin <- df3S %>%
  select(region, n) %>%
  filter(n == min(df3S$n)) 

# Find all regions with smallest population, SSC20099 is one of them with 3 people
glimpse(df3Smin)

## Question 5.1 

# Aggregate # population for each region (dplyr)
df4S <- A1 %>%
  group_by(region) %>%
  tally(population)  

# Find region with largest population (n).  SSC22015 is the largest with 37948 people
df4Smax <- df4S %>%
  select(region, n)%>%
  filter(n == max(df4S$n)) 

df4Smax

# Data in the region SSC22015 which has the largest population
RL<- A1 %>% filter_at(vars(region), any_vars(. %in% c('SSC22015'))) 

# add male and female population for each age
RLC <- RL %>%
  group_by(age) %>%
  tally(population)  

# Distribution of Ages for region SSC22015

p1 <- ggplot(RLC, aes(x = age, y = n)) +
geom_line() +

labs(x = 'Age', y = 'Population')

print(p1 + ggtitle("5.1 Distribution of ages for region with most people"))

## Question 5.2 

p2<- ggplot(RLC, aes(x = age)) +
  stat_ecdf(geom = "step") +
    labs(x = 'Age', y = 'Probability')  

p3 <- print(p2 + ggtitle("5.2 Cumulative Distribution of ages for region with most people"))

## Question 5.3 

#Population data for largest region in the order of gender (dplyr)
RLS <- arrange(RL, gender) 

set.seed(1234)

RLSF <- RLS[1:56, ] #Female
RLSM <- RLS[57:112, ] #Male

Female <- RLSF$population
Male <- RLSM$population

# create a random data frame 

gender <- data.frame(x = c(Female, Male), group = gl(2, 56))

gender$group <- as.numeric(gender$group)

gender$group[1:56] <- "Female"

gender$group[57:112] <- "Male"

p15 <- ggplot(gender, aes(x = x, col = group)) + stat_ecdf() 

# stat_ecdf() function is used to plot ECDF plot

p16<- print(p15 + ggtitle("5.3 Cumulative Distribution of females and males population") + labs(x = "Population per Region",
         y = "Probability"))

## Question 6.1

options(digits=5)

# To calculate the population young and old, ratio old/young for each region

i <- 1
m <- 81

Region <- 1

format(round(Region, 0), nsmall = 0)

# Create empty list to store total population for each region
PopTotStore <- list()   

# Create empty list to store ratio for each region
RatioStore <- list()              

while (i < 56000) {

j <- i + 79
n <- m + 31

# Population for young, 80 observations per region
Popyoung <- A1[c(i:j), 4] 

# Population for old, 32 observations per region
Popold <- A1[c(m:n), 4]           
PopyoungS <- sum(Popyoung)
PopoldS <- sum(Popold)

PopTotStore [[Region]] <- PopyoungS + PopoldS

RatioStore [[Region]] <- PopoldS/PopyoungS

Region <- Region + 1

i <- i + 112
m <- m + 112

if (i == 56000) {
  
  break
  
if (m == 56000) {
  
  break
}  
}
}

# This contains the total Population for each of the regions
PopTotStore <- unlist(PopTotStore) 

# This contains the ratio for each of the regions
RatioStore <- unlist(RatioStore) 

# This is a dataframe of the total population for each region
PopTot <- as.data.frame((as.data.frame(PopTotStore))) 

PopTot$Region <- 1:nrow(PopTot)

# This is a dataframe of the ratio old/young for each region
Ratio <- as.data.frame((as.data.frame(RatioStore))) 

Ratio$Region <- 1:nrow(Ratio)

PopRatio <- merge(Ratio, PopTot, by="Region")

names(PopRatio) [2] <- "Ratio"
names(PopRatio)[3] <- "PopRegion"

# To delete rows where ratio is inf or ratio < 1000
PopRatio1<- PopRatio[PopRatio$Ratio <= 20, ] 

p17 <- ggplot(PopRatio1, aes(x=PopRegion, y=Ratio)) + 
  geom_point()+
  geom_smooth(method=lm, color="red")+
  labs(title="6.1 Ratio of old to young vs Population in Region",
       x="Population in Region")+
  theme_classic()  

p17

## Question 7.1

options(digits=5)

# To calculate the population female and male, ratio female/male for each region

# Create empty list to store total population for each #region
PopGenS <- 0             
RatioGenS <- 0   

p <- 1
a <- 56

Region <- 1

format(round(Region, 0), nsmall = 0)


# Create empty list to store total population for each #region
PopGenS <- list()             


# Create empty list to store ratio for each region
RatioGenS <- list()             

while (p < 56000) {

q <- p + 111
b <- a +55

PopSort <- A1[c(p:q), ] #Data from row i to j

# Sort in the order of gender for each region (dplyr)
RLG <- arrange(PopSort, gender) 

# Population for female, 56 observations per region
PopF <- RLG [c(1:56), 4] 

# Population for male, 56 observations per region
PopM <- RLG [c(57:112), 4] 

PopFS <- sum(PopF)
PopMS <- sum(PopM)
PopTS <- PopFS+PopMS

PopGenS [[Region]] <- PopTS

RatioGenS [[Region]] <- PopFS/PopMS

Region <- Region + 1

p <- p + 112
a <- a + 112

if (p == 56000) {
  
  break
  
if (a == 56000) {
  
  break
}  
}
}

# This contains the total Population for each of the regions
PopGenS <- unlist(PopGenS) 

# This contains the ratio for each of the regions
RatioGenS <- unlist(RatioGenS) 

# This is a dataframe of the total population for each region
PopGenST <- as.data.frame((as.data.frame(PopGenS))) 

PopGenST$Region <- 1:nrow(PopGenST)

# This is a dataframe of the ratio (F/M) population for each region
RatioGenST <- as.data.frame((as.data.frame(RatioGenS))) 

RatioGenST$Region <- 1:nrow(RatioGenST)

PopRatioGen <- merge(RatioGenST, PopGenST, by="Region")

names(PopRatioGen) [2] <- "Ratio_FM"
names(PopRatioGen)[3] <- "PopRegion"

# To delete rows where ratio is inf or ratio < 1000
PopRatioGen<- PopRatioGen[PopRatioGen$Ratio_FM <= 1000, ] 

max(PopRatioGen$Ratio_FM) #max ratio is 5.33
min(PopRatioGen$Ratio_FM) #min ratio is 0

p17 <- ggplot(PopRatioGen, aes(x=PopRegion, y=Ratio_FM)) + 
  geom_point()+
  geom_smooth(method=lm, color="red")+
  labs(title="7.1 Ratio of female to male vs Population in Region",
       x="Population in Region")+
  theme_classic()  

p17

## Question 8


# Select females only from the dataset
F18_21 <- A1 %>%
  select(region, age, gender, population)%>%
  filter(gender == "F") 

# Select females over 17
F18_21A <- F18_21 %>%
  select(region, age, gender, population)%>%
  filter(age > 17)  

# Select females between 18 and 21
F18_21B <- F18_21A %>%
  select(region, age, gender, population)%>%
  filter(age < 22) 

# Aggregate female population by region
F18_21C <- F18_21B %>%
  group_by(region) %>%
  tally(population)   

# The two regions with the largest population of females between 18 and 21 are SSC22015 (1113) and SSC20492 (2566) (dplyr)
F18_21D<- F18_21C %>% arrange(desc(n))

## Question 9

#### Question 9.1 when n=1
# Selecting n = 1 from the region with most people

# Start from sample 1 
s <- 1 

# Create empty list to store weighted means for each age
n1Store <- list() 

while (s < 101) {
  
n1 <- RL[sample(nrow(RL), 1), ]  #Select n=1

Age <- n1[c(1), 2] #Range of ages 0 to 55

# Population for SSC22015, region with most people
Pop <- n1[c(1), 4]   

# weighted means for SSC22015 
WMn1 <- weighted.mean(Age, Pop) 
            

# n1Store contains the weighted means for SSC22015       
n1Store[[s]] <- WMn1  

s <- s + 1 #choose next sample

if (s == 101){
break
}
}

#### Question 9.1 when n=10
# Selecting n = 10 from the region with most people

# Start from sample 1 
s <- 1 

# Create empty list to store weighted means for each age
n10Store <- list() 

while (s < 101) {
  
n10 <- RL[sample(nrow(RL), 10), ]  #Select n=10

Age <- n10[c(1:10), 2] #Range of ages 0 to 55


# Population for SSC22015, region with most people
Pop <- n10[c(1:10), 4]   

WMn10 <- weighted.mean(Age, Pop) # weighted means for SSC22015 


# n10Store contains the weighted means for SSC22015  
n10Store[[s]] <- WMn10  

# choose next sample
s <- s + 1 

if (s == 101){
break
}
}

#### Question 9.1 when n=100
# Selecting n = 100 from the region with most people

# Start from sample 1
s <- 1  

# Create empty list to store weighted means for each age
n100Store <- list() 

while (s < 101) {
  
n100 <- RL[sample(nrow(RL), 100), ]  #Select n=100

Age <- n100[c(1:100), 2] #Range of ages 0 to 55

# Population for SSC22015, region with most people
Pop <- n100[c(1:100), 4]   

# weighted means for SSC22015
WMn100 <- weighted.mean(Age, Pop)  
                  

# n100Store contains the weighted means for SSC22015       
n100Store[[s]] <- WMn100 

# choose next sample
s <- s + 1 

if (s == 101){
break
}
}

#### Question 9.1 when n=1000
# Selecting n = 1000 from the region with most people

# Repeat the original dataset nine times to get more than 1000 rows.
RL1000<- RL[rep(seq_len(nrow(RL)), 9), ] 

# Start from sample 1 
s <- 1 

# Create empty list to store weighted means for each age
n1000Store <- list() 

while (s < 101) {
  
n1000 <- RL1000[sample(nrow(RL1000), 1000), ]  #Select n=1000

Age <- n1000[c(1:1000), 2] #Range of ages 0 to 55

# Population for SSC22015, region with most people
Pop <- n1000[c(1:1000), 4]   

# weighted means for SSC22015
WMn1000 <- weighted.mean(Age, Pop) 
         

# n100Store contains the weighted means for SSC22015       
n1000Store[[s]] <- WMn1000 

# choose next sample
s <- s + 1 

if (s == 101){
break
}
}

n1Store <- unlist(n1Store)
n10Store <- unlist(n10Store)
n100Store <- unlist(n100Store)
n1000Store <- unlist(n1000Store)

# Dataframe of the weighted mean in a column of 100 rows
n1Sum <- as.data.frame((as.data.frame(n1Store))) 

n10Sum <- as.data.frame((as.data.frame(n10Store))) 

n100Sum <- as.data.frame((as.data.frame(n100Store))) 

n1000Sum <- as.data.frame((as.data.frame(n1000Store)))

# For n=1

# set sample size

# Compare the results from n = 1, 10, 100 and 1000

n_sample <- 100

# set number of times the samples will be summed
n_sum1 <- 1
n_sum <- 100*n_sum1

n1S <- n1Sum$n1Store

mean1 <- mean(n1S)

sd1 <- sd(n1S)

# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 4 * ((n_sum) ^ 0.4)  


title_txt <- sprintf('9.3a CLT test after summing 100 samples of n = 1', n_sum, n_sample)

p18 <- ggplot()
p18 <- p18 + geom_histogram(aes(x = n1S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p18 <- p18 + stat_function(fun = function(x) dnorm(x, mean = mean1, sd = sd1) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p18 <- p18 + labs(x = 'Age', y = 'Frequency', title = title_txt)

p18

# For n=10

# set sample size

# Compare the results from n = 1, 10, 100 and 1000

n_sample <- 100

# set number of times the samples will be summed
n_sum1 <- 10
n_sum <- 10*n_sum1

n10S <- n10Sum$n10Store  #(TESTING ONLY)

mean10 <- mean(n10S)

sd10 <- sd(n10S)

# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  

title_txt <- sprintf('9.3b CLT test after summing 100 samples of n = 10', n_sum, n_sample)

p19 <- ggplot()
p19 <- p19 + geom_histogram(aes(x = n10S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p19 <- p19 + stat_function(fun = function(x) dnorm(x, mean = mean10, sd = sd10) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p19 <- p19 + labs(x = 'Age', y = 'Frequency', title = title_txt)

p19

# For n=100

# set sample size

# Compare the results from n = 1, 10, 100 and 1000

n_sample <- 100

# set number of times the samples will be summed
n_sum <- 100

n100S <- n100Sum$n100Store

mean100 <- mean(n100S)

sd100 <- sd(n100S)

# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 1 * (n_sum ^ 0.1)  

title_txt <- sprintf('9.3c CLT test after summing 100 samples of n=100', n_sum, n_sample)

p20 <- ggplot()
p20 <- p20 + geom_histogram(aes(x = n100S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p20 <- p20 + stat_function(fun = function(x) dnorm(x, mean = mean100, sd = sd100) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p20 <- p20 + labs(x = 'Age', y = 'Frequency', title = title_txt)

p20

# For n=1000

# set sample size

# Compare the results from n = 1, 10, 100 and 1000

n_sample <- 100

# set number of times the samples will be summed
n_sum1 <- 1000
n_sum <- 0.1*n_sum1

n1000S <- n1000Sum$n1000Store

mean1000 <- mean(n1000S)

sd1000 <- sd(n1000S)

# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 0.3 * (n_sum ^ 0.1)  

title_txt <- sprintf('9.3d CLT test after summing 100 samples of n=1000', n_sum, n_sample)

p21 <- ggplot()
p21 <- p21 + geom_histogram(aes(x = n1000S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p21 <- p21 + stat_function(fun = function(x) dnorm(x, mean = mean1000, sd = sd1000) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p21 <- p21 + labs(x = 'Age', y = 'Frequency', title = title_txt)

p21