CODES
library(ggplot2)
library(dplyr)
library(readxl)
## Question 1.1
A1a <- read_excel("pop_dataset.xlsx", sheet = 1)
A1 <- arrange(A1a, region) #sort original regions into numerical order (using dplyr)
dim (A1)
## Question 1.2
glimpse(A1) #dplyr function
## Question 1.3
#To create a dataframe with region numbered from 1 to 500
df1 <- A1 %>%
group_by(region) %>%
tally() #Aggregate regions into groups by dplyr
glimpse(df1)
## Question 2.1
pop_tot <- sum(A1$population) #Total population
df2 <- A1 %>%
group_by(age) %>%
tally(population) #Aggregate Population for each age for all regions (dplyr)
prob=df2$n/pop_tot #Probability for each age, n = population
age=df2$age #Ages from 0 to 55
ExpV_age <-sum(prob * age) # Sum of prob x age
ExpV_age #2.1 Expected value for the age (mean age) for the whole population
## Question 2.2
agesq <- df2$age^2
ExpV_age1 <-sum(prob * agesq)
ExpV_age2 <- ExpV_age^2
# Sample Variance
SampleVar <-ExpV_age1 - ExpV_age2
#SampleVar
SampleSDAge <- SampleVar^0.5
SampleSDAge
# Population Variance
N <- 56000 # N = number of observations of Sample
Pop_Var <- N * SampleVar/(N - 1)
#Standard Deviation for the age for the whole sample
PopSDAge <- Pop_Var^0.5
PopSDAge
## Question 3
options(digits=5)
Age <-A1[c(1:112), 2] #Range of ages 0 to 55, this is the same for all regions
# To calculate the weighted mean of the ages for each region
i <- 1
Region <- 1
format(round(Region, 0), nsmall = 0)
WMStore <- list() # Create empty list to store weighted means for each
while (i < 56000) {
j <- i + 111
Pop <- A1[c(i:j), 4] # Population for each region, 112 observations per
WM <- weighted.mean(Age, Pop) # weighted mean for each region
WMStore[[Region]] <- WM # WMStore contains the weighted mean for all the 500 regions
Region <- Region + 1
i <- i+112
if (i ==56000){
break
}
}
WMS <- unlist(WMStore) # This contains the weighted mean for each of the regions
df_WMS <- as.data.frame((as.data.frame(WMS))) # This is a dataframe of the weighted mean in a column of 500 rows
# Summary statistics of the weighted means
df_WMS %>%
summarise(mean = mean(WMS),
sd = sd(WMS),
min = min(WMS),
Q1 = quantile(WMS, 0.25),
Median = median(WMS, 0.5),
Q3 = quantile(WMS, 0.75),
max = max(WMS),
IQR = IQR(WMS))
## Question 3.9
p1a<- ggplot(data=df_WMS, aes(x=WMS)) +
geom_histogram(breaks=seq(0, 60, by=2),
col="black",
fill="green",
alpha = .2) +
labs(title="3.9 Distribution of weighted means", x="Weighted Mean of Age", y="Frequency") +
xlim(c(0,60)) +
ylim(c(0,150))
p1a
## Question 4
df3S <- A1 %>%
group_by(region) %>%
tally(population) # population for each region
df3Smin <- df3S %>%
select(region, n) %>%
filter(n == min(df3S$n)) # Find all regions with smallest population
glimpse(df3Smin) #Find all regions with smallest population, SSC20099 is one of them with 3 people
## Question 5.1
df4S <- A1 %>%
group_by(region) %>%
tally(population) #Aggregate # population for each region (dplyr)
df4Smax <- df4S %>%
select(region, n)%>%
filter(n == max(df4S$n)) # Find region with largest population (n). SSC22015 is the largest with 37948 people
df4Smax
RL<- A1 %>% filter_at(vars(region), any_vars(. %in% c('SSC22015'))) #Data in the region SSC22015 which has the largest [population]
RLC <- RL %>%
group_by(age) %>%
tally(population) #add male and female population for each age
#Distribution of Ages for region SSC22015
p1 <- ggplot(RLC, aes(x = age, y = n)) +
geom_line() +
labs(x = 'Age', y = 'Population')
print(p1 + ggtitle("5.1 Distribution of ages for region with most people"))
## Question 5.2
p2<- ggplot(RLC, aes(x = age)) +
stat_ecdf(geom = "step") +
labs(x = 'Age', y = 'Probability')
p3 <- print(p2 + ggtitle("5.2 Cumulative Distribution of ages for region with most people"))
## Question 5.3
RLS <- arrange(RL, gender) #Population data for largest region in the order of gender (dplyr)
set.seed(1234)
RLSF <- RLS[1:56, ] #Female
RLSM <- RLS[57:112, ] #Male
Female <- RLSF$population
Male <- RLSM$population
# create a random data frame
gender <- data.frame(x = c(Female, Male), group = gl(2, 56))
gender$group <- as.numeric(gender$group)
gender$group[1:56] <- "Female"
gender$group[57:112] <- "Male"
p15 <- ggplot(gender, aes(x = x, col = group)) + stat_ecdf()
# stat_ecdf() function is used to plot ECDF plot
p16<- print(p15 + ggtitle("5.3 Cumulative Distribution of females and males population") + labs(x = "Population per Region",
y = "Probability"))
## Question 6.1
options(digits=5)
# To calculate the population young and old, ratio old/young for each region
i <- 1
m <- 81
Region <- 1
format(round(Region, 0), nsmall = 0)
PopTotStore <- list() # Create empty list to store total population for each region
RatioStore <- list() # Create empty list to store ratio for each region
while (i < 56000) {
j <- i + 79
n <- m + 31
# Population for young, 80 observations per region
Popyoung <- A1[c(i:j), 4]
# Population for old, 32 observations per region
Popold <- A1[c(m:n), 4]
PopyoungS <- sum(Popyoung)
PopoldS <- sum(Popold)
PopTotStore [[Region]] <- PopyoungS + PopoldS
RatioStore [[Region]] <- PopoldS/PopyoungS
Region <- Region + 1
i <- i + 112
m <- m + 112
if (i == 56000) {
break
if (m == 56000) {
break
}
}
}
#This contains the total Population for each of the regions
PopTotStore <- unlist(PopTotStore)
#This contains the ratio for each of the regions
RatioStore <- unlist(RatioStore)
# This is a dataframe of the total population for each region
PopTot <- as.data.frame((as.data.frame(PopTotStore)))
PopTot$Region <- 1:nrow(PopTot)
# This is a dataframe of the ratio old/young for each region
Ratio <- as.data.frame((as.data.frame(RatioStore)))
Ratio$Region <- 1:nrow(Ratio)
PopRatio <- merge(Ratio, PopTot, by="Region")
names(PopRatio) [2] <- "Ratio"
names(PopRatio)[3] <- "PopRegion"
#To delete rows where ratio is inf or ratio < 1000
PopRatio1<- PopRatio[PopRatio$Ratio <= 20, ]
p17 <- ggplot(PopRatio1, aes(x=PopRegion, y=Ratio)) +
geom_point()+
geom_smooth(method=lm, color="red")+
labs(title="6.1 Ratio of old to young vs Population in Region",
x="Population in Region")+
theme_classic()
p17
## Question 7.1
options(digits=5)
# To calculate the population female and male, ratio female/male for each region
# Create empty list to store total population for each #region
PopGenS <- 0
RatioGenS <- 0
p <- 1
a <- 56
Region <- 1
format(round(Region, 0), nsmall = 0)
# Create empty list to store total population for each #region
PopGenS <- list()
# Create empty list to store ratio for each region
RatioGenS <- list()
while (p < 56000) {
q <- p + 111
b <- a +55
PopSort <- A1[c(p:q), ] #Data from row i to j
#Sort in the order of gender for each region (dplyr)
RLG <- arrange(PopSort, gender)
# Population for female, 56 observations per region
PopF <- RLG [c(1:56), 4]
# Population for male, 56 observations per region
PopM <- RLG [c(57:112), 4]
PopFS <- sum(PopF)
PopMS <- sum(PopM)
PopTS <- PopFS+PopMS
PopGenS [[Region]] <- PopTS
RatioGenS [[Region]] <- PopFS/PopMS
Region <- Region + 1
p <- p + 112
a <- a + 112
if (p == 56000) {
break
if (a == 56000) {
break
}
}
}
#This contains the total Population for each of the regions
PopGenS <- unlist(PopGenS)
#This contains the ratio for each of the regions
RatioGenS <- unlist(RatioGenS)
# This is a dataframe of the total population for each region
PopGenST <- as.data.frame((as.data.frame(PopGenS)))
PopGenST$Region <- 1:nrow(PopGenST)
# This is a dataframe of the ratio (F/M) population for each region
RatioGenST <- as.data.frame((as.data.frame(RatioGenS)))
RatioGenST$Region <- 1:nrow(RatioGenST)
PopRatioGen <- merge(RatioGenST, PopGenST, by="Region")
names(PopRatioGen) [2] <- "Ratio_FM"
names(PopRatioGen)[3] <- "PopRegion"
#To delete rows where ratio is inf or ratio < 1000
PopRatioGen<- PopRatioGen[PopRatioGen$Ratio_FM <= 1000, ]
max(PopRatioGen$Ratio_FM) #max ratio is 5.33
min(PopRatioGen$Ratio_FM) #min ratio is 0
p17 <- ggplot(PopRatioGen, aes(x=PopRegion, y=Ratio_FM)) +
geom_point()+
geom_smooth(method=lm, color="red")+
labs(title="7.1 Ratio of female to male vs Population in Region",
x="Population in Region")+
theme_classic()
p17
## Question 8
#Select females only from the dataset
F18_21 <- A1 %>%
select(region, age, gender, population)%>%
filter(gender == "F")
#Select females over 17
F18_21A <- F18_21 %>%
select(region, age, gender, population)%>%
filter(age > 17)
#Select females between 18 and 21
F18_21B <- F18_21A %>%
select(region, age, gender, population)%>%
filter(age < 22)
#Aggregate female population by region
F18_21C <- F18_21B %>%
group_by(region) %>%
tally(population)
# The two regions with the largest population of females between 18 and 21 are SSC22015 (1113) and SSC20492 (2566) (dplyr)
F18_21D<- F18_21C %>% arrange(desc(n))
## Question 9
#### Question 9.1 when n=1
# Selecting n = 1 from the region with most people
#Start from sample 1
s <- 1
# Create empty list to store weighted means for each age
n1Store <- list()
while (s < 101) {
n1 <- RL[sample(nrow(RL), 1), ] #Select n=1
Age <- n1[c(1), 2] #Range of ages 0 to 55
# Population for SSC22015, region with most people
Pop <- n1[c(1), 4]
# weighted means for SSC22015
WMn1 <- weighted.mean(Age, Pop)
# n1Store contains the weighted means for SSC22015
n1Store[[s]] <- WMn1
s <- s + 1 #choose next sample
if (s == 101){
break
}
}
#### Question 9.1 when n=10
# Selecting n = 10 from the region with most people
#Start from sample 1
s <- 1
# Create empty list to store weighted means for each age
n10Store <- list()
while (s < 101) {
n10 <- RL[sample(nrow(RL), 10), ] #Select n=10
Age <- n10[c(1:10), 2] #Range of ages 0 to 55
# Population for SSC22015, region with most people
Pop <- n10[c(1:10), 4]
WMn10 <- weighted.mean(Age, Pop) # weighted means for SSC22015
# n10Store contains the weighted means for SSC22015
n10Store[[s]] <- WMn10
#choose next sample
s <- s + 1
if (s == 101){
break
}
}
#### Question 9.1 when n=100
# Selecting n = 100 from the region with most people
#Start from sample 1
s <- 1
# Create empty list to store weighted means for each age
n100Store <- list()
while (s < 101) {
n100 <- RL[sample(nrow(RL), 100), ] #Select n=100
Age <- n100[c(1:100), 2] #Range of ages 0 to 55
# Population for SSC22015, region with most people
Pop <- n100[c(1:100), 4]
# weighted means for SSC22015
WMn100 <- weighted.mean(Age, Pop)
# n100Store contains the weighted means for SSC22015
n100Store[[s]] <- WMn100
#choose next sample
s <- s + 1
if (s == 101){
break
}
}
#### Question 9.1 when n=1000
# Selecting n = 1000 from the region with most people
#Repeat the original dataset nine times to get more than 1000 rows.
RL1000<- RL[rep(seq_len(nrow(RL)), 9), ]
#Start from sample 1
s <- 1
# Create empty list to store weighted means for each age
n1000Store <- list()
while (s < 101) {
n1000 <- RL1000[sample(nrow(RL1000), 1000), ] #Select n=1000
Age <- n1000[c(1:1000), 2] #Range of ages 0 to 55
# Population for SSC22015, region with most people
Pop <- n1000[c(1:1000), 4]
# weighted means for SSC22015
WMn1000 <- weighted.mean(Age, Pop)
# n100Store contains the weighted means for SSC22015
n1000Store[[s]] <- WMn1000
#choose next sample
s <- s + 1
if (s == 101){
break
}
}
n1Store <- unlist(n1Store)
n10Store <- unlist(n10Store)
n100Store <- unlist(n100Store)
n1000Store <- unlist(n1000Store)
# Dataframe of the weighted mean in a column of 100 rows
n1Sum <- as.data.frame((as.data.frame(n1Store)))
n10Sum <- as.data.frame((as.data.frame(n10Store)))
n100Sum <- as.data.frame((as.data.frame(n100Store)))
n1000Sum <- as.data.frame((as.data.frame(n1000Store)))
#For n=1
# set sample size
# Compare the results from n = 1, 10, 100 and 1000
n_sample <- 100
# set number of times the samples will be summed
n_sum1 <- 1
n_sum <- 100*n_sum1
n1S <- n1Sum$n1Store
mean1 <- mean(n1S)
sd1 <- sd(n1S)
# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 4 * ((n_sum) ^ 0.4)
title_txt <- sprintf('9.3a CLT test after summing 100 samples of n = 1', n_sum, n_sample)
p18 <- ggplot()
p18 <- p18 + geom_histogram(aes(x = n1S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p18 <- p18 + stat_function(fun = function(x) dnorm(x, mean = mean1, sd = sd1) * n_sample * binwidth,
color = "darkred", size = 1)
p18 <- p18 + labs(x = 'Age', y = 'Frequency', title = title_txt)
p18
#For n=10
# set sample size
# Compare the results from n = 1, 10, 100 and 1000
n_sample <- 100
# set number of times the samples will be summed
n_sum1 <- 10
n_sum <- 10*n_sum1
n10S <- n10Sum$n10Store #(TESTING ONLY)
mean10 <- mean(n10S)
sd10 <- sd(n10S)
# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)
title_txt <- sprintf('9.3b CLT test after summing 100 samples of n = 10', n_sum, n_sample)
p19 <- ggplot()
p19 <- p19 + geom_histogram(aes(x = n10S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p19 <- p19 + stat_function(fun = function(x) dnorm(x, mean = mean10, sd = sd10) * n_sample * binwidth,
color = "darkred", size = 1)
p19 <- p19 + labs(x = 'Age', y = 'Frequency', title = title_txt)
p19
#For n=100
# set sample size
# Compare the results from n = 1, 10, 100 and 1000
n_sample <- 100
# set number of times the samples will be summed
n_sum <- 100
n100S <- n100Sum$n100Store
mean100 <- mean(n100S)
sd100 <- sd(n100S)
# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 1 * (n_sum ^ 0.1)
title_txt <- sprintf('9.3c CLT test after summing 100 samples of n=100', n_sum, n_sample)
p20 <- ggplot()
p20 <- p20 + geom_histogram(aes(x = n100S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p20 <- p20 + stat_function(fun = function(x) dnorm(x, mean = mean100, sd = sd100) * n_sample * binwidth,
color = "darkred", size = 1)
p20 <- p20 + labs(x = 'Age', y = 'Frequency', title = title_txt)
p20
#For n=1000
# set sample size
# Compare the results from n = 1, 10, 100 and 1000
n_sample <- 100
# set number of times the samples will be summed
n_sum1 <- 1000
n_sum <- 0.1*n_sum1
n1000S <- n1000Sum$n1000Store
mean1000 <- mean(n1000S)
sd1000 <- sd(n1000S)
# Set aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 0.3 * (n_sum ^ 0.1)
title_txt <- sprintf('9.3d CLT test after summing 100 samples of n=1000', n_sum, n_sample)
p21 <- ggplot()
p21 <- p21 + geom_histogram(aes(x = n1000S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p21 <- p21 + stat_function(fun = function(x) dnorm(x, mean = mean1000, sd = sd1000) * n_sample * binwidth,
color = "darkred", size = 1)
p21 <- p21 + labs(x = 'Age', y = 'Frequency', title = title_txt)
p21