Codes
library(data.table)
library(dplyr)
library(ggplot2)
library(Hmisc)
library(lattice)
library(latticeExtra)
library(magrittr)
library(openxlsx)
library(readxl)
library(stringr)
library(tidyr)
library(tidyverse)
#Question 1
A1a <- read_excel("pop_dataset.xlsx", sheet = 1)
A1 <- A1a[order(A1a$region), ] #sort original regions into numerical order
dim (A1)
str (A1)
#To create a dataframe with region numbered from 1 to 500
df1 <-aggregate(A1$region, by=list(A1$region), FUN=length) #Aggregate regions into groups
str(df1)
#Question 2
pop_tot <- sum(A1$population) #Total population
df2 <-aggregate(A1[,4], by=list(A1$age), FUN=sum) #Population for each age for all regions
prob=df2$population/pop_tot #Probability for each age
age=df2$Group.1 #Ages from 0 to 55
ExpV_age <-sum(prob * age) # Sum of prob x age
# 2.1 Expected value for the age (mean age) for the whole population
ExpV_age
# 2.2 Provide the standard deviation for the whole data sample. Ans=Sample Standard Deviation= 15.778, the same as the population Standard Deviation of 15.778 to 3dp.
agesq=df2$Group.1^2
ExpV_age1 <-sum(prob * agesq)
ExpV_age2 <- ExpV_age^2
# Sample Variance
SampleVar <-ExpV_age1 - ExpV_age2
#SampleVar
SampleSDAge <- SampleVar^0.5
SampleSDAge
# Population Variance
N <- 56000 # N = number of observations of Sample
Pop_Var <- N * SampleVar/(N - 1)
#Standard Deviation for the age for the whole sample
PopSDAge <- Pop_Var^0.5
PopSDAge
#Question 3
options(digits=5)
Age <-A1[c(1:112), 2] #Range of ages 0 to 55, this the same for all regions
# To calculate the weighted mean of the ages for each region
i <- 1
Region <- 1
format(round(Region, 0), nsmall = 0)
WMStore <- list() # Create empty list to store weighted means for each
while (i < 56000) {
j <- i + 111
Pop <- A1[c(i:j), 4] # Population for each region, 112 observations per
WM <- weighted.mean(Age, Pop) # weighted mean for each region
WMStore[[Region]] <- WM # WMStore contains the weighted mean for all the 500 regions
Region <- Region + 1
i <- i+112
if (i ==56000){
break
}
}
WMS <- unlist(WMStore) # This contains the weighted mean for each of the regions
df_WMS <- as.data.frame((as.data.frame(WMS))) # This is a dataframe of the weighted mean in a column of 500 rows
mean(WMS) # 3.1 Mean=30.608
sd(WMS) # 3.2 SD=7.996
min(WMS) # 3.3 Minimum = 2
quantile(WMS,0.25) # 3.4 First Quartile = 27.426
quantile(WMS,0.50) # 3.5 Median = 29.232
quantile(WMS,0.75) # 3.6 Third Quartile = 33.35
max(WMS) # 3.7 Maximum = 55
IQR(WMS) # 3.8 IQR = 5.924
p1a <- hist(WMS, xlab="Weighted Mean of Age", main="3.9 Distribution of means from each region")
p1a
#Question 4
df3S <-aggregate(A1[,4], by=list(A1$region), FUN=sum) # population for each region
df3Smin <- filter(df3S, df3S$population == min(df3S$population)) # Find all regions with smallest population
head(df3Smin) #Find all regions with smallest population, SSC20099 is one of them with 3 people
#Question 5
#Question 5.1 Distribution of Ages
df4S <-aggregate(A1[,4], by=list(A1$region), FUN=sum) # population for each region
df4Smax <- filter(df4S, df4S$population == max(df4S$population)) # Find region with largest population. SSC22015 is the largest with 37948 people
df4Smax
RL<- A1 %>% filter_at(vars(region), any_vars(. %in% c('SSC22015'))) #Data in the region SSC22015 which has the largest [population]
RLC <-aggregate(RL[,4], by=list(RL$age), FUN=sum) #add male and female population for each age
#Distribution of Ages for region SSC22015
p1 <- ggplot(RLC, aes(x = age, y = population)) +
geom_line()
print(p1 + ggtitle("5.1 Distribution of ages for region with most people"))
#Question 5.2 Cumulative Distribution of Ages
p2<- ggplot(RLC, aes(x = age)) +
stat_ecdf(geom = "step")
stat_ecdf()
p3 <- print(p2 + ggtitle("5.2 Cumulative Distribution of ages for region with most people"))
#Question 5.3 Cumulative Distribution of females and males population
RLS <- A1a[order(RL$gender), ] #Population data for largest region in the order of gender
set.seed(1234)
RLSF <- RLS[1:56, ] #Female
RLSM <- RLS[57:112, ] #Male
Female <- RLSF$population
Male <- RLSM$population
# create a random data frame
gender <- data.frame(x = c(Female, Male), group = gl(2, 56))
gender$group <- as.numeric(gender$group)
gender$group[1:56] <- "Female"
gender$group[57:112] <- "Male"
p15 <- ggplot(gender, aes(x = x, col = group)) + stat_ecdf()
# stat_ecdf() function is used to plot ECDF plot
p16<- print(p15 + ggtitle("5.3 Cumulative Distribution of females and males population") + labs(x = "Population per Region",
y = "Probability"))
#Question 6
options(digits=5)
# To calculate the population young and old, ratio old/young for each region
i <- 1
m <- 81
Region <- 1
format(round(Region, 0), nsmall = 0)
PopTotStore <- list() # Create empty list to store total population for each region
RatioStore <- list() # Create empty list to store ratio for each region
while (i < 56000) {
j <- i + 79
n <- m + 31
# Population for young, 80 observations per region
Popyoung <- A1[c(i:j), 4]
# Population for old, 32 observations per region
Popold <- A1[c(m:n), 4]
PopyoungS <- sum(Popyoung)
PopoldS <- sum(Popold)
#print(Popold)
PopTotStore [[Region]] <- PopyoungS + PopoldS
#print(PopTotStore)
RatioStore [[Region]] <- PopoldS/PopyoungS
#print(RatioStore)
Region <- Region + 1
i <- i + 112
m <- m + 112
if (i == 56000) {
break
if (m == 56000) {
break
}
}
}
PopTotStore <- unlist(PopTotStore) #This contains the total Population for each of the regions
RatioStore <- unlist(RatioStore) #This contains the ratio for each of the regions
PopTot <- as.data.frame((as.data.frame(PopTotStore))) # This is a dataframe of the total population for each region
PopTot$Region <- 1:nrow(PopTot)
#PopTot
Ratio <- as.data.frame((as.data.frame(RatioStore))) # This is a dataframe of the ratio old/young for each region
Ratio$Region <- 1:nrow(Ratio)
PopRatio <- merge(Ratio, PopTot, by="Region")
names(PopRatio) [2] <- "Ratio"
names(PopRatio)[3] <- "PopRegion"
#To delete rows where ratio is inf or ratio < 1000
PopRatio1<- PopRatio[PopRatio$Ratio <= 20, ]
p17 <- ggplot(PopRatio1, aes(x=PopRegion, y=Ratio)) +
geom_point()+
geom_smooth(method=lm, color="red")+
labs(title="6.1 Ratio of old to young vs Population in Region",
x="Population in Region")+
theme_classic()
p17
#Question 7
options(digits=5)
# To calculate the population female and male, ratio female/male for each region
PopGenS <- 0 # Create empty list to store total population for each #region
RatioGenS <- 0
p <- 1
a <- 56
Region <- 1
format(round(Region, 0), nsmall = 0)
PopGenS <- list() # Create empty list to store total population for each #region
RatioGenS <- list() # Create empty list to store ratio for each region
while (p < 56000) {
q <- p + 111
b <- a +55
PopSort <- A1[c(p:q), ] #Data from row i to j
#RLG <-PopSort[order(PopSort$gender),]
RLG <-PopSort[order(PopSort$gender),] #Sort in the order of gender for each region
PopF <- RLG [c(1:56), 4] # Population for female, 56 observations per region
PopM <- RLG [c(57:112), 4] # Population for male, 56 observations per region
PopFS <- sum(PopF)
PopMS <- sum(PopM)
PopTS <- PopFS+PopMS
PopGenS [[Region]] <- PopTS
RatioGenS [[Region]] <- PopFS/PopMS
#print(RatioGenS)
Region <- Region + 1
p <- p + 112
a <- a + 112
if (p == 56000) {
break
if (a == 56000) {
break
}
}
}
PopGenS <- unlist(PopGenS) #This contains the total Population for each of the regions
RatioGenS <- unlist(RatioGenS) #This contains the ratio for each of the regions
PopGenST <- as.data.frame((as.data.frame(PopGenS))) # This is a dataframe of the total population for each region
PopGenST$Region <- 1:nrow(PopGenST)
#PopGenST
RatioGenST <- as.data.frame((as.data.frame(RatioGenS))) # This is a dataframe of the ratio (F/M) population for each region
RatioGenST$Region <- 1:nrow(RatioGenST)
#RatioGenST
PopRatioGen <- merge(RatioGenST, PopGenST, by="Region")
names(PopRatioGen) [2] <- "Ratio_FM"
names(PopRatioGen)[3] <- "PopRegion"
#To delete rows where ratio is inf or ratio < 1000
PopRatioGen<- PopRatioGen[PopRatioGen$Ratio_FM <= 1000, ]
max(PopRatioGen$Ratio_FM) #max ratio is 5.33
min(PopRatioGen$Ratio_FM) #min ratio is 0
p17 <- ggplot(PopRatioGen, aes(x=PopRegion, y=Ratio_FM)) +
geom_point()+
geom_smooth(method=lm, color="red")+
labs(title="7.1 Ratio of female to male vs Population in Region",
x="Population in Region")+
theme_classic()
p17
#Question 8
F18_21 <- filter(A1, A1$gender == "F") #Select females only from the dataset
F18_21A <- filter(F18_21, age > 17) #Select females over 17
F18_21B <- filter(F18_21A, age < 22) #Select females between 18 and 21
F18_21C <- aggregate(F18_21B$population, by=list(F18_21B$region), FUN=sum)
F18_21D <- F18_21C[order(F18_21C$x, decreasing=TRUE), ] # The two regions with the largest population of females between 18 and 21 are SSC22015 (1113) and SSC20492 (2566)
#Question 9
# Question 9.1 when n=1
# Selecting n = 1 from the region with most people
s <- 1 #Start from sample 1
n1Store <- list() # Create empty list to store weighted means for each age
while (s < 101) {
n1 <- RL[sample(nrow(RL), 1), ] #Select n=1
Age <- n1[c(1), 2] #Range of ages 0 to 55
Pop <- n1[c(1), 4] # Population for SSC22015, region with most people
WMn1 <- weighted.mean(Age, Pop) # weighted means for SSC22015
# n1Store contains the weighted means for SSC22015
n1Store[[s]] <- WMn1
s <- s + 1 #choose next sample
if (s == 101){
break
}
}
# Question 9.1 when n=10
# Selecting n = 10 from the region with most people
s <- 1 #Start from sample 1
n10Store <- list() # Create empty list to store weighted means for each age
while (s < 101) {
n10 <- RL[sample(nrow(RL), 10), ] #Select n=10
Age <- n10[c(1:10), 2] #Range of ages 0 to 55
Pop <- n10[c(1:10), 4] # Population for SSC22015, region with most people
WMn10 <- weighted.mean(Age, Pop) # weighted means for SSC22015
# n10Store contains the weighted means for SSC22015
n10Store[[s]] <- WMn10
s <- s + 1 #choose next sample
if (s == 101){
break
}
}
# Question 9.1 when n=100
# Selecting n = 100 from the region with most people
s <- 1 #Start from sample 1
n100Store <- list() # Create empty list to store weighted means for each age
while (s < 101) {
n100 <- RL[sample(nrow(RL), 100), ] #Select n=100
Age <- n100[c(1:100), 2] #Range of ages 0 to 55
Pop <- n100[c(1:100), 4] # Population for SSC22015, region with most people
WMn100 <- weighted.mean(Age, Pop) # weighted means for SSC22015
# n100Store contains the weighted means for SSC22015
n100Store[[s]] <- WMn100
s <- s + 1 #choose next sample
if (s == 101){
break
}
}
# Question 9.1 when n=1000
# Selecting n = 1000 from the region with most people
RL1000<- RL[rep(seq_len(nrow(RL)), 9), ] #Repeat the original dataset nine times to get more than 1000 rows.
s <- 1 #Start from sample 1
n1000Store <- list() # Create empty list to store weighted means for each age
while (s < 101) {
n1000 <- RL1000[sample(nrow(RL1000), 1000), ] #Select n=1000
Age <- n1000[c(1:1000), 2] #Range of ages 0 to 55
Pop <- n1000[c(1:1000), 4] # Population for SSC22015, region with most people
WMn1000 <- weighted.mean(Age, Pop) # weighted means for SSC22015
# n1000Store contains the weighted means for SSC22015
n1000Store[[s]] <- WMn1000
s <- s + 1 #choose next sample
if (s == 101){
break
}
}
n1Store <- unlist(n1Store)
n10Store <- unlist(n10Store)
n100Store <- unlist(n100Store)
n1000Store <- unlist(n1000Store)
n1Sum <- as.data.frame((as.data.frame(n1Store))) # This is a dataframe of the weighted mean in a column of 100 rows
n10Sum <- as.data.frame((as.data.frame(n10Store))) # This is a dataframe of the weighted mean in a column of 100 rows
n100Sum <- as.data.frame((as.data.frame(n100Store))) # This is a dataframe of the weighted mean in a column of 100 rows
n1000Sum <- as.data.frame((as.data.frame(n1000Store))) # This is a dataframe of the weighted mean in a column of 100 rows
#For n=1
# set sample size
n_sample <- 100
# set number of times the samples will be summed
n_sum <- 100
n1S <- n1Sum$n1Store
mean1 <- mean(n1S)
sd1 <- sd(n1S)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4) # rule of thumb for aesthetics
title_txt <- sprintf('9.3a CLT test after summing %d samples of %d observations (n=1)', n_sum, n_sample)
p18 <- ggplot()
p18 <- p18 + geom_histogram(aes(x = n1S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p18 <- p18 + stat_function(fun = function(x) dnorm(x, mean = mean1, sd = sd1) * n_sample * binwidth,
color = "darkred", size = 1)
p18 <- p18 + labs(x = 'x', y = 'f(x)', title = title_txt)
p18
#For n=10
# set sample size
n_sample <- 100
# set number of times the samples will be summed
n_sum <- 100
n10S <- n10Sum$n10Store #(TESTING ONLY)
mean10 <- mean(n10S)
sd10 <- sd(n10S)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4) # rule of thumb for aesthetics
title_txt <- sprintf('9.3b CLT test after summing %d samples of %d observations (n=10)', n_sum, n_sample)
p19 <- ggplot()
p19 <- p19 + geom_histogram(aes(x = n10S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p19 <- p19 + stat_function(fun = function(x) dnorm(x, mean = mean10, sd = sd10) * n_sample * binwidth,
color = "darkred", size = 1)
p19 <- p19 + labs(x = 'x', y = 'f(x)', title = title_txt)
p19
#For n=100
# set sample size
n_sample <- 100
# set number of times the samples will be summed
n_sum <- 100
n100S <- n100Sum$n100Store
mean100 <- mean(n100S)
sd100 <- sd(n100S)
#binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4) # rule of thumb for aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 1 * (n_sum ^ 0.4) # rule of thumb for aesthetics
title_txt <- sprintf('9.3c CLT test after summing %d samples of %d observations (n=100)', n_sum, n_sample)
p20 <- ggplot()
p20 <- p20 + geom_histogram(aes(x = n100S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p20 <- p20 + stat_function(fun = function(x) dnorm(x, mean = mean100, sd = sd100) * n_sample * binwidth,
color = "darkred", size = 1)
p20 <- p20 + labs(x = 'x', y = 'f(x)', title = title_txt)
p20
#For n=1000
# set sample size
n_sample <- 100
# set number of times the samples will be summed
n_sum <- 100
n1000S <- n1000Sum$n1000Store
mean1000 <- mean(n1000S)
sd1000 <- sd(n1000S)
#binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4) # rule of thumb for aesthetics
binwidth <- 0.005 * sqrt(n_sample) * 0.5 * (n_sum ^ 0.1) # rule of thumb for aesthetics
title_txt <- sprintf('9.3d CLT test after summing %d samples of %d observations (n=1000)', n_sum, n_sample)
p21 <- ggplot()
p21 <- p21 + geom_histogram(aes(x = n1000S), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
p21 <- p21 + stat_function(fun = function(x) dnorm(x, mean = mean1000, sd = sd1000) * n_sample * binwidth,
color = "darkred", size = 1)
p21 <- p21 + labs(x = 'x', y = 'f(x)', title = title_txt)
p21