Assessment126122Code3.knit

Codes

library(data.table)
library(dplyr)
library(ggplot2)
library(Hmisc)
library(lattice)
library(latticeExtra)
library(magrittr)
library(openxlsx)
library(readxl) 
library(stringr)
library(tidyr)
library(tidyverse)

#Question 1

A1a <- read_excel("pop_dataset.xlsx", sheet = 1)

A1 <- A1a[order(A1a$region), ] #sort original regions into numerical order

dim (A1)

str (A1)

#To create a dataframe with region numbered from 1 to 500

df1 <-aggregate(A1$region, by=list(A1$region), FUN=length) #Aggregate regions into groups

str(df1)

#Question 2

pop_tot <- sum(A1$population) #Total population

df2 <-aggregate(A1[,4], by=list(A1$age), FUN=sum) #Population for each age for all regions

prob=df2$population/pop_tot #Probability for each age

age=df2$Group.1 #Ages from 0 to 55

ExpV_age <-sum(prob * age) # Sum of prob x age

# 2.1 Expected value for the age (mean age) for the whole population
ExpV_age

# 2.2 Provide the standard deviation for the whole data sample.  Ans=Sample Standard Deviation= 15.778, the same as the population Standard Deviation of 15.778 to 3dp.

agesq=df2$Group.1^2

ExpV_age1 <-sum(prob * agesq)

ExpV_age2 <- ExpV_age^2

# Sample Variance

SampleVar <-ExpV_age1 - ExpV_age2
#SampleVar
SampleSDAge <- SampleVar^0.5
SampleSDAge

# Population Variance

N <- 56000 # N = number of observations of Sample

Pop_Var <- N * SampleVar/(N - 1)

#Standard Deviation for the age for the whole sample

PopSDAge <- Pop_Var^0.5
PopSDAge

#Question 3

options(digits=5)

Age <-A1[c(1:112), 2] #Range of ages 0 to 55, this the same for all regions

# To calculate the weighted mean of the ages for each region

i <- 1

Region <- 1

format(round(Region, 0), nsmall = 0)

WMStore <- list()               # Create empty list to store weighted means for each 
while (i < 56000) {

j <- i + 111

Pop <- A1[c(i:j), 4]             # Population for each region, 112 observations per 

WM <- weighted.mean(Age, Pop)    # weighted mean for each region


WMStore[[Region]] <- WM         # WMStore contains the weighted mean for all the 500 regions       
Region <- Region + 1

i <- i+112

if (i ==56000){
break
}
}

WMS <- unlist(WMStore) # This contains the weighted mean for each of the regions

df_WMS <- as.data.frame((as.data.frame(WMS))) # This is a dataframe of the weighted mean in a column of 500 rows

mean(WMS)                          # 3.1 Mean=30.608
sd(WMS)                            # 3.2 SD=7.996
min(WMS)                           # 3.3 Minimum = 2
quantile(WMS,0.25)                 # 3.4 First Quartile = 27.426
quantile(WMS,0.50)                 # 3.5 Median = 29.232
quantile(WMS,0.75)                 # 3.6 Third Quartile = 33.35
max(WMS)                           # 3.7 Maximum = 55
IQR(WMS)                           # 3.8 IQR = 5.924

p1a <- hist(WMS, xlab="Weighted Mean of Age", main="3.9 Distribution of means from each region") 
p1a

#Question 4

df3S <-aggregate(A1[,4], by=list(A1$region), FUN=sum) # population for each region

df3Smin <- filter(df3S, df3S$population == min(df3S$population)) # Find all regions with smallest population

head(df3Smin) #Find all regions with smallest population, SSC20099 is one of them with 3 people

#Question 5

#Question 5.1 Distribution of Ages

df4S <-aggregate(A1[,4], by=list(A1$region), FUN=sum) # population for each region

df4Smax <- filter(df4S, df4S$population == max(df4S$population)) # Find region with largest population.  SSC22015 is the largest with 37948 people
df4Smax

RL<- A1 %>% filter_at(vars(region), any_vars(. %in% c('SSC22015'))) #Data in the region SSC22015 which has the largest [population]

RLC <-aggregate(RL[,4], by=list(RL$age), FUN=sum) #add male and female population for each age

#Distribution of Ages for region SSC22015

p1 <- ggplot(RLC, aes(x = age, y = population)) +
geom_line()

print(p1 + ggtitle("5.1 Distribution of ages for region with most people"))

#Question 5.2 Cumulative Distribution of Ages

p2<- ggplot(RLC, aes(x = age)) +
  stat_ecdf(geom = "step")
  stat_ecdf()
  
p3 <- print(p2 + ggtitle("5.2 Cumulative Distribution of ages for region with most people"))

#Question 5.3 Cumulative Distribution of females and males population

RLS <- A1a[order(RL$gender), ] #Population data for largest region in the order of gender 

set.seed(1234)

RLSF <- RLS[1:56, ] #Female
RLSM <- RLS[57:112, ] #Male

Female <- RLSF$population
Male <- RLSM$population

# create a random data frame 

gender <- data.frame(x = c(Female, Male), group = gl(2, 56))

gender$group <- as.numeric(gender$group)

gender$group[1:56] <- "Female"

gender$group[57:112] <- "Male"

p15 <- ggplot(gender, aes(x = x, col = group)) + stat_ecdf() 

# stat_ecdf() function is used to plot ECDF plot

p16<- print(p15 + ggtitle("5.3 Cumulative Distribution of females and males population") + labs(x = "Population per Region",
         y = "Probability"))

#Question 6
options(digits=5)

# To calculate the population young and old, ratio old/young for each region

i <- 1
m <- 81

Region <- 1

format(round(Region, 0), nsmall = 0)

PopTotStore <- list()             # Create empty list to store total population for each region
RatioStore <- list()              # Create empty list to store ratio for each region

while (i < 56000) {

j <- i + 79
n <- m + 31

# Population for young, 80 observations per region
Popyoung <- A1[c(i:j), 4] 

# Population for old, 32 observations per region
Popold <- A1[c(m:n), 4]  

PopyoungS <- sum(Popyoung)
PopoldS <- sum(Popold)

#print(Popold)

PopTotStore [[Region]] <- PopyoungS + PopoldS

#print(PopTotStore)

RatioStore [[Region]] <- PopoldS/PopyoungS

#print(RatioStore)

Region <- Region + 1

i <- i + 112
m <- m + 112

if (i == 56000) {
  
  break
  
if (m == 56000) {
  
  break
}  
}
}

PopTotStore <- unlist(PopTotStore) #This contains the total Population for each of the regions
RatioStore <- unlist(RatioStore) #This contains the ratio for each of the regions

PopTot <- as.data.frame((as.data.frame(PopTotStore))) # This is a dataframe of the total population for each region

PopTot$Region <- 1:nrow(PopTot)
#PopTot

Ratio <- as.data.frame((as.data.frame(RatioStore))) # This is a dataframe of the ratio old/young for each region

Ratio$Region <- 1:nrow(Ratio)

PopRatio <- merge(Ratio, PopTot, by="Region")

names(PopRatio) [2] <- "Ratio"
names(PopRatio)[3] <- "PopRegion"

#To delete rows where ratio is inf or ratio < 1000

PopRatio1<- PopRatio[PopRatio$Ratio <= 20, ] 

p17 <- ggplot(PopRatio1, aes(x=PopRegion, y=Ratio)) + 
  geom_point()+
  geom_smooth(method=lm, color="red")+
  labs(title="6.1 Ratio of old to young vs Population in Region",
       x="Population in Region")+
  theme_classic()  

p17

#Question 7

options(digits=5)

# To calculate the population female and male, ratio female/male for each region

PopGenS <- 0             # Create empty list to store total population for each #region
RatioGenS <- 0   

p <- 1
a <- 56

Region <- 1

format(round(Region, 0), nsmall = 0)

PopGenS <- list()             # Create empty list to store total population for each #region
RatioGenS <- list()              # Create empty list to store ratio for each region

while (p < 56000) {

q <- p + 111
b <- a +55

PopSort <- A1[c(p:q), ] #Data from row i to j

#RLG <-PopSort[order(PopSort$gender),]

RLG <-PopSort[order(PopSort$gender),] #Sort in the order of gender for each region

PopF <- RLG [c(1:56), 4] # Population for female, 56 observations per region

PopM <- RLG [c(57:112), 4] # Population for male, 56 observations per region

PopFS <- sum(PopF)
PopMS <- sum(PopM)
PopTS <- PopFS+PopMS

PopGenS [[Region]] <- PopTS

RatioGenS [[Region]] <- PopFS/PopMS

#print(RatioGenS)

Region <- Region + 1

p <- p + 112
a <- a + 112

if (p == 56000) {
  
  break
  
if (a == 56000) {
  
  break
}  
}
}

PopGenS <- unlist(PopGenS) #This contains the total Population for each of the regions
RatioGenS <- unlist(RatioGenS) #This contains the ratio for each of the regions

PopGenST <- as.data.frame((as.data.frame(PopGenS))) # This is a dataframe of the total population for each region

PopGenST$Region <- 1:nrow(PopGenST)
#PopGenST

RatioGenST <- as.data.frame((as.data.frame(RatioGenS))) # This is a dataframe of the ratio (F/M) population for each region

RatioGenST$Region <- 1:nrow(RatioGenST)
#RatioGenST

PopRatioGen <- merge(RatioGenST, PopGenST, by="Region")

names(PopRatioGen) [2] <- "Ratio_FM"
names(PopRatioGen)[3] <- "PopRegion"

#To delete rows where ratio is inf or ratio < 1000

PopRatioGen<- PopRatioGen[PopRatioGen$Ratio_FM <= 1000, ] 

max(PopRatioGen$Ratio_FM) #max ratio is 5.33
min(PopRatioGen$Ratio_FM) #min ratio is 0

p17 <- ggplot(PopRatioGen, aes(x=PopRegion, y=Ratio_FM)) + 
  geom_point()+
  geom_smooth(method=lm, color="red")+
  labs(title="7.1 Ratio of female to male vs Population in Region",
       x="Population in Region")+
  theme_classic()  

p17

#Question 8

F18_21 <- filter(A1, A1$gender == "F") #Select females only from the dataset

F18_21A <- filter(F18_21, age > 17) #Select females over 17

F18_21B <- filter(F18_21A, age < 22) #Select females between 18 and 21

F18_21C <- aggregate(F18_21B$population, by=list(F18_21B$region), FUN=sum)

F18_21D <- F18_21C[order(F18_21C$x, decreasing=TRUE), ]  # The two regions with the largest population of females between 18 and 21 are SSC22015 (1113) and SSC20492 (2566)

#Question 9

# Question 9.1 when n=1
# Selecting n = 1 from the region with most people

s <- 1 #Start from sample 1 

n1Store <- list() # Create empty list to store weighted means for each age

while (s < 101) {
  
n1 <- RL[sample(nrow(RL), 1), ]  #Select n=1

Age <- n1[c(1), 2] #Range of ages 0 to 55

Pop <- n1[c(1), 4]   # Population for SSC22015, region with most people

WMn1 <- weighted.mean(Age, Pop) # weighted means for SSC22015 
            

# n1Store contains the weighted means for SSC22015       

n1Store[[s]] <- WMn1  

s <- s + 1 #choose next sample

if (s == 101){
break
}
}

# Question 9.1 when n=10
# Selecting n = 10 from the region with most people

s <- 1 #Start from sample 1 

n10Store <- list() # Create empty list to store weighted means for each age

while (s < 101) {
  
n10 <- RL[sample(nrow(RL), 10), ]  #Select n=10

Age <- n10[c(1:10), 2] #Range of ages 0 to 55

Pop <- n10[c(1:10), 4]   # Population for SSC22015, region with most people

WMn10 <- weighted.mean(Age, Pop) # weighted means for SSC22015 


# n10Store contains the weighted means for SSC22015       

n10Store[[s]] <- WMn10  

s <- s + 1 #choose next sample

if (s == 101){
break
}
}

# Question 9.1 when n=100
# Selecting n = 100 from the region with most people

s <- 1 #Start from sample 1 

n100Store <- list() # Create empty list to store weighted means for each age

while (s < 101) {
  
n100 <- RL[sample(nrow(RL), 100), ]  #Select n=100

Age <- n100[c(1:100), 2] #Range of ages 0 to 55

Pop <- n100[c(1:100), 4]   # Population for SSC22015, region with most people

WMn100 <- weighted.mean(Age, Pop) # weighted means for SSC22015 
                  

# n100Store contains the weighted means for SSC22015       

n100Store[[s]] <- WMn100 

s <- s + 1 #choose next sample

if (s == 101){
break
}
}

# Question 9.1 when n=1000
# Selecting n = 1000 from the region with most people

RL1000<- RL[rep(seq_len(nrow(RL)), 9), ] #Repeat the original dataset nine times to get more than 1000 rows.

s <- 1 #Start from sample 1 

n1000Store <- list() # Create empty list to store weighted means for each age

while (s < 101) {
  
n1000 <- RL1000[sample(nrow(RL1000), 1000), ]  #Select n=1000

Age <- n1000[c(1:1000), 2] #Range of ages 0 to 55

Pop <- n1000[c(1:1000), 4]   # Population for SSC22015, region with most people

WMn1000 <- weighted.mean(Age, Pop) # weighted means for SSC22015
         

# n1000Store contains the weighted means for SSC22015       

n1000Store[[s]] <- WMn1000 

s <- s + 1 #choose next sample

if (s == 101){
break
}
}

n1Store <- unlist(n1Store)
n10Store <- unlist(n10Store)
n100Store <- unlist(n100Store)
n1000Store <- unlist(n1000Store)

n1Sum <- as.data.frame((as.data.frame(n1Store))) # This is a dataframe of the weighted mean in a column of 100 rows

n10Sum <- as.data.frame((as.data.frame(n10Store))) # This is a dataframe of the weighted mean in a column of 100 rows

n100Sum <- as.data.frame((as.data.frame(n100Store))) # This is a dataframe of the weighted mean in a column of 100 rows

n1000Sum <- as.data.frame((as.data.frame(n1000Store))) # This is a dataframe of the weighted mean in a column of 100 rows

#For n=1

# set sample size

n_sample <- 100

# set number of times the samples will be summed
n_sum <- 100

n1S <- n1Sum$n1Store

mean1 <- mean(n1S)

sd1 <- sd(n1S)

binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  # rule of thumb for aesthetics

title_txt <- sprintf('9.3a CLT test after summing %d samples of %d observations (n=1)', n_sum, n_sample)

p18 <- ggplot()
p18 <- p18 + geom_histogram(aes(x = n1S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p18 <- p18 + stat_function(fun = function(x) dnorm(x, mean = mean1, sd = sd1) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p18 <- p18 + labs(x = 'x', y = 'f(x)', title = title_txt)

p18

#For n=10

# set sample size

n_sample <- 100

# set number of times the samples will be summed
n_sum <- 100

n10S <- n10Sum$n10Store  #(TESTING ONLY)

mean10 <- mean(n10S)

sd10 <- sd(n10S)

binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  # rule of thumb for aesthetics

title_txt <- sprintf('9.3b CLT test after summing %d samples of %d observations (n=10)', n_sum, n_sample)

p19 <- ggplot()
p19 <- p19 + geom_histogram(aes(x = n10S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p19 <- p19 + stat_function(fun = function(x) dnorm(x, mean = mean10, sd = sd10) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p19 <- p19 + labs(x = 'x', y = 'f(x)', title = title_txt)

p19

#For n=100

# set sample size

n_sample <- 100

# set number of times the samples will be summed
n_sum <- 100

n100S <- n100Sum$n100Store

mean100 <- mean(n100S)

sd100 <- sd(n100S)

#binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  # rule of thumb for aesthetics

binwidth <- 0.005 * sqrt(n_sample) * 1 * (n_sum ^ 0.4)  # rule of thumb for aesthetics

title_txt <- sprintf('9.3c CLT test after summing %d samples of %d observations (n=100)', n_sum, n_sample)

p20 <- ggplot()
p20 <- p20 + geom_histogram(aes(x = n100S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p20 <- p20 + stat_function(fun = function(x) dnorm(x, mean = mean100, sd = sd100) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p20 <- p20 + labs(x = 'x', y = 'f(x)', title = title_txt)

p20

#For n=1000

# set sample size

n_sample <- 100

# set number of times the samples will be summed
n_sum <- 100

n1000S <- n1000Sum$n1000Store

mean1000 <- mean(n1000S)

sd1000 <- sd(n1000S)

#binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)  # rule of thumb for aesthetics

binwidth <- 0.005 * sqrt(n_sample) * 0.5 * (n_sum ^ 0.1)  # rule of thumb for aesthetics

title_txt <- sprintf('9.3d CLT test after summing %d samples of %d observations (n=1000)', n_sum, n_sample)

p21 <- ggplot()
p21 <- p21 + geom_histogram(aes(x = n1000S), binwidth = binwidth, colour = "white", 
                          fill = "cornflowerblue", size = 0.1)
p21 <- p21 + stat_function(fun = function(x) dnorm(x, mean = mean1000, sd = sd1000) * n_sample * binwidth, 
                         color = "darkred", size = 1)
p21 <- p21 + labs(x = 'x', y = 'f(x)', title = title_txt)

p21