#Load libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readr)
library(magrittr)
#Load and view dataset
df<- read_csv("pop_dataset_0002.txt")
##
## -- Column specification --------------------------------------------------------
## cols(
## region = col_character(),
## age = col_double(),
## gender = col_character(),
## population = col_double()
## )
head(df)
## # A tibble: 6 x 4
## region age gender population
## <chr> <dbl> <chr> <dbl>
## 1 SSC21184 0 M 114
## 2 SSC21184 0 F 95
## 3 SSC21184 1 M 88
## 4 SSC21184 1 F 107
## 5 SSC21184 2 M 122
## 6 SSC21184 2 F 120
#1 Using the correct code, check the data by completing the steps below. Then, answer each question.
dim(df)
## [1] 56000 4
print(paste('Number of rows:', nrow(df)))
## [1] "Number of rows: 56000"
print(paste('Number of columns:', ncol(df)))
## [1] "Number of columns: 4"
sapply(df, class)
## region age gender population
## "character" "numeric" "character" "numeric"
#Reassign characters as factors
df$region<-as.factor(df$region)
df$gender<-as.factor(df$gender)
sapply(df, class)
## region age gender population
## "factor" "numeric" "factor" "numeric"
print(paste('Number of unique regions:', length(unique(df$region))))
## [1] "Number of unique regions: 500"
min(df$age)
## [1] 0
max(df$age)
## [1] 55
count(distinct(df,age))
## # A tibble: 1 x 1
## n
## <int>
## 1 56
#Create new dataframe
df1<-df
head(df1)
## # A tibble: 6 x 4
## region age gender population
## <fct> <dbl> <fct> <dbl>
## 1 SSC21184 0 M 114
## 2 SSC21184 0 F 95
## 3 SSC21184 1 M 88
## 4 SSC21184 1 F 107
## 5 SSC21184 2 M 122
## 6 SSC21184 2 F 120
#Create new variable: agepop to multiply age by population
df1$agepop <-(df1$age * df1$population)
#Divide total sum of ages (agepop) by the total pop to find the mean (rounded to 3)
mean<-sum(df1$agepop)/sum(df1$population)
print(paste('Mean age of total population:', mean))
## [1] "Mean age of total population: 27.8002663266396"
mean<-round(mean, digits = 3)
print(paste('Mean age of total population (rounded):', mean))
## [1] "Mean age of total population (rounded): 27.8"
#Subtract the mean from every number and square the result
df1$num_minus_mean <-df1$age-mean
df1$sqr_result <-df1$num_minus_mean*df1$num_minus_mean
#Sum of Squared Differences divided by the Count
variance<-sum(df1$sqr_result)/sum(df1$population-1)
#Find the square root of the variance
print(paste('SD for the whole data sample:', sqrt(variance)))
## [1] "SD for the whole data sample: 4.44709795287088"
print(paste('SD for the whole data sample (rounded):', round(sqrt(variance), digits=3)))
## [1] "SD for the whole data sample (rounded): 4.447"
df2<- df
head(df2)
## # A tibble: 6 x 4
## region age gender population
## <fct> <dbl> <fct> <dbl>
## 1 SSC21184 0 M 114
## 2 SSC21184 0 F 95
## 3 SSC21184 1 M 88
## 4 SSC21184 1 F 107
## 5 SSC21184 2 M 122
## 6 SSC21184 2 F 120
#Create variable "agepop" to multiply age by population
df2$agepop<-df2$age * df2$population
head(df2)
## # A tibble: 6 x 5
## region age gender population agepop
## <fct> <dbl> <fct> <dbl> <dbl>
## 1 SSC21184 0 M 114 0
## 2 SSC21184 0 F 95 0
## 3 SSC21184 1 M 88 88
## 4 SSC21184 1 F 107 107
## 5 SSC21184 2 M 122 244
## 6 SSC21184 2 F 120 240
#Group region by total population and by age
regionpop <-aggregate(population~region,df2,sum)
head(regionpop[order(regionpop$population),])
## region population
## 10 SSC20099 3
## 14 SSC20127 3
## 17 SSC20151 3
## 51 SSC20346 3
## 62 SSC20383 3
## 66 SSC20398 3
regionagepop <-aggregate(agepop~region,df2,sum)
head(regionagepop[order(regionagepop$agepop),])
## region agepop
## 384 SSC22193 6
## 164 SSC21026 9
## 51 SSC20346 18
## 17 SSC20151 21
## 71 SSC20422 27
## 83 SSC20503 33
#Join both created tables and divide agepop by pop to get mean by region (rounded to 2)
joined<-full_join(regionpop, regionagepop, by='region')
joined$mean<-joined$agepop/joined$population
joined$mean<-round(joined$mean, digits=1)
head(joined[order(joined$mean),])
## region population agepop mean
## 384 SSC22193 3 6 2.0
## 164 SSC21026 3 9 3.0
## 201 SSC21211 6 33 5.5
## 282 SSC21701 6 33 5.5
## 51 SSC20346 3 18 6.0
## 17 SSC20151 3 21 7.0
#Find the mean of the region means
round(mean(joined$mean), digits=3)
## [1] 30.609
#Summary statistics
summary(joined$mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 27.40 29.20 30.61 33.33 55.00
round(sd(joined$mean), digits=3)
## [1] 7.997
min (joined$mean)
## [1] 2
quantile(joined$mean, 0.25)
## 25%
## 27.4
median(joined$mean)
## [1] 29.2
quantile(joined$mean, 0.75)
## 75%
## 33.325
max(joined$mean)
## [1] 55
IQR(joined$mean)
## [1] 5.925
gg<-ggplot(data=joined, aes(x=mean)) + geom_histogram(binwidth=1,
color="black",
fill="dodgerblue3",
bins=56) +
ggtitle("Mean Age Distribution by Region") +
xlab("Mean Age") +
geom_vline(aes(xintercept=mean(mean, na.rm=T)), color="red", linetype="dashed", size=1)
gg
#4 Consider the region with the smallest population: ## 4.1 Identify the region that has the least people and how many it has. If there is more than one such region, please select one.
#sum each population and filter down to min population (3)
regionpop <-aggregate(population~region,df2,sum)
minpop<-min(regionpop$population)
head(regionpop%>%filter(population==minpop))
## region population
## 1 SSC20099 3
## 2 SSC20127 3
## 3 SSC20151 3
## 4 SSC20346 3
## 5 SSC20383 3
## 6 SSC20398 3
#Find region with largest population
regionpop <-aggregate(population~region,df2,sum)
maxpop<-max(regionpop$population)
maxpopregion<-regionpop%>%filter(population==maxpop)
maxpopregion
## region population
## 1 SSC22015 37948
#Filter to region SSC22015
filtered<-df2%>%
filter(region=="SSC22015")
head(filtered)
## # A tibble: 6 x 5
## region age gender population agepop
## <fct> <dbl> <fct> <dbl> <dbl>
## 1 SSC22015 0 M 455 0
## 2 SSC22015 0 F 423 0
## 3 SSC22015 1 M 492 492
## 4 SSC22015 1 F 479 479
## 5 SSC22015 2 M 465 930
## 6 SSC22015 2 F 453 906
# Sum population
sumagepop <-aggregate(population~age,filtered,sum)
head(sumagepop)
## age population
## 1 0 878
## 2 1 971
## 3 2 918
## 4 3 975
## 5 4 965
## 6 5 847
# plot region
plot(sumagepop)
sumagepop$cdf <- cumsum(sumagepop$population)
plot(sumagepop$age, sumagepop$cdf)
sumgenderpop<-aggregate(population~gender+age, filtered, sum)
sumgenderpop$cdf <- cumsum(sumgenderpop$population)
qplot(sumgenderpop$age, sumgenderpop$cdf)
#create variables for old and young populations
region_old<-df2%>%group_by(region)%>%filter(age>=40)%>%summarise(old=sum(population))
head(region_old)
## # A tibble: 6 x 2
## region old
## <fct> <dbl>
## 1 SSC20005 17
## 2 SSC20012 178
## 3 SSC20018 50
## 4 SSC20027 371
## 5 SSC20029 12
## 6 SSC20048 316
region_young<-df2%>%group_by(region)%>%filter(age<40)%>%summarise(young=sum(population))
head(region_young)
## # A tibble: 6 x 2
## region young
## <fct> <dbl>
## 1 SSC20005 16
## 2 SSC20012 247
## 3 SSC20018 50
## 4 SSC20027 766
## 5 SSC20029 39
## 6 SSC20048 608
#concatenate the tables & work out the ratio
oldyoungjoin<-full_join(region_old, region_young, by='region')
oldyoungjoin$totalpop<-oldyoungjoin$old+oldyoungjoin$young
oldyoungjoin$ratio<-round(oldyoungjoin$old/oldyoungjoin$young, digits=3)
head(oldyoungjoin)
## # A tibble: 6 x 5
## region old young totalpop ratio
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SSC20005 17 16 33 1.06
## 2 SSC20012 178 247 425 0.721
## 3 SSC20018 50 50 100 1
## 4 SSC20027 371 766 1137 0.484
## 5 SSC20029 12 39 51 0.308
## 6 SSC20048 316 608 924 0.52
#plot the ratios by total pop
ggplot(oldyoungjoin, aes(x = totalpop, y = ratio)) +
geom_point() +
labs(
title = "Ratio of Old to Young people by region",
x = "Population",
y = "Ratio")
#create variables for male and female populations
region_male<-df2%>%group_by(region)%>%filter(gender=="M")%>%summarise(male=sum(population))
head(region_male)
## # A tibble: 6 x 2
## region male
## <fct> <dbl>
## 1 SSC20005 13
## 2 SSC20012 217
## 3 SSC20018 60
## 4 SSC20027 599
## 5 SSC20029 30
## 6 SSC20048 476
region_female<-df2%>%group_by(region)%>%filter(gender=="F")%>%summarise(female=sum(population))
head(region_female)
## # A tibble: 6 x 2
## region female
## <fct> <dbl>
## 1 SSC20005 20
## 2 SSC20012 208
## 3 SSC20018 40
## 4 SSC20027 538
## 5 SSC20029 21
## 6 SSC20048 448
#concatenate the tables & work out the ratio
MFjoin<-full_join(region_male, region_female, by='region')
MFjoin$totalpop<-MFjoin$male+MFjoin$female
MFjoin$ratio<-round(MFjoin$male/MFjoin$female, digits=3)
MFjoin<-MFjoin%>%filter(male>0 & female>0)
head(MFjoin)
## # A tibble: 6 x 5
## region male female totalpop ratio
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SSC20005 13 20 33 0.65
## 2 SSC20012 217 208 425 1.04
## 3 SSC20018 60 40 100 1.5
## 4 SSC20027 599 538 1137 1.11
## 5 SSC20029 30 21 51 1.43
## 6 SSC20048 476 448 924 1.06
#plot the ratios by totalpop
ggplot(MFjoin, aes(x = totalpop, y = ratio)) +
geom_point() +
labs(
title = "Ratio of Males to Females by region",
x = "Population",
y = "Ratio")
#create variables for small male and female populations
smallregion_male<-df2%>%group_by(region)%>%filter(gender=="M")%>%summarise(male=sum(population))
smallregion_female<-df2%>%group_by(region)%>%filter(gender=="F")%>%summarise(female=sum(population))
#concatenate the tables and filter to small populations
smallregiongenderjoin<-full_join(smallregion_male, smallregion_female, by='region')
smallregiongenderjoin$totalpop<-smallregiongenderjoin$male+smallregiongenderjoin$female
smallregiontotalmale<-smallregiongenderjoin%>%filter(totalpop<=50)%>%pull(male)%>%sum()
smallregiontotalfemale<-smallregiongenderjoin%>%filter(totalpop<=50)%>%pull(female)%>%sum()
print(paste('The total population for regions with populations of 50 or less:', smallregiontotalmale+smallregiontotalfemale))
## [1] "The total population for regions with populations of 50 or less: 3518"
#work out ratio
smallregionmaletofemaleratio<-round(smallregiontotalmale/smallregiontotalfemale, digits=3)
print(paste('The total Male/Female ratio for regions with populations of 50 or less:', smallregionmaletofemaleratio))
## [1] "The total Male/Female ratio for regions with populations of 50 or less: 1.013"
I would choose SSC22015 and SSC22569 as they have the highest population of boys in this age group. Both regions also have a higher ratio of boys to girls.
#Filter to boys and girls 2 years and under
baby_boy<-df2%>%group_by(region)%>%filter(gender=="M", age<=3)%>%summarise(male=sum(population))
baby_boy<-baby_boy[order(-baby_boy$male),]
baby_girl<-df2%>%group_by(region)%>%filter(gender=="F", age<=3)%>%summarise(female=sum(population))
#join the tables, exclude any regions with 0 for either gender, sort descending
babies<-left_join(baby_boy, baby_girl, by='region')
babies$totalpop<-babies$male+babies$female
babies$ratio<-round(babies$male/babies$female, digits=3)
babies<-babies%>%filter(male>0 & female>0)
babies<-babies[order(-babies$male),]
head(babies,2)
## # A tibble: 2 x 5
## region male female totalpop ratio
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 SSC22015 1890 1852 3742 1.02
## 2 SSC22569 1165 1113 2278 1.05
#Filter to region SSC22015
filtered<-df2%>%
filter(region=="SSC22015")
head(filtered)
## # A tibble: 6 x 5
## region age gender population agepop
## <fct> <dbl> <fct> <dbl> <dbl>
## 1 SSC22015 0 M 455 0
## 2 SSC22015 0 F 423 0
## 3 SSC22015 1 M 492 492
## 4 SSC22015 1 F 479 479
## 5 SSC22015 2 M 465 930
## 6 SSC22015 2 F 453 906
#create a list of the (repeated in list by the population for each age)
list<-rep(filtered$age, times = filtered$agepop)
# set sample size
n_sample <- 1000
# set number of times the samples will be summed
n_sum <- 100
# set the cumulative list to zero then loop and attach CLT results
clist<-NULL
for (i in seq(1, n_sum)){
result<-mean(sample(list, n_sum))
clist<-c(clist, result)
}
clist
## [1] 35.90 36.87 35.16 35.70 35.45 33.23 34.43 36.25 35.23 35.72 34.31 35.43
## [13] 32.72 35.47 36.32 34.71 33.04 35.69 34.90 36.84 32.36 34.11 33.96 34.92
## [25] 32.93 34.74 36.35 35.49 33.63 35.59 35.74 37.13 34.64 34.98 35.12 36.15
## [37] 35.26 34.99 33.64 34.81 34.70 35.25 33.71 34.90 33.24 35.62 35.53 36.61
## [49] 35.25 34.39 37.40 35.72 35.00 33.28 35.28 35.90 35.31 35.53 33.69 35.60
## [61] 35.74 37.52 33.68 36.18 38.25 35.22 34.99 34.90 36.60 35.54 34.06 37.76
## [73] 36.14 35.54 36.50 34.50 35.11 35.68 32.83 35.75 35.45 34.28 33.78 35.89
## [85] 36.46 36.75 36.74 36.44 33.76 33.97 33.45 34.90 33.62 33.89 35.96 34.64
## [97] 35.65 36.15 34.67 35.52
mean <- mean(clist)
sd <- sd(clist)
binwidth <- 0.005 * sqrt(n_sample) * 4 * (n_sum ^ 0.4)
title_txt <- sprintf('CLT after summing %d samples of %d observations', n_sum, n_sample)
gg <- ggplot()
gg <- gg + geom_histogram(aes(x = clist), binwidth = binwidth, colour = "white",
fill = "cornflowerblue", size = 0.1)
gg <- gg + stat_function(fun = function(x) dnorm(x, mean = mean, sd = sd) * n_sum * binwidth,
color = "darkred", size = 1)
gg <- gg + labs(x = 'Age', y = 'Frequency', title = title_txt)
gg
6.2 Comment on any trends you see in the data. What could cause such trends?