options(warn= -1)
#Load libraries
library(ggplot2)
library(dplyr)
library(radiant.data)
library(ggthemes)
library(hexbin)
library(rmarkdown)
library(RColorBrewer)
library(readr)
library(ggthemes)
library(cowplot)
library(knitr)
Check data and find: Number of regions, number of genders, age ranges , total rows, number of columns.
# read the data
Assignment1Dataset <- read.csv("Assignment1DataSet.txt")
#Checking for missing data
any(is.na(Assignment1Dataset)) #We have no missing data
## [1] FALSE
# 1.1 Print out the dimensions of the data frame.
#To find out the dimensions of the dataset we run
print(dim(Assignment1Dataset))
## [1] 56000 4
# 1.2 Print out the names and type of each of the data frame’s columns.
#To find out the column names
print(colnames(Assignment1Dataset))
## [1] "region" "age" "gender" "population"
#Finding out what type of variable R thinks each column is
print(class(Assignment1Dataset$region))
## [1] "character"
print(class(Assignment1Dataset$age))
## [1] "integer"
print(class(Assignment1Dataset$gender))
## [1] "character"
print(class(Assignment1Dataset$population))
## [1] "integer"
# 1.3 Print out the number of unique regions in the dataset.
print( length( unique(Assignment1Dataset$region) ) ) #Print the length of the unique regions
## [1] 500
# 1.4 What is the minimum age bin?
print( min(Assignment1Dataset$age) ) #Print the minimum age bin
## [1] 0
# 1.5 What is the maximum age bin?
print( max(Assignment1Dataset$age) ) #Print the maximum age bin
## [1] 55
# 1.6 What is the bin size for the age field?
print(1) #It goes from 0 to 55 in intervals of 1, therefore the bin size is 1.
## [1] 1
Summary statistics of all data. Note entire data set
# 2.1 Use the expected value for the age to find the mean age for the whole data sample.
WeightedAverage <- weighted.mean(Assignment1Dataset$age, Assignment1Dataset$population)
WeightedAverage #Use the weighted.mean function to calculate the weighted mean, assign it to variable WeightedAverage
## [1] 27.80027
# 2.2 Provide the standard deviation for the whole data sample.
weighted.sd(Assignment1Dataset$age, Assignment1Dataset$population) #Calculate weighted standard deviation
## [1] 15.77804
Record the following statistics on the means from each region: Create a new data frame based on the means from each region. make sure they combine male and female values to get age distributions
# We can write a loop to make a separate entry for every person in the age column of a row,
# lets unwrap the Assignment1Dataset, we will use this later
region.l = list()
age.l = list()
gender.l = list()
# no need for population as we are adding every individual
# we will make a loop for all rows
# add to list for performance
for(i in 1:length(Assignment1Dataset$region)){
X = NULL; X = Assignment1Dataset[i,] # loop for every entry
# We need to add X$population repeated entries
population = NULL; population = X$population[1]
if(population==0){
population = 1 # otherwise the rep function breaks
}
region.l[[i]] = rep(X$region[1],population ) # append data to the vector
age.l[[i]] = rep(X$age[1], population) # append data to the vector
gender.l[[i]] = rep(X$gender[1], population) # append data to the vector
}
# This will take a while (5-10 minutes)
region = Reduce(c, region.l)
age = Reduce(c, age.l)
gender = Reduce(c, gender.l)
# remove unnecessary data
rm(region.l, age.l, gender.l)
df = data.frame(ID = seq(from = 1, to = length(region), by = 1), # useful to make a new data frame
region = region,
age = age,
gender = gender)
# a new entry was made, retaining the information in the region, age & gender columns
#(from the Assignment1Dataset data frame)
Looped_df_age_avg = df %>% group_by(region) %>%
summarize(avg_age = mean(age), #Mean age for each unique region
SD = sd(age), #Standard deviation for each unique region
Min = min(age), #Minimum age for each unique region
Q1 = quantile(age,probs =.25), #Quartile 1 for each unique region
Median = median(age), #Median for each unique region
Q3 = quantile(age,probs = .75), #Quartile 3 for each unique region
Max = max(age), #Max age for each unique region
IQR = IQR(age), .groups = "keep") #IQR for each region. To suppress the warning we add groups = "keep"
Looped_df_age_avg #See what the variable looks like
## # A tibble: 500 x 9
## # Groups: region [500]
## region avg_age SD Min Q1 Median Q3 Max IQR
## <chr> <dbl> <dbl> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 SSC20005 28.0 16.2 0 13 28 42.5 55 29.5
## 2 SSC20012 29.8 17.3 0 14 30 47 55 33
## 3 SSC20018 29.7 17.0 0 14 30 45.5 55 31.5
## 4 SSC20027 27.9 16.8 0 13 28 43 55 30
## 5 SSC20029 26.0 16.3 0 12 25 41 55 29
## 6 SSC20048 29.3 16.2 0 15 31 44 55 29
## 7 SSC20062 28.8 16.1 0 15 29 43 55 28
## 8 SSC20076 27.9 16.8 0 13 28 43 55 30
## 9 SSC20079 29.1 16.7 0 14 29 45 55 31
## 10 SSC20099 27.8 16.3 0 14 28 42 55 28
## # ... with 490 more rows
#We can now get the mean of all the unique regions values
## 3.1 Calculate the mean
mean(Looped_df_age_avg$avg_age) #Mean of the unique regions mean ages
## [1] 28.26092
## 3.2 Calculate the standard deviation
sd(Looped_df_age_avg$avg_age) #Standard deviation of the 500 unique regions mean age values
## [1] 1.254157
## 3.3 Calculate the minimum
min(Looped_df_age_avg$avg_age) #Minimum of the 500 unique regions mean age values
## [1] 24.84414
## 3.4 Calculate the first quartile
quantile(Looped_df_age_avg$avg_age, probs = .25) #Q1 of the 500 unique regions mean age values
## 25%
## 27.43381
## 3.5 Calculate the median
median(Looped_df_age_avg$avg_age) #Median of the 500 unique regions mean age values
## [1] 28.13439
## 3.6 Calculate the third quartile
quantile(Looped_df_age_avg$avg_age, probs = .75) #Q3 of the 500 unique regions mean age values
## 75%
## 28.99397
## 3.7 Calculate the maximum
max(Looped_df_age_avg$avg_age) #Maximum of the 500 unique regions mean age values
## [1] 32.33333
## 3.8 Calculate the interquartile range
IQR(Looped_df_age_avg$avg_age) #IQR of the 500 unique regions mean age values
## [1] 1.560161
## 3.9 histogram plot of the distribution of means from each region
ggplot(data=Looped_df_age_avg, aes(avg_age)) +
geom_histogram(binwidth = .5, colour ="black", fill="#FF6666", alpha=.2) + #Make bin size 0.5, change colour, fill & alpha
scale_x_continuous(breaks=seq(24, 33, .5)) + # X axis Ticks from 24-34, every .5
scale_y_continuous(breaks=seq(0, 100, 5)) + # Y axis Ticks from 0-30, every 2
geom_vline(aes(xintercept=median(avg_age, na.rm=T)), color="red", linetype="dashed", size=1) + #Add median line
geom_vline(aes(xintercept=mean(avg_age, na.rm=T)), color="blue", linetype="dashed", size=1) + #Add mean line
ggtitle("Histogram plot of the distribution of mean age from each unique region") + #Add title
labs(x = "Average age of the unique region", y= "Count of unique regions") + #Label the axis'
theme(plot.title = element_text(hjust= 0.5),
panel.border = element_rect(linetype = "dashed", fill=NA)) #Centre title, added dash around border
#We can see the centre follows a normal distribution but it's not a 100% perfect normal distribution on the edges
#Make it a square/rect
ggplot(Looped_df_age_avg, aes(sample=avg_age)) +
stat_qq() + #Add QQ stat
stat_qq_line(color = "red") + #Make QQPlot line red
ggtitle("Quantile-Quantile Plot of the distribution of mean age from each unique region")+ #Add title
theme_economist()+ #Add economist theme
theme(plot.title = element_text(hjust=.5, size=12), ##Centre title
panel.border = element_rect(linetype = "dashed", fill=NA)) #Add a panel border
Get descriptive statistics for the region with the smallest population
#Aggregate the number of people in each unique region, assign it to variable Region_Pop
Region_Pop <- aggregate(x = Assignment1Dataset$population,
by = list(Assignment1Dataset$region),
FUN = sum)
#4.1 Show which region has the least people and how many it has.
#Sort the regions to find out the one with the least people in it
sort(Region_Pop$x, decreasing = F)
## [1] 3 3 3 3 3 3 3 3 3 3 3 3
## [13] 3 3 3 3 3 3 3 3 3 3 3 3
## [25] 3 3 3 3 3 3 3 3 3 3 3 4
## [37] 4 5 5 5 6 6 6 6 6 6 6 6
## [49] 6 6 6 6 6 6 6 6 7 7 7 7
## [61] 7 8 8 9 9 9 9 9 9 9 9 9
## [73] 10 10 10 10 12 12 12 12 12 12 12 12
## [85] 12 12 12 12 12 12 12 13 13 14 14 15
## [97] 15 15 15 15 15 15 15 15 16 16 16 16
## [109] 16 17 17 17 18 18 18 18 18 18 18 18
## [121] 19 19 19 20 21 21 21 21 21 21 21 22
## [133] 22 22 22 22 23 23 23 23 24 24 24 24
## [145] 24 24 25 25 26 27 27 28 28 28 28 29
## [157] 29 30 30 30 32 32 32 33 33 33 34 34
## [169] 34 35 35 36 36 36 36 36 37 37 38 38
## [181] 38 39 40 40 40 41 41 41 42 42 42 44
## [193] 45 45 46 47 48 50 50 51 51 52 52 52
## [205] 54 54 54 55 56 57 57 57 58 58 59 60
## [217] 60 60 61 61 61 61 62 63 65 65 66 67
## [229] 67 69 69 69 69 70 71 71 72 72 73 73
## [241] 74 74 75 76 76 77 78 79 79 81 81 84
## [253] 86 86 87 88 88 88 89 89 93 98 99 100
## [265] 102 103 104 108 109 112 117 120 120 123 127 128
## [277] 129 130 130 130 132 132 135 138 144 144 147 148
## [289] 148 152 154 158 159 159 161 162 163 166 171 176
## [301] 178 183 183 190 190 191 191 194 198 199 201 202
## [313] 204 205 208 210 211 211 211 213 216 217 219 229
## [325] 239 239 245 249 260 265 267 268 270 277 283 284
## [337] 286 289 290 292 299 304 307 310 314 323 337 351
## [349] 353 359 373 376 380 425 427 440 468 475 489 494
## [361] 526 528 538 547 568 574 616 663 698 714 750 762
## [373] 779 790 794 808 831 833 847 862 924 942 988 1011
## [385] 1055 1079 1082 1082 1094 1137 1148 1159 1166 1188 1207 1213
## [397] 1226 1229 1282 1288 1289 1301 1330 1367 1412 1434 1536 1554
## [409] 1752 1790 1799 1826 1879 1908 1933 1953 2049 2257 2289 2306
## [421] 2342 2386 2391 2545 2548 2563 2575 2615 2643 2661 2823 2960
## [433] 3104 3168 3269 3363 3391 3439 3504 3555 3735 3772 3838 3981
## [445] 4060 4074 4153 4177 4306 4310 4978 4978 5016 5033 5056 5402
## [457] 5821 6048 6281 6461 6550 6666 6803 7282 7287 7776 7866 8174
## [469] 9046 9116 9729 9802 9818 9837 10596 10770 10893 10937 11218 11357
## [481] 12350 12544 13253 13730 14546 15608 15972 16669 16705 17140 17442 17809
## [493] 17928 18540 19180 19274 19340 20939 22979 37948
#We can see there are 35 regions with 3 people in them
# Populations is a data.frame with the regions and the summed population as columns
Populations = aggregate(x = Assignment1Dataset$population,
by = list(Assignment1Dataset$region),
FUN = sum)
# Now get the min population region using which
# index will contain the row numbers of the minimum populations
index = which(Populations$x == min(Populations$x))
min.Subpopulations = Populations[index,]
print(min.Subpopulations)
## Group.1 x
## 10 SSC20099 3
## 14 SSC20127 3
## 17 SSC20151 3
## 51 SSC20346 3
## 62 SSC20383 3
## 66 SSC20398 3
## 71 SSC20422 3
## 82 SSC20502 3
## 83 SSC20503 3
## 87 SSC20516 3
## 160 SSC21012 3
## 164 SSC21026 3
## 167 SSC21037 3
## 213 SSC21283 3
## 345 SSC22012 3
## 348 SSC22021 3
## 349 SSC22028 3
## 359 SSC22070 3
## 364 SSC22084 3
## 377 SSC22157 3
## 384 SSC22193 3
## 391 SSC22237 3
## 398 SSC22281 3
## 416 SSC22367 3
## 446 SSC22597 3
## 448 SSC22605 3
## 449 SSC22606 3
## 451 SSC22616 3
## 459 SSC22698 3
## 461 SSC22702 3
## 467 SSC22719 3
## 477 SSC22772 3
## 482 SSC22809 3
## 484 SSC22840 3
## 493 SSC22873 3
#4.2 Plot the distribution of ages for the region with the least people.
#I will graph ssc20099 because it is the first region which comes up
Region.SSC20099.2 <- Assignment1Dataset %>% filter(region =="SSC20099",population>=1)
Region.SSC20099.2
## region age gender population
## 1 SSC20099 46 M 3
#Plot
ggplot(data=Region.SSC20099.2, aes(x=age, y=population))+ #Tell ggplot what to map
geom_col(color="black",fill="red",alpha=0.4) + #Column chart, fill it red, have a dashed line & make alpha 0.4
xlab("Age in years") + ylab("Number of people") + ggtitle("Distribution of people by age in the SSC20099 region") +
coord_cartesian(ylim=c(1, 3)) + #Set max and min points for y axis data
expand_limits(x=c(0,56)) + #Expand the axis point view limits
scale_x_continuous(breaks=seq(0, 56, 2)) + # Ticks from 0-56, every 2
scale_y_continuous(breaks=seq(1, 3, 1)) + # Ticks from 1-3, every 1
theme(plot.title = element_text(hjust=.5), #Make title centred, add a dashed panel border
panel.border = element_rect(colour = "black", fill=NA, size=2),
panel.background = element_rect(fill='skyblue'))
Get descriptive statistics for the region with the highest population
#We can slightly amend the code from earlier to make it in decreasing order, to find out the largest region
sort(Region_Pop$x, decreasing = T)
## [1] 37948 22979 20939 19340 19274 19180 18540 17928 17809 17442 17140 16705
## [13] 16669 15972 15608 14546 13730 13253 12544 12350 11357 11218 10937 10893
## [25] 10770 10596 9837 9818 9802 9729 9116 9046 8174 7866 7776 7287
## [37] 7282 6803 6666 6550 6461 6281 6048 5821 5402 5056 5033 5016
## [49] 4978 4978 4310 4306 4177 4153 4074 4060 3981 3838 3772 3735
## [61] 3555 3504 3439 3391 3363 3269 3168 3104 2960 2823 2661 2643
## [73] 2615 2575 2563 2548 2545 2391 2386 2342 2306 2289 2257 2049
## [85] 1953 1933 1908 1879 1826 1799 1790 1752 1554 1536 1434 1412
## [97] 1367 1330 1301 1289 1288 1282 1229 1226 1213 1207 1188 1166
## [109] 1159 1148 1137 1094 1082 1082 1079 1055 1011 988 942 924
## [121] 862 847 833 831 808 794 790 779 762 750 714 698
## [133] 663 616 574 568 547 538 528 526 494 489 475 468
## [145] 440 427 425 380 376 373 359 353 351 337 323 314
## [157] 310 307 304 299 292 290 289 286 284 283 277 270
## [169] 268 267 265 260 249 245 239 239 229 219 217 216
## [181] 213 211 211 211 210 208 205 204 202 201 199 198
## [193] 194 191 191 190 190 183 183 178 176 171 166 163
## [205] 162 161 159 159 158 154 152 148 148 147 144 144
## [217] 138 135 132 132 130 130 130 129 128 127 123 120
## [229] 120 117 112 109 108 104 103 102 100 99 98 93
## [241] 89 89 88 88 88 87 86 86 84 81 81 79
## [253] 79 78 77 76 76 75 74 74 73 73 72 72
## [265] 71 71 70 69 69 69 69 67 67 66 65 65
## [277] 63 62 61 61 61 61 60 60 60 59 58 58
## [289] 57 57 57 56 55 54 54 54 52 52 52 51
## [301] 51 50 50 48 47 46 45 45 44 42 42 42
## [313] 41 41 41 40 40 40 39 38 38 38 37 37
## [325] 36 36 36 36 36 35 35 34 34 34 33 33
## [337] 33 32 32 32 30 30 30 29 29 28 28 28
## [349] 28 27 27 26 25 25 24 24 24 24 24 24
## [361] 23 23 23 23 22 22 22 22 22 21 21 21
## [373] 21 21 21 21 20 19 19 19 18 18 18 18
## [385] 18 18 18 18 17 17 17 16 16 16 16 16
## [397] 15 15 15 15 15 15 15 15 15 14 14 13
## [409] 13 12 12 12 12 12 12 12 12 12 12 12
## [421] 12 12 12 12 10 10 10 10 9 9 9 9
## [433] 9 9 9 9 9 8 8 7 7 7 7 7
## [445] 6 6 6 6 6 6 6 6 6 6 6 6
## [457] 6 6 6 6 5 5 5 4 4 3 3 3
## [469] 3 3 3 3 3 3 3 3 3 3 3 3
## [481] 3 3 3 3 3 3 3 3 3 3 3 3
## [493] 3 3 3 3 3 3 3 3
index2 = which(Populations$x == max(Populations$x))
max.Subpopulations = Populations[index2,]
print(max.Subpopulations)
## Group.1 x
## 346 SSC22015 37948
#5.1 Plot cumulative distribution for the regions with the most people.
LargestRegion2 <- Assignment1Dataset %>% filter (region=="SSC22015")
LargestRegion2
## region age gender population
## 1 SSC22015 0 M 455
## 2 SSC22015 0 F 423
## 3 SSC22015 1 M 492
## 4 SSC22015 1 F 479
## 5 SSC22015 2 M 465
## 6 SSC22015 2 F 453
## 7 SSC22015 3 M 478
## 8 SSC22015 3 F 497
## 9 SSC22015 4 M 527
## 10 SSC22015 4 F 438
## 11 SSC22015 5 M 434
## 12 SSC22015 5 F 413
## 13 SSC22015 6 M 415
## 14 SSC22015 6 F 404
## 15 SSC22015 7 M 396
## 16 SSC22015 7 F 396
## 17 SSC22015 8 M 409
## 18 SSC22015 8 F 358
## 19 SSC22015 9 M 354
## 20 SSC22015 9 F 341
## 21 SSC22015 10 M 371
## 22 SSC22015 10 F 352
## 23 SSC22015 11 M 329
## 24 SSC22015 11 F 329
## 25 SSC22015 12 M 303
## 26 SSC22015 12 F 274
## 27 SSC22015 13 M 305
## 28 SSC22015 13 F 292
## 29 SSC22015 14 M 295
## 30 SSC22015 14 F 255
## 31 SSC22015 15 M 249
## 32 SSC22015 15 F 290
## 33 SSC22015 16 M 284
## 34 SSC22015 16 F 261
## 35 SSC22015 17 M 286
## 36 SSC22015 17 F 265
## 37 SSC22015 18 M 258
## 38 SSC22015 18 F 286
## 39 SSC22015 19 M 282
## 40 SSC22015 19 F 259
## 41 SSC22015 20 M 254
## 42 SSC22015 20 F 243
## 43 SSC22015 21 M 263
## 44 SSC22015 21 F 325
## 45 SSC22015 22 M 306
## 46 SSC22015 22 F 340
## 47 SSC22015 23 M 311
## 48 SSC22015 23 F 338
## 49 SSC22015 24 M 316
## 50 SSC22015 24 F 363
## 51 SSC22015 25 M 302
## 52 SSC22015 25 F 411
## 53 SSC22015 26 M 372
## 54 SSC22015 26 F 424
## 55 SSC22015 27 M 320
## 56 SSC22015 27 F 390
## 57 SSC22015 28 M 356
## 58 SSC22015 28 F 447
## 59 SSC22015 29 M 380
## 60 SSC22015 29 F 411
## 61 SSC22015 30 M 455
## 62 SSC22015 30 F 482
## 63 SSC22015 31 M 428
## 64 SSC22015 31 F 478
## 65 SSC22015 32 M 414
## 66 SSC22015 32 F 482
## 67 SSC22015 33 M 410
## 68 SSC22015 33 F 406
## 69 SSC22015 34 M 396
## 70 SSC22015 34 F 459
## 71 SSC22015 35 M 413
## 72 SSC22015 35 F 391
## 73 SSC22015 36 M 339
## 74 SSC22015 36 F 382
## 75 SSC22015 37 M 348
## 76 SSC22015 37 F 386
## 77 SSC22015 38 M 364
## 78 SSC22015 38 F 313
## 79 SSC22015 39 M 325
## 80 SSC22015 39 F 335
## 81 SSC22015 40 M 294
## 82 SSC22015 40 F 294
## 83 SSC22015 41 M 305
## 84 SSC22015 41 F 312
## 85 SSC22015 42 M 297
## 86 SSC22015 42 F 291
## 87 SSC22015 43 M 294
## 88 SSC22015 43 F 312
## 89 SSC22015 44 M 347
## 90 SSC22015 44 F 354
## 91 SSC22015 45 M 293
## 92 SSC22015 45 F 319
## 93 SSC22015 46 M 275
## 94 SSC22015 46 F 311
## 95 SSC22015 47 M 264
## 96 SSC22015 47 F 294
## 97 SSC22015 48 M 271
## 98 SSC22015 48 F 265
## 99 SSC22015 49 M 248
## 100 SSC22015 49 F 271
## 101 SSC22015 50 M 218
## 102 SSC22015 50 F 227
## 103 SSC22015 51 M 211
## 104 SSC22015 51 F 261
## 105 SSC22015 52 M 262
## 106 SSC22015 52 F 218
## 107 SSC22015 53 M 238
## 108 SSC22015 53 F 274
## 109 SSC22015 54 M 194
## 110 SSC22015 54 F 217
## 111 SSC22015 55 M 175
## 112 SSC22015 55 F 212
#Grouping by region and age and then doing a sum and a cum sum
LargestRegion5.1 <- aggregate(x = LargestRegion2$population,
by = list(LargestRegion2$age),
FUN = sum)
LargestRegion5.1
## Group.1 x
## 1 0 878
## 2 1 971
## 3 2 918
## 4 3 975
## 5 4 965
## 6 5 847
## 7 6 819
## 8 7 792
## 9 8 767
## 10 9 695
## 11 10 723
## 12 11 658
## 13 12 577
## 14 13 597
## 15 14 550
## 16 15 539
## 17 16 545
## 18 17 551
## 19 18 544
## 20 19 541
## 21 20 497
## 22 21 588
## 23 22 646
## 24 23 649
## 25 24 679
## 26 25 713
## 27 26 796
## 28 27 710
## 29 28 803
## 30 29 791
## 31 30 937
## 32 31 906
## 33 32 896
## 34 33 816
## 35 34 855
## 36 35 804
## 37 36 721
## 38 37 734
## 39 38 677
## 40 39 660
## 41 40 588
## 42 41 617
## 43 42 588
## 44 43 606
## 45 44 701
## 46 45 612
## 47 46 586
## 48 47 558
## 49 48 536
## 50 49 519
## 51 50 445
## 52 51 472
## 53 52 480
## 54 53 512
## 55 54 411
## 56 55 387
names(LargestRegion5.1)[1] <- "age" #Rename column to age
names(LargestRegion5.1)[2] <- "population" #Rename column to population
LargestRegion5.1
## age population
## 1 0 878
## 2 1 971
## 3 2 918
## 4 3 975
## 5 4 965
## 6 5 847
## 7 6 819
## 8 7 792
## 9 8 767
## 10 9 695
## 11 10 723
## 12 11 658
## 13 12 577
## 14 13 597
## 15 14 550
## 16 15 539
## 17 16 545
## 18 17 551
## 19 18 544
## 20 19 541
## 21 20 497
## 22 21 588
## 23 22 646
## 24 23 649
## 25 24 679
## 26 25 713
## 27 26 796
## 28 27 710
## 29 28 803
## 30 29 791
## 31 30 937
## 32 31 906
## 33 32 896
## 34 33 816
## 35 34 855
## 36 35 804
## 37 36 721
## 38 37 734
## 39 38 677
## 40 39 660
## 41 40 588
## 42 41 617
## 43 42 588
## 44 43 606
## 45 44 701
## 46 45 612
## 47 46 586
## 48 47 558
## 49 48 536
## 50 49 519
## 51 50 445
## 52 51 472
## 53 52 480
## 54 53 512
## 55 54 411
## 56 55 387
# 5.1 Plot cumulative distribution for the regions with the most people.
#This graph is symmetric it has no skew, which we can see because the mean and median are equal
ggplot(data=LargestRegion5.1, aes(x=age,y=cumsum(population))) + #Plot the cum sum for the largest region
geom_point(aes(x=age,y=cumsum(population))) + #Use geom_point as our plotting method
geom_line() + #Add in a geom_line between points
ggtitle("Cumulative Distribution of people by age in the SSC22015 region") + #Add a title
geom_vline(aes(xintercept=median(age, na.rm=T)), color="red", linetype="dashed", size=1) + #Add median line
geom_vline(aes(xintercept=mean(age, na.rm=T)), color="blue", linetype="dashed", size=.5) + #Add mean line
scale_x_continuous(breaks=seq(0, 56, 2)) +# x axis Tick marks from 0-56, every 2
scale_y_continuous(breaks=seq(0, 40000, 5000)) + #Set y axis tick marks from 0 to 40000 every 5000
theme_fivethirtyeight() + #Use fivethirtyeight theme
theme(plot.title = element_text(hjust=.5, size=14)) + #Centre title
theme(axis.title = element_text()) + ylab("Cumulative Sum of the number people") + xlab("Age in years") #Label axis'
# 5.2 Plot the cumulative distribution for males and females on the same plot.
#We can see there are more cumulative females than cumulative males at every interval
ggplot(data=LargestRegion2, aes(x=age,y=cumsum(population),color=gender)) + #Plot the cum sum for the largest region
geom_point(aes(x=age,y=cumsum(population), alpha= 0.4)) + #Use geom_point as our plotting method
geom_line() + #Add in a geom_line between points
xlab("Age in Years") + ylab("Cumulative Sum of the number of people") + #Add in axis labels
ggtitle("Cumulative Distribution of People by Age and Gender in the SSC22015 Region") + #Add title
scale_x_continuous(breaks=seq(0, 56, 2)) +# x axis Tick marks from 0-56, every 2
scale_y_continuous(breaks=seq(0, 40000, 5000)) + #Set y axis tick marks from 0 to 40000 every 5000
theme(plot.title = element_text(hjust=.5), #Centre title
panel.border = element_rect(linetype = "dashed", fill = NA), #add dashed border, change panel background,
plot.background = element_rect("skyblue"),
panel.background = element_rect(fill='#c5d7c0'))#change plot background
Plot age ratio as a function of the population of the region (remove any with zero young or old)
#Question 6 & 7
#Making a new data frame for these two questions
unique_region_totals <- Assignment1Dataset %>%
group_by(region) %>%
summarize(sum_population = sum(population))
unique_region_totals
## # A tibble: 500 x 2
## region sum_population
## <chr> <int>
## 1 SSC20005 33
## 2 SSC20012 425
## 3 SSC20018 100
## 4 SSC20027 1137
## 5 SSC20029 51
## 6 SSC20048 924
## 7 SSC20062 359
## 8 SSC20076 5821
## 9 SSC20079 4978
## 10 SSC20099 3
## # ... with 490 more rows
#Vector of unique regions
VecUniqueRegions <- unique_region_totals$region
VecUniqueRegions
## [1] "SSC20005" "SSC20012" "SSC20018" "SSC20027" "SSC20029" "SSC20048"
## [7] "SSC20062" "SSC20076" "SSC20079" "SSC20099" "SSC20101" "SSC20106"
## [13] "SSC20107" "SSC20127" "SSC20135" "SSC20140" "SSC20151" "SSC20161"
## [19] "SSC20163" "SSC20167" "SSC20170" "SSC20173" "SSC20177" "SSC20179"
## [25] "SSC20190" "SSC20191" "SSC20196" "SSC20200" "SSC20201" "SSC20204"
## [31] "SSC20205" "SSC20211" "SSC20241" "SSC20249" "SSC20257" "SSC20266"
## [37] "SSC20269" "SSC20275" "SSC20276" "SSC20282" "SSC20293" "SSC20305"
## [43] "SSC20306" "SSC20312" "SSC20313" "SSC20325" "SSC20330" "SSC20337"
## [49] "SSC20343" "SSC20345" "SSC20346" "SSC20352" "SSC20353" "SSC20355"
## [55] "SSC20356" "SSC20360" "SSC20361" "SSC20366" "SSC20367" "SSC20373"
## [61] "SSC20378" "SSC20383" "SSC20391" "SSC20392" "SSC20393" "SSC20398"
## [67] "SSC20401" "SSC20407" "SSC20415" "SSC20417" "SSC20422" "SSC20423"
## [73] "SSC20433" "SSC20446" "SSC20452" "SSC20453" "SSC20473" "SSC20477"
## [79] "SSC20491" "SSC20492" "SSC20500" "SSC20502" "SSC20503" "SSC20506"
## [85] "SSC20514" "SSC20515" "SSC20516" "SSC20518" "SSC20519" "SSC20522"
## [91] "SSC20523" "SSC20525" "SSC20534" "SSC20536" "SSC20551" "SSC20556"
## [97] "SSC20558" "SSC20564" "SSC20567" "SSC20571" "SSC20577" "SSC20578"
## [103] "SSC20579" "SSC20583" "SSC20605" "SSC20615" "SSC20617" "SSC20620"
## [109] "SSC20628" "SSC20656" "SSC20660" "SSC20668" "SSC20691" "SSC20701"
## [115] "SSC20706" "SSC20718" "SSC20725" "SSC20739" "SSC20742" "SSC20747"
## [121] "SSC20770" "SSC20773" "SSC20774" "SSC20787" "SSC20796" "SSC20798"
## [127] "SSC20799" "SSC20810" "SSC20814" "SSC20817" "SSC20822" "SSC20827"
## [133] "SSC20830" "SSC20836" "SSC20837" "SSC20852" "SSC20858" "SSC20864"
## [139] "SSC20865" "SSC20870" "SSC20879" "SSC20883" "SSC20890" "SSC20894"
## [145] "SSC20911" "SSC20912" "SSC20934" "SSC20943" "SSC20948" "SSC20953"
## [151] "SSC20957" "SSC20975" "SSC20977" "SSC20986" "SSC20989" "SSC20995"
## [157] "SSC20999" "SSC21004" "SSC21011" "SSC21012" "SSC21013" "SSC21014"
## [163] "SSC21020" "SSC21026" "SSC21032" "SSC21034" "SSC21037" "SSC21040"
## [169] "SSC21049" "SSC21055" "SSC21061" "SSC21062" "SSC21077" "SSC21088"
## [175] "SSC21092" "SSC21101" "SSC21102" "SSC21113" "SSC21120" "SSC21121"
## [181] "SSC21125" "SSC21126" "SSC21136" "SSC21137" "SSC21141" "SSC21143"
## [187] "SSC21144" "SSC21152" "SSC21169" "SSC21170" "SSC21175" "SSC21178"
## [193] "SSC21184" "SSC21189" "SSC21190" "SSC21191" "SSC21193" "SSC21200"
## [199] "SSC21207" "SSC21210" "SSC21211" "SSC21214" "SSC21218" "SSC21233"
## [205] "SSC21238" "SSC21244" "SSC21251" "SSC21256" "SSC21260" "SSC21264"
## [211] "SSC21267" "SSC21281" "SSC21283" "SSC21286" "SSC21294" "SSC21304"
## [217] "SSC21305" "SSC21308" "SSC21316" "SSC21320" "SSC21324" "SSC21329"
## [223] "SSC21332" "SSC21342" "SSC21347" "SSC21348" "SSC21350" "SSC21351"
## [229] "SSC21367" "SSC21369" "SSC21374" "SSC21380" "SSC21387" "SSC21405"
## [235] "SSC21408" "SSC21412" "SSC21416" "SSC21417" "SSC21421" "SSC21426"
## [241] "SSC21440" "SSC21441" "SSC21457" "SSC21459" "SSC21461" "SSC21462"
## [247] "SSC21475" "SSC21479" "SSC21494" "SSC21505" "SSC21511" "SSC21514"
## [253] "SSC21517" "SSC21534" "SSC21536" "SSC21544" "SSC21547" "SSC21561"
## [259] "SSC21562" "SSC21567" "SSC21575" "SSC21585" "SSC21590" "SSC21594"
## [265] "SSC21597" "SSC21601" "SSC21602" "SSC21620" "SSC21630" "SSC21638"
## [271] "SSC21639" "SSC21654" "SSC21660" "SSC21666" "SSC21671" "SSC21674"
## [277] "SSC21678" "SSC21681" "SSC21683" "SSC21690" "SSC21691" "SSC21701"
## [283] "SSC21702" "SSC21719" "SSC21730" "SSC21732" "SSC21734" "SSC21736"
## [289] "SSC21741" "SSC21743" "SSC21755" "SSC21760" "SSC21769" "SSC21771"
## [295] "SSC21778" "SSC21779" "SSC21784" "SSC21798" "SSC21801" "SSC21802"
## [301] "SSC21803" "SSC21805" "SSC21808" "SSC21812" "SSC21817" "SSC21820"
## [307] "SSC21823" "SSC21830" "SSC21839" "SSC21848" "SSC21849" "SSC21858"
## [313] "SSC21862" "SSC21863" "SSC21888" "SSC21889" "SSC21890" "SSC21892"
## [319] "SSC21899" "SSC21900" "SSC21902" "SSC21905" "SSC21907" "SSC21914"
## [325] "SSC21915" "SSC21916" "SSC21918" "SSC21919" "SSC21928" "SSC21932"
## [331] "SSC21939" "SSC21945" "SSC21946" "SSC21947" "SSC21950" "SSC21951"
## [337] "SSC21956" "SSC21960" "SSC21963" "SSC21979" "SSC21988" "SSC22003"
## [343] "SSC22007" "SSC22011" "SSC22012" "SSC22015" "SSC22019" "SSC22021"
## [349] "SSC22028" "SSC22029" "SSC22030" "SSC22039" "SSC22043" "SSC22046"
## [355] "SSC22052" "SSC22053" "SSC22059" "SSC22065" "SSC22070" "SSC22072"
## [361] "SSC22075" "SSC22076" "SSC22082" "SSC22084" "SSC22085" "SSC22086"
## [367] "SSC22089" "SSC22096" "SSC22101" "SSC22106" "SSC22110" "SSC22124"
## [373] "SSC22125" "SSC22133" "SSC22134" "SSC22139" "SSC22157" "SSC22168"
## [379] "SSC22170" "SSC22175" "SSC22180" "SSC22185" "SSC22190" "SSC22193"
## [385] "SSC22200" "SSC22209" "SSC22224" "SSC22227" "SSC22232" "SSC22236"
## [391] "SSC22237" "SSC22239" "SSC22254" "SSC22263" "SSC22265" "SSC22273"
## [397] "SSC22274" "SSC22281" "SSC22283" "SSC22284" "SSC22288" "SSC22296"
## [403] "SSC22309" "SSC22323" "SSC22327" "SSC22330" "SSC22331" "SSC22333"
## [409] "SSC22338" "SSC22340" "SSC22342" "SSC22344" "SSC22349" "SSC22356"
## [415] "SSC22366" "SSC22367" "SSC22371" "SSC22373" "SSC22386" "SSC22396"
## [421] "SSC22398" "SSC22424" "SSC22442" "SSC22453" "SSC22467" "SSC22468"
## [427] "SSC22470" "SSC22476" "SSC22482" "SSC22489" "SSC22490" "SSC22495"
## [433] "SSC22500" "SSC22505" "SSC22513" "SSC22517" "SSC22521" "SSC22523"
## [439] "SSC22525" "SSC22532" "SSC22534" "SSC22556" "SSC22561" "SSC22569"
## [445] "SSC22586" "SSC22597" "SSC22598" "SSC22605" "SSC22606" "SSC22609"
## [451] "SSC22616" "SSC22621" "SSC22650" "SSC22652" "SSC22653" "SSC22659"
## [457] "SSC22660" "SSC22696" "SSC22698" "SSC22701" "SSC22702" "SSC22703"
## [463] "SSC22706" "SSC22708" "SSC22710" "SSC22717" "SSC22719" "SSC22729"
## [469] "SSC22734" "SSC22737" "SSC22744" "SSC22747" "SSC22752" "SSC22755"
## [475] "SSC22761" "SSC22764" "SSC22772" "SSC22773" "SSC22779" "SSC22800"
## [481] "SSC22803" "SSC22809" "SSC22838" "SSC22840" "SSC22844" "SSC22846"
## [487] "SSC22849" "SSC22861" "SSC22864" "SSC22867" "SSC22869" "SSC22871"
## [493] "SSC22873" "SSC22877" "SSC22886" "SSC22910" "SSC22912" "SSC22915"
## [499] "SSC22918" "SSC22922"
#Vector of unique regions population totals
VecUniquePopTotals <- unique_region_totals$sum_population
VecUniquePopTotals
## [1] 33 425 100 1137 51 924 359 5821 4978 3 38 17
## [13] 2049 3 1826 270 3 2643 178 12 314 6666 24 15
## [25] 3439 3168 18 9837 10 41 9 60 54 89 8 27
## [37] 161 52 6 138 58 3391 57 9116 9046 29 21 60
## [49] 3104 36 3 112 61 38 16 9802 11357 40 104 123
## [61] 6 3 159 103 211 3 21 2615 6 3555 3 75
## [73] 52 5 166 3735 1188 120 213 16705 39 3 3 29
## [85] 190 23 3 3772 1282 7 194 6803 5402 15972 7 239
## [97] 3363 198 9729 5016 1011 1790 7287 132 54 48 494 2306
## [109] 9 12 17442 14 58 427 18 148 6461 211 1082 45
## [121] 117 18540 12 171 2563 219 16 284 50 3981 1367 568
## [133] 1752 18 847 72 7866 239 17809 6 15608 1434 88 9
## [145] 19340 1055 862 12350 132 89 79 14 15 9 12 547
## [157] 69 32 65 3 210 5 54 3 130 37 3 17140
## [169] 1554 7 147 475 468 4 6 46 76 286 67 6281
## [181] 20939 61 6550 5 217 19180 11218 616 44 1288 22 13730
## [193] 14546 6 7 9 663 30 6 1908 6 2575 25 183
## [205] 2661 84 18 229 304 128 35 41 3 808 24 2386
## [217] 12 9818 21 2289 41 4978 5033 18 2391 18 1207 211
## [229] 12 52 9 249 22 18 176 127 12 102 88 1094
## [241] 2548 310 22 942 50 74 73 698 28 323 988 376
## [253] 538 1799 135 7282 70 3504 21 8 528 28 63 22
## [265] 289 1148 36 51 74 750 12 440 81 2960 22979 1213
## [277] 10 23 15 12 129 6 77 26 10770 28 4310 30
## [289] 15 16669 794 489 34 2545 1289 21 36 148 144 81
## [301] 55 6 265 245 40 526 69 61 40 59 15 201
## [313] 61 283 38 6 19 23 779 191 47 790 45 12
## [325] 307 216 292 37 1166 154 13 19 4 17 72 2823
## [337] 162 79 67 69 6048 36 120 20 3 37948 267 3
## [349] 3 13253 7776 71 15 98 208 109 9 204 3 1879
## [361] 60 1226 130 3 15 380 33 62 99 10596 1536 88
## [373] 337 130 159 1953 3 163 13 6 10 65 42 3
## [385] 19 5056 86 373 33 4177 3 1330 4060 12 833 12
## [397] 21 3 93 260 762 290 1933 1229 28 22 42 10937
## [409] 183 199 6 108 9 36 34 3 78 17 351 56
## [421] 73 34 12 152 1301 6 24 1412 16 714 57 30
## [433] 16 24 190 7 574 202 66 32 8174 17928 35 19274
## [445] 277 3 299 3 3 10 3 87 831 86 71 268
## [457] 2257 76 3 27 3 9 24 3838 12 4306 3 158
## [469] 18 1082 10893 4153 12544 2342 4074 21 3 12 1159 42
## [481] 57 3 205 3 32 69 15 1079 6 6 144 25
## [493] 3 3269 16 353 191 23 24 15
#6
#Old to young ratio
OldToYoung <- Assignment1Dataset %>%
group_by(region) %>%
summarise(AgeRatio = sum(population[age>=40]) / sum(population[age<40]))
#Testing the vector works, it does
OldToYoung
## # A tibble: 500 x 2
## region AgeRatio
## <chr> <dbl>
## 1 SSC20005 1.06
## 2 SSC20012 0.721
## 3 SSC20018 1
## 4 SSC20027 0.484
## 5 SSC20029 0.308
## 6 SSC20048 0.520
## 7 SSC20062 0.453
## 8 SSC20076 0.495
## 9 SSC20079 0.559
## 10 SSC20099 Inf
## # ... with 490 more rows
#Vector of Old to young age ratios
VecOldToYoung <- OldToYoung$AgeRatio
VecOldToYoung
## [1] 1.0625000 0.7206478 1.0000000 0.4843342 0.3076923 0.5197368 0.4534413
## [8] 0.4952479 0.5585473 Inf 0.9000000 0.0000000 0.5268256 0.0000000
## [15] 0.4232268 0.9014085 0.0000000 0.5251010 0.8541667 1.0000000 0.6185567
## [22] 0.3963134 3.0000000 0.2500000 0.4293433 0.4155496 1.0000000 0.3736908
## [29] 2.3333333 0.4642857 2.0000000 0.4634146 0.7419355 1.6176471 1.0000000
## [36] 8.0000000 0.7127660 1.1666667 1.0000000 0.7692308 0.4146341 0.3292826
## [43] 1.8500000 0.2760358 0.4110123 0.2608696 1.3333333 0.7647059 0.4410399
## [50] 0.0000000 0.0000000 0.6969697 0.6486486 1.0000000 0.2307692 0.2587646
## [57] 0.3253589 1.6666667 0.9622642 0.7826087 0.0000000 Inf 0.5000000
## [64] 0.4927536 0.9181818 Inf 0.1666667 0.4765669 Inf 0.4323127
## [71] 0.0000000 0.9230769 1.0800000 0.0000000 0.3495935 0.3464311 0.5448635
## [78] 0.3333333 0.6015038 0.1073914 1.6000000 Inf 0.0000000 2.2222222
## [85] 0.3970588 0.6428571 Inf 0.4006684 0.2186312 0.7500000 0.6033058
## [92] 0.3100327 0.4564573 0.4681496 0.0000000 0.6950355 0.3177900 0.6923077
## [99] 0.2848653 0.4078024 0.6201923 0.2722104 0.2454281 0.6923077 2.6000000
## [106] 0.4545455 0.6745763 0.3588686 0.0000000 0.0000000 0.3126129 3.6666667
## [113] 0.8125000 0.4139073 1.0000000 0.6086957 0.4434763 0.4066667 0.5590778
## [120] 0.5000000 1.2500000 0.3109885 3.0000000 0.6285714 0.5023447 0.7244094
## [127] 4.3333333 0.6705882 1.2727273 0.3779855 0.4730603 0.7423313 0.3518519
## [134] 0.5000000 0.6011342 0.5319149 0.4211382 0.7835821 0.4027253 Inf
## [141] 0.4270824 0.3314763 0.7959184 0.0000000 0.4383460 0.5491924 0.6264151
## [148] 0.5228113 1.1639344 1.6969697 2.0384615 3.6666667 1.5000000 0.5000000
## [155] 3.0000000 0.6779141 0.5681818 1.0000000 0.5853659 Inf 0.6666667
## [162] 0.0000000 0.4210526 0.0000000 0.6666667 1.4666667 Inf 0.3017392
## [169] 0.3655536 1.3333333 0.4700000 0.4660494 0.7660377 0.0000000 1.0000000
## [176] 1.5555556 0.8095238 0.7125749 2.0454545 0.4593401 0.3548366 0.4878049
## [183] 0.3909535 0.0000000 0.5839416 0.2726428 0.3249085 0.4426230 1.2000000
## [190] 0.4342984 0.6923077 0.4225031 0.4389158 Inf 0.0000000 0.5000000
## [197] 0.4766147 1.0000000 0.0000000 0.3580071 0.0000000 0.4807361 0.5625000
## [204] 0.6636364 0.5275545 1.3333333 0.6363636 0.7480916 0.7471264 0.5421687
## [211] 0.6666667 0.6400000 0.0000000 0.6389452 1.6666667 0.3744240 0.0000000
## [218] 0.4859997 0.9090909 0.4758221 0.5769231 0.5279312 0.4290176 1.0000000
## [225] 0.5637672 2.0000000 0.5944518 0.8347826 Inf 1.1666667 0.5000000
## [232] 0.5090909 0.3750000 1.0000000 0.7087379 0.7162162 3.0000000 0.4366197
## [239] 0.7254902 0.5695839 0.5041322 0.7415730 1.2000000 0.5621891 0.2820513
## [246] 0.2982456 0.3773585 0.3194707 0.6470588 0.7459459 0.5782748 0.7488372
## [253] 0.5197740 0.4685714 0.5000000 0.3698269 0.9444444 0.3888228 6.0000000
## [260] Inf 0.5667656 0.7500000 0.4000000 1.2000000 0.9395973 0.2573932
## [267] 0.3333333 0.8214286 0.2758621 0.5306122 1.0000000 0.6296296 1.3823529
## [274] 0.5554388 0.3822786 0.4614458 2.3333333 0.3529412 0.6666667 1.0000000
## [281] 1.1864407 0.0000000 0.7500000 1.1666667 0.4191593 1.8000000 0.4482527
## [288] 0.6666667 1.5000000 0.3773756 0.4981132 0.5377358 4.6666667 0.2931911
## [295] 0.3329886 0.4000000 0.2413793 1.1449275 0.5652174 0.9756098 0.9642857
## [302] Inf 0.5317919 0.6554054 0.4285714 0.6036585 1.8750000 0.5641026
## [309] 3.4444444 1.3600000 1.5000000 0.4055944 0.6944444 0.3872549 0.3103448
## [316] Inf 0.9000000 0.3529412 0.5769231 0.6608696 1.2380952 0.5076336
## [323] 0.6071429 0.3333333 0.6775956 0.5652174 0.8598726 0.4800000 0.5202086
## [330] 0.6739130 0.8571429 2.1666667 Inf 0.2142857 0.7142857 0.3743914
## [337] 1.0000000 1.1351351 0.9705882 0.4081633 0.4328358 1.2500000 0.5000000
## [344] 0.5384615 Inf 0.2938288 0.5523256 Inf 0.0000000 0.3653034
## [351] 0.4870912 0.8205128 1.1428571 1.3333333 0.5877863 1.3695652 0.5000000
## [358] 0.3161290 Inf 0.5776658 1.1428571 0.5578145 0.6883117 Inf
## [365] 1.5000000 0.4448669 2.3000000 1.5833333 1.0204082 0.2780123 0.3391456
## [372] 0.6603774 0.5747664 1.0312500 0.5742574 0.3821656 Inf 0.6138614
## [379] 3.3333333 0.0000000 1.5000000 0.9117647 0.8260870 0.0000000 0.5833333
## [386] 0.5284160 0.5636364 0.7511737 0.9411765 0.4438299 Inf 0.5833333
## [393] 0.4667630 3.0000000 0.4743363 Inf 1.3333333 Inf 0.4090909
## [400] 0.7333333 0.5000000 0.3679245 0.4414616 0.5796915 0.3333333 2.6666667
## [407] 0.9090909 0.2920260 0.7596154 0.8090909 1.0000000 1.2040816 0.0000000
## [414] 1.4000000 1.2666667 Inf 0.4181818 1.4285714 0.4686192 1.0740741
## [421] 0.5869565 0.8888889 3.0000000 0.6888889 0.5180863 0.0000000 0.1428571
## [428] 0.4831933 3.0000000 0.6264237 0.7272727 0.7647059 0.6000000 0.6000000
## [435] 1.0652174 1.3333333 0.5471698 0.9056604 0.8333333 2.2000000 0.4242899
## [442] 0.3759018 1.1875000 0.1811497 0.7643312 Inf 0.5333333 Inf
## [449] 0.0000000 0.6666667 Inf 1.2894737 0.5950096 0.4333333 0.6511628
## [456] 0.6645963 0.4311985 0.3818182 0.0000000 0.9285714 Inf 0.5000000
## [463] 0.0000000 0.4214815 0.0000000 0.3044532 Inf 0.6458333 1.0000000
## [470] 0.5239437 0.3940363 0.4198291 0.5033557 0.3736070 0.4503382 1.3333333
## [477] Inf 0.0000000 0.5641026 2.2307692 0.5833333 Inf 0.8303571
## [484] 0.0000000 0.6000000 1.6538462 0.0000000 0.5240113 Inf 0.0000000
## [491] 0.4545455 0.5625000 Inf 0.3928419 1.6666667 0.6650943 0.5528455
## [498] 2.8333333 1.6666667 0.2500000
#Vector of Gender ratio Females per Male
#Question 7
gender_ratio <- Assignment1Dataset %>%
group_by(region) %>%
summarise(gender.ratio = sum(population[gender=="F"])/sum(population[gender=="M"]))
gender_ratio
## # A tibble: 500 x 2
## region gender.ratio
## <chr> <dbl>
## 1 SSC20005 1.54
## 2 SSC20012 0.959
## 3 SSC20018 0.667
## 4 SSC20027 0.898
## 5 SSC20029 0.7
## 6 SSC20048 0.941
## 7 SSC20062 0.899
## 8 SSC20076 0.991
## 9 SSC20079 1.01
## 10 SSC20099 0
## # ... with 490 more rows
#Vector for gender ratio
VecGenderRatio <- gender_ratio$gender.ratio
VecGenderRatio
## [1] 1.5384615 0.9585253 0.6666667 0.8981636 0.7000000 0.9411765 0.8994709
## [8] 0.9914471 1.0056406 0.0000000 0.9000000 1.1250000 0.9990244 Inf
## [15] 1.0820981 0.8367347 Inf 1.0114155 1.5797101 3.0000000 1.2589928
## [22] 0.9815696 0.3333333 1.5000000 0.9640206 1.0063331 1.0000000 1.0104231
## [29] 0.6666667 1.5625000 2.0000000 1.3076923 0.8000000 0.8936170 1.0000000
## [36] 1.2500000 1.3000000 1.0800000 Inf 1.4642857 0.7575758 1.0184524
## [43] 0.7812500 0.9930039 1.0039876 1.4166667 1.3333333 0.9354839 1.0501982
## [50] 1.4000000 Inf 0.8360656 0.5641026 0.5200000 0.7777778 1.0895331
## [57] 1.0581733 1.5000000 0.5294118 0.9838710 1.0000000 0.0000000 1.0649351
## [64] 1.1914894 0.8672566 Inf 0.7500000 1.0038314 Inf 1.0753065
## [71] 0.0000000 0.3392857 0.8571429 Inf 1.2133333 0.9134221 1.0101523
## [78] 0.8750000 1.1958763 1.1700442 0.8571429 Inf Inf 0.5263158
## [85] 1.4675325 0.4375000 0.0000000 1.0522307 1.2218371 0.0000000 0.9795918
## [92] 0.9897631 1.0705251 1.0281905 1.3333333 0.7969925 1.0669945 0.6363636
## [99] 0.8957521 1.0540541 1.0139442 1.0838184 1.0343384 1.2000000 0.9285714
## [106] 1.6666667 0.8364312 0.9675768 0.5000000 1.0000000 0.9974805 3.6666667
## [113] 0.9333333 0.9953271 1.0000000 0.7011494 1.0576433 0.9181818 0.9963100
## [120] 0.5000000 0.8571429 1.0488452 3.0000000 0.6442308 1.0686037 0.8099174
## [127] 4.3333333 1.0285714 1.1739130 1.0126390 0.9725830 0.8745875 1.1007194
## [134] 1.0000000 0.9882629 1.2500000 1.0683671 0.8968254 0.9834057 1.0000000
## [141] 1.0499081 0.9069149 1.0465116 2.0000000 0.9966963 0.9573284 0.9113082
## [148] 1.0289141 1.0952381 0.5892857 0.7555556 1.3333333 1.5000000 0.5000000
## [155] 0.0000000 1.0110294 0.5333333 0.7777778 0.7105263 Inf 1.0000000
## [162] Inf 1.0769231 Inf 0.8840580 0.6818182 Inf 0.9284428
## [169] 1.0077519 1.3333333 1.2615385 1.1205357 0.9180328 0.0000000 Inf
## [176] 0.9166667 0.7272727 1.0428571 0.9142857 1.0392857 0.9534471 1.1785714
## [183] 1.0487957 0.0000000 1.3333333 1.0197978 1.0377838 0.9807074 1.4444444
## [190] 0.9138187 0.6923077 0.9775313 1.0341211 1.0000000 1.3333333 0.5000000
## [197] 0.9673591 1.0000000 1.0000000 1.0276302 1.0000000 1.0211931 0.9230769
## [204] 0.9468085 0.9992487 0.6470588 0.3846154 0.9572650 1.1111111 1.2857143
## [211] 0.5217391 1.7333333 0.0000000 1.0049628 0.6000000 0.9557377 3.0000000
## [218] 1.0143619 0.9090909 1.0096576 2.1538462 0.9793241 1.0253521 5.0000000
## [225] 1.0041911 0.5000000 0.8540707 1.2688172 0.3333333 1.4761905 0.0000000
## [232] 0.9606299 0.8333333 5.0000000 0.6603774 1.0483871 1.0000000 1.0816327
## [239] 0.4426230 1.1242718 1.0953947 0.7714286 2.6666667 1.0433839 1.5000000
## [246] 1.0000000 1.8076923 1.0650888 0.7500000 1.2123288 1.0932203 1.0215054
## [253] 0.9779412 1.0350679 1.1774194 0.9617457 1.0000000 1.0515222 1.3333333
## [260] 0.6000000 1.0307692 0.7500000 0.9090909 0.4666667 0.9794521 1.0140351
## [267] 1.4000000 1.2173913 0.6444444 1.2388060 0.3333333 0.7529880 0.8837209
## [274] 1.0584145 1.0400391 0.9820261 0.6666667 2.8333333 0.2500000 0.3333333
## [281] 0.9846154 Inf 0.7500000 1.6000000 1.0455840 0.8666667 0.9825207
## [288] 1.5000000 1.5000000 0.9936610 0.9178744 1.1077586 1.0000000 1.0182395
## [295] 0.9891975 2.5000000 0.8000000 1.0000000 1.0571429 0.6530612 1.2000000
## [302] Inf 1.0542636 1.0940171 0.8181818 1.1646091 1.0294118 2.0500000
## [309] 1.0000000 1.0344828 0.6666667 1.0100000 2.5882353 0.9929577 2.1666667
## [316] 1.0000000 5.3333333 2.2857143 0.8861985 0.9489796 0.4242424 1.0680628
## [323] 2.4615385 1.0000000 1.2246377 0.9115044 0.9597315 0.8500000 1.0600707
## [330] 0.7906977 3.3333333 0.1875000 0.0000000 0.2142857 0.6363636 1.0294752
## [337] 0.8837209 1.3939394 0.8611111 3.6000000 0.9940653 1.0000000 1.0338983
## [344] 0.1764706 Inf 1.0352910 1.2250000 0.0000000 Inf 0.9914350
## [351] 0.9867144 0.9722222 0.8750000 0.7192982 0.8909091 0.9122807 0.5000000
## [358] 1.0198020 0.0000000 0.9883598 0.7142857 1.0331675 1.0634921 0.0000000
## [365] 0.2500000 0.7511521 0.7368421 0.8235294 0.8679245 1.0337812 1.2196532
## [372] 0.8723404 0.9479769 0.8309859 1.0921053 0.9687500 0.0000000 0.8953488
## [379] 3.3333333 1.0000000 1.5000000 1.4074074 0.9090909 Inf 2.1666667
## [386] 1.0288925 1.3888889 1.0053763 1.0625000 1.0425428 Inf 1.0182094
## [393] 1.0118930 1.0000000 0.8846154 1.0000000 2.5000000 Inf 1.5833333
## [400] 0.9696970 0.8361446 1.2307692 1.0673797 1.0725126 0.8666667 0.6923077
## [407] 0.6800000 1.0034805 1.3164557 0.8598131 0.0000000 1.7692308 Inf
## [414] 2.0000000 1.0000000 Inf 1.2285714 3.2500000 1.2356688 0.9310345
## [421] 1.2121212 0.7894737 1.0000000 0.9240506 1.1719533 1.0000000 0.3333333
## [428] 0.9263302 1.2857143 1.1250000 0.5833333 0.4285714 0.2307692 0.6000000
## [435] 0.9791667 0.0000000 1.0500000 1.0612245 1.5384615 2.5555556 1.1533193
## [442] 1.0372727 0.8421053 0.8692658 0.7870968 0.0000000 0.9415584 0.0000000
## [449] 0.0000000 0.0000000 0.0000000 0.8913043 1.0072464 1.0476190 0.8684211
## [456] 1.2521008 1.0026619 1.0540541 Inf 0.6875000 0.0000000 0.0000000
## [463] 1.6666667 0.9661885 3.0000000 1.0721848 Inf 0.6288660 0.5000000
## [470] 0.9049296 1.0337939 1.0337904 1.0144532 1.0472028 0.9662162 0.4000000
## [477] Inf 1.0000000 0.9879931 0.5000000 2.1666667 0.0000000 1.3295455
## [484] 0.0000000 1.6666667 0.5681818 0.6666667 1.0281955 1.0000000 0.0000000
## [491] 0.7560976 0.5625000 0.0000000 0.8863243 0.7777778 1.0171429 0.9100000
## [498] 0.6428571 3.0000000 0.6666667
#Making a new df for our 6.1 and 7.1 plots
Plotsdf <- data.frame(VecUniqueRegions, VecUniquePopTotals, VecOldToYoung, VecGenderRatio)
View(Plotsdf)
head(Plotsdf)
## VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1 SSC20005 33 1.0625000 1.5384615
## 2 SSC20012 425 0.7206478 0.9585253
## 3 SSC20018 100 1.0000000 0.6666667
## 4 SSC20027 1137 0.4843342 0.8981636
## 5 SSC20029 51 0.3076923 0.7000000
## 6 SSC20048 924 0.5197368 0.9411765
#Now that we've made a new data frame we can continue with Q6.1 & Q7.1
#6.1
#Checking what the min and max values are to make a filter, removing infinite and 0 values
sort(Plotsdf$VecOldToYoung)
## [1] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [8] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [22] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [29] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [36] 0.0000000 0.1073914 0.1428571 0.1666667 0.1811497 0.2142857 0.2186312
## [43] 0.2307692 0.2413793 0.2454281 0.2500000 0.2500000 0.2573932 0.2587646
## [50] 0.2608696 0.2722104 0.2726428 0.2758621 0.2760358 0.2780123 0.2820513
## [57] 0.2848653 0.2920260 0.2931911 0.2938288 0.2982456 0.3017392 0.3044532
## [64] 0.3076923 0.3100327 0.3103448 0.3109885 0.3126129 0.3161290 0.3177900
## [71] 0.3194707 0.3249085 0.3253589 0.3292826 0.3314763 0.3329886 0.3333333
## [78] 0.3333333 0.3333333 0.3333333 0.3391456 0.3464311 0.3495935 0.3518519
## [85] 0.3529412 0.3529412 0.3548366 0.3580071 0.3588686 0.3653034 0.3655536
## [92] 0.3679245 0.3698269 0.3736070 0.3736908 0.3743914 0.3744240 0.3750000
## [99] 0.3759018 0.3773585 0.3773756 0.3779855 0.3818182 0.3821656 0.3822786
## [106] 0.3872549 0.3888228 0.3909535 0.3928419 0.3940363 0.3963134 0.3970588
## [113] 0.4000000 0.4000000 0.4006684 0.4027253 0.4055944 0.4066667 0.4078024
## [120] 0.4081633 0.4090909 0.4110123 0.4139073 0.4146341 0.4155496 0.4181818
## [127] 0.4191593 0.4198291 0.4210526 0.4211382 0.4214815 0.4225031 0.4232268
## [134] 0.4242899 0.4270824 0.4285714 0.4290176 0.4293433 0.4311985 0.4323127
## [141] 0.4328358 0.4333333 0.4342984 0.4366197 0.4383460 0.4389158 0.4410399
## [148] 0.4414616 0.4426230 0.4434763 0.4438299 0.4448669 0.4482527 0.4503382
## [155] 0.4534413 0.4545455 0.4545455 0.4564573 0.4593401 0.4614458 0.4634146
## [162] 0.4642857 0.4660494 0.4667630 0.4681496 0.4685714 0.4686192 0.4700000
## [169] 0.4730603 0.4743363 0.4758221 0.4765669 0.4766147 0.4800000 0.4807361
## [176] 0.4831933 0.4843342 0.4859997 0.4870912 0.4878049 0.4927536 0.4952479
## [183] 0.4981132 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000
## [190] 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5023447 0.5033557
## [197] 0.5041322 0.5076336 0.5090909 0.5180863 0.5197368 0.5197740 0.5202086
## [204] 0.5228113 0.5239437 0.5240113 0.5251010 0.5268256 0.5275545 0.5279312
## [211] 0.5284160 0.5306122 0.5317919 0.5319149 0.5333333 0.5377358 0.5384615
## [218] 0.5421687 0.5448635 0.5471698 0.5491924 0.5523256 0.5528455 0.5554388
## [225] 0.5578145 0.5585473 0.5590778 0.5621891 0.5625000 0.5625000 0.5636364
## [232] 0.5637672 0.5641026 0.5641026 0.5652174 0.5652174 0.5667656 0.5681818
## [239] 0.5695839 0.5742574 0.5747664 0.5769231 0.5769231 0.5776658 0.5782748
## [246] 0.5796915 0.5833333 0.5833333 0.5833333 0.5839416 0.5853659 0.5869565
## [253] 0.5877863 0.5944518 0.5950096 0.6000000 0.6000000 0.6000000 0.6011342
## [260] 0.6015038 0.6033058 0.6036585 0.6071429 0.6086957 0.6138614 0.6185567
## [267] 0.6201923 0.6264151 0.6264237 0.6285714 0.6296296 0.6363636 0.6389452
## [274] 0.6400000 0.6428571 0.6458333 0.6470588 0.6486486 0.6511628 0.6554054
## [281] 0.6603774 0.6608696 0.6636364 0.6645963 0.6650943 0.6666667 0.6666667
## [288] 0.6666667 0.6666667 0.6666667 0.6666667 0.6705882 0.6739130 0.6745763
## [295] 0.6775956 0.6779141 0.6883117 0.6888889 0.6923077 0.6923077 0.6923077
## [302] 0.6944444 0.6950355 0.6969697 0.7087379 0.7125749 0.7127660 0.7142857
## [309] 0.7162162 0.7206478 0.7244094 0.7254902 0.7272727 0.7333333 0.7415730
## [316] 0.7419355 0.7423313 0.7459459 0.7471264 0.7480916 0.7488372 0.7500000
## [323] 0.7500000 0.7500000 0.7511737 0.7596154 0.7643312 0.7647059 0.7647059
## [330] 0.7660377 0.7692308 0.7826087 0.7835821 0.7959184 0.8090909 0.8095238
## [337] 0.8125000 0.8205128 0.8214286 0.8260870 0.8303571 0.8333333 0.8347826
## [344] 0.8541667 0.8571429 0.8598726 0.8888889 0.9000000 0.9000000 0.9014085
## [351] 0.9056604 0.9090909 0.9090909 0.9117647 0.9181818 0.9230769 0.9285714
## [358] 0.9395973 0.9411765 0.9444444 0.9622642 0.9642857 0.9705882 0.9756098
## [365] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [372] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [379] 1.0000000 1.0000000 1.0000000 1.0204082 1.0312500 1.0625000 1.0652174
## [386] 1.0740741 1.0800000 1.1351351 1.1428571 1.1428571 1.1449275 1.1639344
## [393] 1.1666667 1.1666667 1.1666667 1.1864407 1.1875000 1.2000000 1.2000000
## [400] 1.2000000 1.2040816 1.2380952 1.2500000 1.2500000 1.2666667 1.2727273
## [407] 1.2894737 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333
## [414] 1.3333333 1.3600000 1.3695652 1.3823529 1.4000000 1.4285714 1.4666667
## [421] 1.5000000 1.5000000 1.5000000 1.5000000 1.5000000 1.5555556 1.5833333
## [428] 1.6000000 1.6176471 1.6538462 1.6666667 1.6666667 1.6666667 1.6666667
## [435] 1.6969697 1.8000000 1.8500000 1.8750000 2.0000000 2.0000000 2.0384615
## [442] 2.0454545 2.1666667 2.2000000 2.2222222 2.2307692 2.3000000 2.3333333
## [449] 2.3333333 2.6000000 2.6666667 2.8333333 3.0000000 3.0000000 3.0000000
## [456] 3.0000000 3.0000000 3.0000000 3.0000000 3.3333333 3.4444444 3.6666667
## [463] 3.6666667 4.3333333 4.6666667 6.0000000 8.0000000 Inf Inf
## [470] Inf Inf Inf Inf Inf Inf Inf
## [477] Inf Inf Inf Inf Inf Inf Inf
## [484] Inf Inf Inf Inf Inf Inf Inf
## [491] Inf Inf Inf Inf Inf Inf Inf
## [498] Inf Inf Inf
#The min excluding 0 is .107 and the max excluding infinite is 8.0
PlotOldToYoung <- filter(Plotsdf,VecOldToYoung>0.1 & VecOldToYoung <8.5)
head(PlotOldToYoung)
## VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1 SSC20005 33 1.0625000 1.5384615
## 2 SSC20012 425 0.7206478 0.9585253
## 3 SSC20018 100 1.0000000 0.6666667
## 4 SSC20027 1137 0.4843342 0.8981636
## 5 SSC20029 51 0.3076923 0.7000000
## 6 SSC20048 924 0.5197368 0.9411765
#Calculating pearsons correlation coefficient
cor.test(x=PlotOldToYoung$VecUniquePopTotals, y=PlotOldToYoung$VecOldToYoung)
##
## Pearson's product-moment correlation
##
## data: PlotOldToYoung$VecUniquePopTotals and PlotOldToYoung$VecOldToYoung
## t = -5.2121, df = 429, p-value = 2.908e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3308645 -0.1531083
## sample estimates:
## cor
## -0.2440351
#We can see there is a low negative linear relationship
#6.1
ggplot(data=PlotOldToYoung,aes(x=VecUniquePopTotals,y=VecOldToYoung))+ #Create basic plot of gender ratio by region
geom_point(size=0.5, position="jitter") + #Change point size and jitter points to avoid them hitting each other
geom_hline(aes(yintercept=mean(PlotOldToYoung$VecOldToYoung, na.rm=T)), color="blue", linetype="dashed", size=1) +
#Add a blue mean line for the y axis
geom_hline(aes(yintercept=median(PlotOldToYoung$VecOldToYoung, na.rm=T)), color="red", linetype="dashed", size=1) +
##Add a red median line for the y axis
geom_smooth(method="lm")+ #Add linear regression line
coord_cartesian(ylim = c(0.1,9.4)) + #Setting the view frame, this still includes the lowest & highest values
scale_y_continuous(trans = 'log2') + #Setting the y axis scale to log2 but maintain raw values
scale_x_continuous(trans = 'log2') + #Setting the x axis scale to log2 but maintain raw values
labs(y = "People 40 and over per Person Under 40", x = "Region Total Population") + #Label axis'
geom_text(label=PlotOldToYoung$VecUniqueRegions,size=2, alpha= 0.5) + #Label each dot with its region name in text
ggtitle("Age Ratio (People 40 and over per person Under 40) for each region as a function of its Population")+ #Add title
theme(panel.background = element_rect(fill="#F2CF59",color="pink"),
plot.background = element_rect("cornflowerblue"),#Change plot background colour
plot.title = element_text(hjust= 0.5, size=11), #Centre title, change title size
panel.border = element_rect(linetype = "dashed", fill = NA)) #Changed panel colour, add dashed border
# 6.1 Plot the ratio of old to young people using 40 years old as a cut off.
Plot gender ratio as a function of the population of the region (remove any with zero M or F)
#7
sort(Plotsdf$VecGenderRatio, decreasing = F)
## [1] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [8] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [22] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
## [29] 0.1764706 0.1875000 0.2142857 0.2307692 0.2500000 0.2500000 0.3333333
## [36] 0.3333333 0.3333333 0.3333333 0.3333333 0.3392857 0.3846154 0.4000000
## [43] 0.4242424 0.4285714 0.4375000 0.4426230 0.4666667 0.5000000 0.5000000
## [50] 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5200000
## [57] 0.5217391 0.5263158 0.5294118 0.5333333 0.5625000 0.5641026 0.5681818
## [64] 0.5833333 0.5892857 0.6000000 0.6000000 0.6000000 0.6288660 0.6363636
## [71] 0.6363636 0.6428571 0.6442308 0.6444444 0.6470588 0.6530612 0.6603774
## [78] 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6800000
## [85] 0.6818182 0.6875000 0.6923077 0.6923077 0.7000000 0.7011494 0.7105263
## [92] 0.7142857 0.7192982 0.7272727 0.7368421 0.7500000 0.7500000 0.7500000
## [99] 0.7500000 0.7511521 0.7529880 0.7555556 0.7560976 0.7575758 0.7714286
## [106] 0.7777778 0.7777778 0.7777778 0.7812500 0.7870968 0.7894737 0.7906977
## [113] 0.7969925 0.8000000 0.8000000 0.8099174 0.8181818 0.8235294 0.8309859
## [120] 0.8333333 0.8360656 0.8361446 0.8364312 0.8367347 0.8421053 0.8500000
## [127] 0.8540707 0.8571429 0.8571429 0.8571429 0.8598131 0.8611111 0.8666667
## [134] 0.8666667 0.8672566 0.8679245 0.8684211 0.8692658 0.8723404 0.8745875
## [141] 0.8750000 0.8750000 0.8837209 0.8837209 0.8840580 0.8846154 0.8861985
## [148] 0.8863243 0.8909091 0.8913043 0.8936170 0.8953488 0.8957521 0.8968254
## [155] 0.8981636 0.8994709 0.9000000 0.9049296 0.9069149 0.9090909 0.9090909
## [162] 0.9090909 0.9100000 0.9113082 0.9115044 0.9122807 0.9134221 0.9138187
## [169] 0.9142857 0.9166667 0.9178744 0.9180328 0.9181818 0.9230769 0.9240506
## [176] 0.9263302 0.9284428 0.9285714 0.9310345 0.9333333 0.9354839 0.9411765
## [183] 0.9415584 0.9468085 0.9479769 0.9489796 0.9534471 0.9557377 0.9572650
## [190] 0.9573284 0.9585253 0.9597315 0.9606299 0.9617457 0.9640206 0.9661885
## [197] 0.9662162 0.9673591 0.9675768 0.9687500 0.9696970 0.9722222 0.9725830
## [204] 0.9775313 0.9779412 0.9791667 0.9793241 0.9794521 0.9795918 0.9807074
## [211] 0.9815696 0.9820261 0.9825207 0.9834057 0.9838710 0.9846154 0.9867144
## [218] 0.9879931 0.9882629 0.9883598 0.9891975 0.9897631 0.9914350 0.9914471
## [225] 0.9929577 0.9930039 0.9936610 0.9940653 0.9953271 0.9963100 0.9966963
## [232] 0.9974805 0.9990244 0.9992487 1.0000000 1.0000000 1.0000000 1.0000000
## [239] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [246] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [253] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 1.0000000 1.0000000 1.0000000 1.0026619 1.0034805 1.0038314
## [267] 1.0039876 1.0041911 1.0049628 1.0053763 1.0056406 1.0063331 1.0072464
## [274] 1.0077519 1.0096576 1.0100000 1.0101523 1.0104231 1.0110294 1.0114155
## [281] 1.0118930 1.0126390 1.0139442 1.0140351 1.0143619 1.0144532 1.0171429
## [288] 1.0182094 1.0182395 1.0184524 1.0197978 1.0198020 1.0211931 1.0215054
## [295] 1.0253521 1.0276302 1.0281905 1.0281955 1.0285714 1.0288925 1.0289141
## [302] 1.0294118 1.0294752 1.0307692 1.0331675 1.0337812 1.0337904 1.0337939
## [309] 1.0338983 1.0341211 1.0343384 1.0344828 1.0350679 1.0352910 1.0372727
## [316] 1.0377838 1.0392857 1.0400391 1.0425428 1.0428571 1.0433839 1.0455840
## [323] 1.0465116 1.0472028 1.0476190 1.0483871 1.0487957 1.0488452 1.0499081
## [330] 1.0500000 1.0501982 1.0515222 1.0522307 1.0540541 1.0540541 1.0542636
## [337] 1.0571429 1.0576433 1.0581733 1.0584145 1.0600707 1.0612245 1.0625000
## [344] 1.0634921 1.0649351 1.0650888 1.0669945 1.0673797 1.0680628 1.0683671
## [351] 1.0686037 1.0705251 1.0721848 1.0725126 1.0753065 1.0769231 1.0800000
## [358] 1.0816327 1.0820981 1.0838184 1.0895331 1.0921053 1.0932203 1.0940171
## [365] 1.0952381 1.0953947 1.1007194 1.1077586 1.1111111 1.1205357 1.1242718
## [372] 1.1250000 1.1250000 1.1533193 1.1646091 1.1700442 1.1719533 1.1739130
## [379] 1.1774194 1.1785714 1.1914894 1.1958763 1.2000000 1.2000000 1.2121212
## [386] 1.2123288 1.2133333 1.2173913 1.2196532 1.2218371 1.2246377 1.2250000
## [393] 1.2285714 1.2307692 1.2356688 1.2388060 1.2500000 1.2500000 1.2521008
## [400] 1.2589928 1.2615385 1.2688172 1.2857143 1.2857143 1.3000000 1.3076923
## [407] 1.3164557 1.3295455 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333
## [414] 1.3333333 1.3333333 1.3888889 1.3939394 1.4000000 1.4000000 1.4074074
## [421] 1.4166667 1.4444444 1.4642857 1.4675325 1.4761905 1.5000000 1.5000000
## [428] 1.5000000 1.5000000 1.5000000 1.5000000 1.5000000 1.5384615 1.5384615
## [435] 1.5625000 1.5797101 1.5833333 1.6000000 1.6666667 1.6666667 1.6666667
## [442] 1.7333333 1.7692308 1.8076923 2.0000000 2.0000000 2.0000000 2.0500000
## [449] 2.1538462 2.1666667 2.1666667 2.1666667 2.2857143 2.4615385 2.5000000
## [456] 2.5000000 2.5555556 2.5882353 2.6666667 2.8333333 3.0000000 3.0000000
## [463] 3.0000000 3.0000000 3.0000000 3.2500000 3.3333333 3.3333333 3.6000000
## [470] 3.6666667 4.3333333 5.0000000 5.0000000 5.3333333 Inf Inf
## [477] Inf Inf Inf Inf Inf Inf Inf
## [484] Inf Inf Inf Inf Inf Inf Inf
## [491] Inf Inf Inf Inf Inf Inf Inf
## [498] Inf Inf Inf
#Lowest is 0.176 to highest which is 5.33
head(Plotsdf)
## VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1 SSC20005 33 1.0625000 1.5384615
## 2 SSC20012 425 0.7206478 0.9585253
## 3 SSC20018 100 1.0000000 0.6666667
## 4 SSC20027 1137 0.4843342 0.8981636
## 5 SSC20029 51 0.3076923 0.7000000
## 6 SSC20048 924 0.5197368 0.9411765
#Filtering out the regions with 0 males or 0 females
PlotGenRatio <- filter(Plotsdf,VecGenderRatio>0.1 & VecGenderRatio <5.5)
head(PlotGenRatio)
## VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1 SSC20005 33 1.0625000 1.5384615
## 2 SSC20012 425 0.7206478 0.9585253
## 3 SSC20018 100 1.0000000 0.6666667
## 4 SSC20027 1137 0.4843342 0.8981636
## 5 SSC20029 51 0.3076923 0.7000000
## 6 SSC20048 924 0.5197368 0.9411765
#Amended this by getting the pearsons correlation
cor.test(x=PlotGenRatio$VecUniquePopTotals, y=PlotGenRatio$VecGenderRatio)
##
## Pearson's product-moment correlation
##
## data: PlotGenRatio$VecUniquePopTotals and PlotGenRatio$VecGenderRatio
## t = -1.2368, df = 444, p-value = 0.2168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.15062780 0.03444502
## sample estimates:
## cor
## -0.05859485
#We can see there is no linear relationship between the two variables
# 7.1 Plot the gender ratio as a function of the population of the region.
#There is no relationship between the x and y variables in this graph
ggplot(data=PlotGenRatio,aes(x=VecUniquePopTotals,y=VecGenderRatio))+ #Create basic plot of gender ratio by region
geom_point(size=0.5, position="jitter") + #Change point size and jitter points to avoid them hitting each other
geom_hline(aes(yintercept=mean(PlotGenRatio$VecGenderRatio, na.rm=T)), color="blue", linetype="dashed", size=1) +
geom_hline(aes(yintercept=median(PlotGenRatio$VecGenderRatio, na.rm=T)), color="red", linetype="dashed", size=1) +
coord_cartesian(ylim = c(0.2,5.5)) + #Setting the view frame, this still includes the lowest & highest values
scale_y_continuous(trans = 'log2') + #Setting the y axis scale to log2 but maintain raw values
scale_x_continuous(trans = 'log2') + #Setting the x axis scale to log2 but maintain raw values
labs(y = "Females per Male Gender Ratio", x = "Region Total Population") + #Label axis'
geom_text(label=PlotGenRatio$VecUniqueRegions,size=2, alpha= 0.5) + #Label each dot with its region name in text
ggtitle("Gender Ratio (Females per Male) for each region as a function of its Population ")+ #Add title
theme(panel.background = element_rect(fill="#f4edca",color="pink"),plot.background = element_rect("#46dbdf"),
plot.title = element_text(hjust= 0.5), #Centre title
panel.border = element_rect(linetype = "dashed", fill = NA)) + #Change panel and plot background colours, Centre title
geom_smooth(method='loess', formula= y~x, na.rm = T, colour="black", size=0.9) #Use loess for non-linear model
Slice by a single age group, e.g. 20-30 year olds. Find the top 2 regions and the percentage for male and female. (Requires an inner join from dplyr). Find the top 2 regions and the percentage for male and female. Either ones with the largest population of the selected demographic, or the largest percentage is also fine as long as their justification is logical.
#8
#Make a vector of the age column from our looped data frame
VecEight.One <- df$age
head(VecEight.One,10)
## [1] 0 0 0 0 0 0 0 0 0 0
#Check the mean and median values to get an idea of where to begin our search
mean(VecEight.One)
## [1] 27.76707
median(VecEight.One)
## [1] 28
# Input Parameters
# Modify here to iterate with different ages, I already did it and found 21 + or - 2 is the best
ideal.age = 21
span = 2
###########################
min.age = ideal.age - span
max.age = ideal.age + span
# 1) Filter by target age
# Target.Age is a data frame with the individuals with the range of age of interest.
Target.Age = Assignment1Dataset %>% filter(age >= min.age & age <= max.age )
# 2) Filter by gender
Male.Targets <- Target.Age %>% filter(gender == 'M')
Female.Targets <- Target.Age %>% filter(gender == 'F')
# 3) Get the amount of people per region by summing the population column
# Male.Region.Target is a data frame with the region and the number of people in the region
Male.Region.Target = aggregate(x = Male.Targets$population,
by = list(Male.Targets$region),
FUN = sum)
# rename the columns for readability
colnames(Male.Region.Target) = c("region","total.males")
# apply the same to females
Female.Region.Target = aggregate(x = Female.Targets$population,
by = list(Female.Targets$region),
FUN = sum)
colnames(Female.Region.Target) = c("region","total.females")
# 4) Merge by Region Males and Females using the region ID as an anchor
Target.Cohort = inner_join(Male.Region.Target, Female.Region.Target, by="region")
# 5) Create a new column that sums the total males and females to bu used to sort and select the most
# There are abundant regions with the cohort of interest.
# Adding a column to the data frame
Target.Cohort$Total = Target.Cohort$total.males + Target.Cohort$total.females
# 6) Order the data frame based on Total and select the top 2 regions
# 8.2) The print shows the top 2 regions with the most numbers of males and females for the range of age selected
# These regions will maximize the audience of the product
print(Target.Cohort[order(Target.Cohort$Total, decreasing = T),][1:2,] ) # order by decreasing order & select the top two rows [1:2,]
## region total.males total.females Total
## 80 SSC20492 2499 3434 5933
## 346 SSC22015 1416 1505 2921
You will need to make a sample of ages from the cumulative distribution calculated for the most populous region. Select N samples of 1000 age values each from the cumulative distribution for the region with the most people. Then output a plot after summing 1, 10, 100 and 1000 random samples.
# Question 9 Demonstrate the following
# 9.1) The central limit theorem by selecting N samples of 100 age values each from the cumulative distribution
# for the region with the most people (Q4d).
#################
# Function ageFrom, this function determines the age of random number
# in the mapping space from 1 to cumsum(population). i.e for values of i_ran 1 to 878, the output age
# is going to be 0
# Inputs:
# i_rand - integer value assumed to be from 1 to max of cumsum(population)
# df - data.frame continuing the Largest region
# Output:
# age corresponding to the cumsum
#############################################
ageFrom <- function(i_rand, df){
id <- max(1, which.max(df$cumsum > i_rand)-1)
if(i_rand==max(df$cumsum)) id <- length(df$cumsum)
return(df$age[id])
}
#################
# Function sample.ages, loops from 1 to N to contain sampling every time Total random ages
# this function calls ageFrom. Results are returned as a matrix of size TotalxN
# Inputs:
# Largest.Region - data.frame continuing the Largest region it needs to have a column cumsum
# N - the number samples contingin Total measurements to collect
# Total - the number of measurements to take, 100 as default
# Output:
# mtx, is a matrix containing the N samples
#############################################
sample.ages = function(Largest.Region, N, Total){
# Generate a matrix for the sampling results
# the rows contain the 100 ages
# and the cols are the number of samples from 1 to N
mtx = matrix(0, nrow = Total, ncol = N)
for(i in 1:N){
# this takes 100 numbers from 1 to the total of the cumulative distribution
# this simulates taking a random sample of 100 people
# i_rand is a vector of size 100
i_rand = round( runif(Total, 1, max(Largest.Region$cumsum)) )
# select for a every i_rand number check which age category corresponds
# using the ageFrom function, this will return 100 ages
# store in a column as [,i]
mtx[,i] = sapply(i_rand, ageFrom, df = Largest.Region)
}
return(mtx)
}
# Most crowded region
Largest.Region = Assignment1Dataset %>% filter (region=="SSC22015")
Largest.Region$cumsum = cumsum(Largest.Region$population)
# N is the number of samples, each sample contains 100 age values from the cumulative distribution
N = 1000
Total = 100
# Take N samples by calling sample.ages
# N.Samples is a matrix with Total number of rows and N columns
N.Samples = sample.ages(Largest.Region, N, Total)
# Population mean mu, this is the value of the mean of the population
mu = weighted.mean(Largest.Region$age, Largest.Region$population)
# the apply function will take the mean of the first 10 columns of N.Samples, apply only works on matrices
x.bar = mean(apply(N.Samples[,1:10], 2, mean)) # mean of sample means, this by CLT should be similar to mu
# 9.1) CLT if we take N samples of size 100, we would expect that the means of the N samples are Normally
# distributed around the population mean mu
# plot 9.1 using ggplot
# use stack on matrices to make the ggplot data type happy
merged.data = stack(as.data.frame(N.Samples[,1:10]))
ggplot(merged.data) +
geom_boxplot(aes(x = ind, y = values, fill=ind))+
labs(title="CLT Using 10 Samples of 100 Observations: Population (red) vs Sample (blue) means",
x ="Sample (N)", y = "Age Distribution (years)") +
geom_hline(yintercept = mu, col="red", size=1)+ # Plot the population mean mu
geom_hline(yintercept = x.bar, col="blue",linetype="dotted", size=1) + # Plot the means of the 10 samples, CLT stats they shold be quite similar
theme(plot.title = element_text(hjust=.5, size=12), panel.border = element_rect(linetype = "dashed", fill = NA),
plot.background = element_rect("skyblue"), panel.background = element_rect(fill='#c5d7c0'))
# 9.2) Add another sample of 100 ages drawn from the same sample.
# The effect of adding leads to a smoother histogram.
x1 = N.Samples[,1] ## First Sample
x2 = N.Samples[,2] ## Second Sample
x = x1 + x2
# gglplot
#Need to make titles smaller
# syntax for vectors, ylab is based on the probability not in the counts, that will make histograms comparable
p1 = ggplot() + aes(x1) + geom_histogram(aes(y = ..density..), binwidth=density(x1)$bw) +
geom_density(fill="red", alpha = 0.1) + # This plots the density curve based on your hist data
labs(title="First Sample N=1",
x ="N=1", y = "Density") +
theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))
p2 = ggplot() + aes(x2) + geom_histogram(aes(y = ..density..), binwidth=density(x2)$bw) +
geom_density(fill="blue", alpha = 0.1) + # This plots the density curve based on your hist data
labs(title="Second Sample N=2",
x ="N=2", y = "Density") +
theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))
p3 = ggplot() + aes(x) + geom_histogram(aes(y = ..density..), binwidth=density(x)$bw) +
geom_density(fill="green", alpha = 0.1) + # This plots the density curve based on your hist data
labs(title="Sum of Samples N=1 + N=2",
x ="N = N1 + N2", y = "Density") +
theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))
plot_grid(p1, p2, p3, nrow = 1) # This arranges your plots in a canvas, one row to plot all
# 9.3) Repeat this process for N = 1 (the original sample), 10, 100 and 1000 samples.
# Already generated in a previous step
print(dim(N.Samples))
## [1] 100 1000
# 9.4) 9.4 Plot the resulting distribution along with a normal distribution;
# using the mean and standard deviation after each addition.
par(mfrow = c(2, 2)) # tell the layout, 2 row s and 2 cols
## For N=10
## instead of doing a for loop we can use apply
## sum all the rows of from 1 to 100 cols
# this returns a vector of size 100 with the total sum.
X.10 = apply(N.Samples[,1:10],1,sum)
# For N=100
X.100 = apply(N.Samples[,1:100],1,sum)
# For N=1000, all the matrix
X.1000 = apply(N.Samples,1,sum)
# ggplot
df.Orig = data.frame(sample = N.Samples[,1])
Orig = ggplot(df.Orig, aes(x = sample)) +
geom_histogram(aes(y =..density..),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df.Orig$sample), sd = sd(df.Orig$sample)), size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
labs(title="N=1 vs Normal(mean(N=1), sd(N=1))",
x ="sample", y = "Density") +
theme(plot.title = element_text(size=10, face= "bold", hjust=0.5))
df.Sample.10 = data.frame(sample = X.10)
Sample.10 = ggplot(df.Sample.10, aes(x = sample)) +
geom_histogram(aes(y =..density..),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df.Sample.10$sample), sd = sd(df.Sample.10$sample)), col="blue", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
labs(title="N10 = { N=1 + ...+ N=10} vs Normal(mean(N), sd(N)",
x ="sample", y = "Density") +
theme(plot.title = element_text(size=10, face="bold", hjust=0.5))
df.Sample.100 = data.frame(sample = X.100)
Sample.100 = ggplot(df.Sample.100, aes(x = sample)) +
geom_histogram(aes(y =..density..),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df.Sample.100$sample), sd = sd(df.Sample.100$sample)), col="red", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
labs(title="N100 = { N=1 + ...+ N=100} vs Normal(mean(N), sd(N)",
x ="sample", y = "Density") +
theme(plot.title = element_text(size=10, face= "bold", hjust=0.5))
df.Sample.1000 = data.frame(sample = X.1000)
Sample.1000 = ggplot(df.Sample.1000, aes(x = sample)) +
geom_histogram(aes(y =..density..),
colour = "black",
fill = "white") +
stat_function(fun = dnorm, args = list(mean = mean(df.Sample.1000$sample), sd = sd(df.Sample.1000$sample)), col="green", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
labs(title="N100 = { N=1 + ...+ N=100} vs Normal(mean(N), sd(N)",
x ="sample", y = "Density") +
theme(plot.title = element_text(size=10, face="bold", hjust=.5))
plot_grid(Orig, Sample.10 ,Sample.100, Sample.1000, nrow = 2, ncol=2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Note that the distributions are normal and by increasing N the variation reduces