Introduction

options(warn= -1)
#Load libraries
library(ggplot2)
library(dplyr)
library(radiant.data)
library(ggthemes)
library(hexbin)
library(rmarkdown)
library(RColorBrewer)
library(readr)
library(ggthemes)
library(cowplot)
library(knitr)

Question 1:

Check data and find: Number of regions, number of genders, age ranges , total rows, number of columns.

# read the data 
Assignment1Dataset <- read.csv("Assignment1DataSet.txt")

#Checking for missing data
any(is.na(Assignment1Dataset)) #We have no missing data

## [1] FALSE

# 1.1 Print out the dimensions of the data frame.
#To find out the dimensions of the dataset we run
print(dim(Assignment1Dataset))

## [1] 56000     4

# 1.2 Print out the names and type of each of the data frame’s columns. 
#To find out the column names
print(colnames(Assignment1Dataset))

## [1] "region"     "age"        "gender"     "population"

#Finding out what type of variable R thinks each column is
print(class(Assignment1Dataset$region))

## [1] "character"

print(class(Assignment1Dataset$age))

## [1] "integer"

print(class(Assignment1Dataset$gender))

## [1] "character"

print(class(Assignment1Dataset$population))

## [1] "integer"

# 1.3 Print out the number of unique regions in the dataset. 
print( length( unique(Assignment1Dataset$region) ) ) #Print the length of the unique regions

## [1] 500

# 1.4 What is the minimum age bin? 
print( min(Assignment1Dataset$age) ) #Print the minimum age bin

## [1] 0

# 1.5 What is the maximum age bin? 
print( max(Assignment1Dataset$age)  ) #Print the maximum age bin

## [1] 55

# 1.6 What is the bin size for the age field?
print(1) #It goes from 0 to 55 in intervals of 1, therefore the bin size is 1.

## [1] 1

Question 2:

Summary statistics of all data. Note entire data set

# 2.1 Use the expected value for the age to find the mean age for the whole data sample. 
WeightedAverage <- weighted.mean(Assignment1Dataset$age, Assignment1Dataset$population)
WeightedAverage #Use the weighted.mean function to calculate the weighted mean, assign it to variable WeightedAverage

## [1] 27.80027

# 2.2 Provide the standard deviation for the whole data sample. 
weighted.sd(Assignment1Dataset$age, Assignment1Dataset$population) #Calculate weighted standard deviation

## [1] 15.77804

Question 3:

Record the following statistics on the means from each region: Create a new data frame based on the means from each region. make sure they combine male and female values to get age distributions

# We can write a loop to make a separate entry for every person in the age column of a row,
# lets unwrap the Assignment1Dataset, we will use this later
region.l = list()
age.l = list()
gender.l = list()
# no need for population as we are adding every individual
# we will make a loop for all rows
# add to list for performance
for(i in 1:length(Assignment1Dataset$region)){
  X = NULL; X = Assignment1Dataset[i,] # loop for every entry
  # We need to add X$population repeated entries
  population = NULL;  population = X$population[1]
  if(population==0){
    population = 1 # otherwise the rep function breaks
  }
  region.l[[i]] =  rep(X$region[1],population ) # append data to the vector
  age.l[[i]] =  rep(X$age[1], population) # append data to the vector
  gender.l[[i]] =  rep(X$gender[1], population) # append data to the vector
}

# This will take a while (5-10 minutes)
region = Reduce(c, region.l)
age = Reduce(c, age.l)
gender = Reduce(c, gender.l)

# remove unnecessary data
rm(region.l, age.l, gender.l)

df = data.frame(ID = seq(from = 1, to = length(region), by = 1), # useful to make a new data frame
                region = region,
                age = age,
                gender = gender)
# a new entry was made, retaining the information in the region, age & gender columns 
#(from the Assignment1Dataset data frame)

Looped_df_age_avg = df %>% group_by(region) %>%
  summarize(avg_age = mean(age), #Mean age for each unique region
            SD = sd(age), #Standard deviation for each unique region
            Min = min(age), #Minimum age for each unique region
            Q1 = quantile(age,probs =.25), #Quartile 1 for each unique region
            Median = median(age), #Median for each unique region
            Q3 = quantile(age,probs = .75), #Quartile 3 for each unique region
            Max = max(age), #Max age for each unique region
            IQR = IQR(age), .groups = "keep") #IQR for each region. To suppress the warning we add groups = "keep"
Looped_df_age_avg #See what the variable looks like

## # A tibble: 500 x 9
## # Groups:   region [500]
##    region   avg_age    SD   Min    Q1 Median    Q3   Max   IQR
##    <chr>      <dbl> <dbl> <int> <dbl>  <dbl> <dbl> <int> <dbl>
##  1 SSC20005    28.0  16.2     0    13     28  42.5    55  29.5
##  2 SSC20012    29.8  17.3     0    14     30  47      55  33  
##  3 SSC20018    29.7  17.0     0    14     30  45.5    55  31.5
##  4 SSC20027    27.9  16.8     0    13     28  43      55  30  
##  5 SSC20029    26.0  16.3     0    12     25  41      55  29  
##  6 SSC20048    29.3  16.2     0    15     31  44      55  29  
##  7 SSC20062    28.8  16.1     0    15     29  43      55  28  
##  8 SSC20076    27.9  16.8     0    13     28  43      55  30  
##  9 SSC20079    29.1  16.7     0    14     29  45      55  31  
## 10 SSC20099    27.8  16.3     0    14     28  42      55  28  
## # ... with 490 more rows

#We can now get the mean of all the unique regions values

## 3.1 Calculate the mean 
mean(Looped_df_age_avg$avg_age) #Mean of the unique regions mean ages

## [1] 28.26092

## 3.2 Calculate the standard deviation 
sd(Looped_df_age_avg$avg_age) #Standard deviation of the 500 unique regions mean age values

## [1] 1.254157

## 3.3 Calculate the minimum 
min(Looped_df_age_avg$avg_age) #Minimum of the 500 unique regions mean age values

## [1] 24.84414

## 3.4 Calculate the first quartile 
quantile(Looped_df_age_avg$avg_age, probs = .25) #Q1 of the 500 unique regions mean age values

##      25% 
## 27.43381

## 3.5 Calculate the median
median(Looped_df_age_avg$avg_age) #Median of the 500 unique regions mean age values

## [1] 28.13439

## 3.6 Calculate the third quartile 
quantile(Looped_df_age_avg$avg_age, probs = .75) #Q3 of the 500 unique regions mean age values

##      75% 
## 28.99397

## 3.7 Calculate the maximum 
max(Looped_df_age_avg$avg_age) #Maximum of the 500 unique regions mean age values

## [1] 32.33333

## 3.8 Calculate the interquartile range 
IQR(Looped_df_age_avg$avg_age) #IQR of the 500 unique regions mean age values

## [1] 1.560161

## 3.9 histogram plot of the distribution of means from each region 
ggplot(data=Looped_df_age_avg, aes(avg_age)) +
  geom_histogram(binwidth = .5, colour ="black", fill="#FF6666", alpha=.2) + #Make bin size 0.5, change colour, fill & alpha
  scale_x_continuous(breaks=seq(24, 33, .5)) +  # X axis Ticks from 24-34, every .5
  scale_y_continuous(breaks=seq(0, 100, 5)) + # Y axis Ticks from 0-30, every 2
  geom_vline(aes(xintercept=median(avg_age, na.rm=T)), color="red", linetype="dashed", size=1) + #Add median line
  geom_vline(aes(xintercept=mean(avg_age, na.rm=T)), color="blue", linetype="dashed", size=1) + #Add mean line
  ggtitle("Histogram plot of the distribution of mean age from each unique region") + #Add title
  labs(x = "Average age of the unique region", y= "Count of unique regions") + #Label the axis'
  theme(plot.title = element_text(hjust= 0.5), 
        panel.border = element_rect(linetype = "dashed", fill=NA)) #Centre title, added dash around border

#We can see the centre follows a normal distribution but it's not a 100% perfect normal distribution on the edges 
#Make it a square/rect
ggplot(Looped_df_age_avg, aes(sample=avg_age)) +
  stat_qq() + #Add QQ stat
  stat_qq_line(color = "red") + #Make QQPlot line red
  ggtitle("Quantile-Quantile Plot of the distribution of mean age from each unique region")+ #Add title
  theme_economist()+ #Add economist theme
  theme(plot.title = element_text(hjust=.5, size=12), ##Centre title
        panel.border = element_rect(linetype = "dashed", fill=NA)) #Add a panel border

Question 4:

Get descriptive statistics for the region with the smallest population

#Aggregate the number of people in each unique region, assign it to variable Region_Pop
Region_Pop <- aggregate(x = Assignment1Dataset$population,
                        by = list(Assignment1Dataset$region),
                        FUN = sum)
#4.1 Show which region has the least people and how many it has.
#Sort the regions to find out the one with the least people in it
sort(Region_Pop$x, decreasing = F)

##   [1]     3     3     3     3     3     3     3     3     3     3     3     3
##  [13]     3     3     3     3     3     3     3     3     3     3     3     3
##  [25]     3     3     3     3     3     3     3     3     3     3     3     4
##  [37]     4     5     5     5     6     6     6     6     6     6     6     6
##  [49]     6     6     6     6     6     6     6     6     7     7     7     7
##  [61]     7     8     8     9     9     9     9     9     9     9     9     9
##  [73]    10    10    10    10    12    12    12    12    12    12    12    12
##  [85]    12    12    12    12    12    12    12    13    13    14    14    15
##  [97]    15    15    15    15    15    15    15    15    16    16    16    16
## [109]    16    17    17    17    18    18    18    18    18    18    18    18
## [121]    19    19    19    20    21    21    21    21    21    21    21    22
## [133]    22    22    22    22    23    23    23    23    24    24    24    24
## [145]    24    24    25    25    26    27    27    28    28    28    28    29
## [157]    29    30    30    30    32    32    32    33    33    33    34    34
## [169]    34    35    35    36    36    36    36    36    37    37    38    38
## [181]    38    39    40    40    40    41    41    41    42    42    42    44
## [193]    45    45    46    47    48    50    50    51    51    52    52    52
## [205]    54    54    54    55    56    57    57    57    58    58    59    60
## [217]    60    60    61    61    61    61    62    63    65    65    66    67
## [229]    67    69    69    69    69    70    71    71    72    72    73    73
## [241]    74    74    75    76    76    77    78    79    79    81    81    84
## [253]    86    86    87    88    88    88    89    89    93    98    99   100
## [265]   102   103   104   108   109   112   117   120   120   123   127   128
## [277]   129   130   130   130   132   132   135   138   144   144   147   148
## [289]   148   152   154   158   159   159   161   162   163   166   171   176
## [301]   178   183   183   190   190   191   191   194   198   199   201   202
## [313]   204   205   208   210   211   211   211   213   216   217   219   229
## [325]   239   239   245   249   260   265   267   268   270   277   283   284
## [337]   286   289   290   292   299   304   307   310   314   323   337   351
## [349]   353   359   373   376   380   425   427   440   468   475   489   494
## [361]   526   528   538   547   568   574   616   663   698   714   750   762
## [373]   779   790   794   808   831   833   847   862   924   942   988  1011
## [385]  1055  1079  1082  1082  1094  1137  1148  1159  1166  1188  1207  1213
## [397]  1226  1229  1282  1288  1289  1301  1330  1367  1412  1434  1536  1554
## [409]  1752  1790  1799  1826  1879  1908  1933  1953  2049  2257  2289  2306
## [421]  2342  2386  2391  2545  2548  2563  2575  2615  2643  2661  2823  2960
## [433]  3104  3168  3269  3363  3391  3439  3504  3555  3735  3772  3838  3981
## [445]  4060  4074  4153  4177  4306  4310  4978  4978  5016  5033  5056  5402
## [457]  5821  6048  6281  6461  6550  6666  6803  7282  7287  7776  7866  8174
## [469]  9046  9116  9729  9802  9818  9837 10596 10770 10893 10937 11218 11357
## [481] 12350 12544 13253 13730 14546 15608 15972 16669 16705 17140 17442 17809
## [493] 17928 18540 19180 19274 19340 20939 22979 37948

#We can see there are 35 regions with 3 people in them
# Populations is a data.frame with the regions and the summed population as columns
Populations = aggregate(x = Assignment1Dataset$population,
                        by = list(Assignment1Dataset$region),
                        FUN = sum)
# Now get the min population region using which
# index will contain the row numbers of the minimum populations
index = which(Populations$x == min(Populations$x))
min.Subpopulations = Populations[index,]
print(min.Subpopulations)

##      Group.1 x
## 10  SSC20099 3
## 14  SSC20127 3
## 17  SSC20151 3
## 51  SSC20346 3
## 62  SSC20383 3
## 66  SSC20398 3
## 71  SSC20422 3
## 82  SSC20502 3
## 83  SSC20503 3
## 87  SSC20516 3
## 160 SSC21012 3
## 164 SSC21026 3
## 167 SSC21037 3
## 213 SSC21283 3
## 345 SSC22012 3
## 348 SSC22021 3
## 349 SSC22028 3
## 359 SSC22070 3
## 364 SSC22084 3
## 377 SSC22157 3
## 384 SSC22193 3
## 391 SSC22237 3
## 398 SSC22281 3
## 416 SSC22367 3
## 446 SSC22597 3
## 448 SSC22605 3
## 449 SSC22606 3
## 451 SSC22616 3
## 459 SSC22698 3
## 461 SSC22702 3
## 467 SSC22719 3
## 477 SSC22772 3
## 482 SSC22809 3
## 484 SSC22840 3
## 493 SSC22873 3

#4.2 Plot the distribution of ages for the region with the least people. 
#I will graph ssc20099 because it is the first region which comes up
Region.SSC20099.2 <- Assignment1Dataset %>% filter(region =="SSC20099",population>=1)
Region.SSC20099.2

##     region age gender population
## 1 SSC20099  46      M          3

#Plot
ggplot(data=Region.SSC20099.2, aes(x=age, y=population))+ #Tell ggplot what to map
  geom_col(color="black",fill="red",alpha=0.4) + #Column chart, fill it red, have a dashed line & make alpha 0.4
  xlab("Age in years") + ylab("Number of people") + ggtitle("Distribution of people by age in the SSC20099 region") +
  coord_cartesian(ylim=c(1, 3)) + #Set max and min points for y axis data
  expand_limits(x=c(0,56)) + #Expand the axis point view limits 
  scale_x_continuous(breaks=seq(0, 56, 2)) +  # Ticks from 0-56, every 2
  scale_y_continuous(breaks=seq(1, 3, 1)) + # Ticks from 1-3, every 1
  theme(plot.title = element_text(hjust=.5), #Make title centred, add a dashed panel border
        panel.border = element_rect(colour = "black", fill=NA, size=2), 
        panel.background = element_rect(fill='skyblue'))

Question 5:

Get descriptive statistics for the region with the highest population

#We can slightly amend the code from earlier to make it in decreasing order, to find out the largest region
sort(Region_Pop$x, decreasing = T)

##   [1] 37948 22979 20939 19340 19274 19180 18540 17928 17809 17442 17140 16705
##  [13] 16669 15972 15608 14546 13730 13253 12544 12350 11357 11218 10937 10893
##  [25] 10770 10596  9837  9818  9802  9729  9116  9046  8174  7866  7776  7287
##  [37]  7282  6803  6666  6550  6461  6281  6048  5821  5402  5056  5033  5016
##  [49]  4978  4978  4310  4306  4177  4153  4074  4060  3981  3838  3772  3735
##  [61]  3555  3504  3439  3391  3363  3269  3168  3104  2960  2823  2661  2643
##  [73]  2615  2575  2563  2548  2545  2391  2386  2342  2306  2289  2257  2049
##  [85]  1953  1933  1908  1879  1826  1799  1790  1752  1554  1536  1434  1412
##  [97]  1367  1330  1301  1289  1288  1282  1229  1226  1213  1207  1188  1166
## [109]  1159  1148  1137  1094  1082  1082  1079  1055  1011   988   942   924
## [121]   862   847   833   831   808   794   790   779   762   750   714   698
## [133]   663   616   574   568   547   538   528   526   494   489   475   468
## [145]   440   427   425   380   376   373   359   353   351   337   323   314
## [157]   310   307   304   299   292   290   289   286   284   283   277   270
## [169]   268   267   265   260   249   245   239   239   229   219   217   216
## [181]   213   211   211   211   210   208   205   204   202   201   199   198
## [193]   194   191   191   190   190   183   183   178   176   171   166   163
## [205]   162   161   159   159   158   154   152   148   148   147   144   144
## [217]   138   135   132   132   130   130   130   129   128   127   123   120
## [229]   120   117   112   109   108   104   103   102   100    99    98    93
## [241]    89    89    88    88    88    87    86    86    84    81    81    79
## [253]    79    78    77    76    76    75    74    74    73    73    72    72
## [265]    71    71    70    69    69    69    69    67    67    66    65    65
## [277]    63    62    61    61    61    61    60    60    60    59    58    58
## [289]    57    57    57    56    55    54    54    54    52    52    52    51
## [301]    51    50    50    48    47    46    45    45    44    42    42    42
## [313]    41    41    41    40    40    40    39    38    38    38    37    37
## [325]    36    36    36    36    36    35    35    34    34    34    33    33
## [337]    33    32    32    32    30    30    30    29    29    28    28    28
## [349]    28    27    27    26    25    25    24    24    24    24    24    24
## [361]    23    23    23    23    22    22    22    22    22    21    21    21
## [373]    21    21    21    21    20    19    19    19    18    18    18    18
## [385]    18    18    18    18    17    17    17    16    16    16    16    16
## [397]    15    15    15    15    15    15    15    15    15    14    14    13
## [409]    13    12    12    12    12    12    12    12    12    12    12    12
## [421]    12    12    12    12    10    10    10    10     9     9     9     9
## [433]     9     9     9     9     9     8     8     7     7     7     7     7
## [445]     6     6     6     6     6     6     6     6     6     6     6     6
## [457]     6     6     6     6     5     5     5     4     4     3     3     3
## [469]     3     3     3     3     3     3     3     3     3     3     3     3
## [481]     3     3     3     3     3     3     3     3     3     3     3     3
## [493]     3     3     3     3     3     3     3     3

index2 = which(Populations$x == max(Populations$x))
max.Subpopulations = Populations[index2,]
print(max.Subpopulations)

##      Group.1     x
## 346 SSC22015 37948

#5.1 Plot cumulative distribution for the regions with the most people. 
LargestRegion2 <- Assignment1Dataset %>% filter (region=="SSC22015")
LargestRegion2

##       region age gender population
## 1   SSC22015   0      M        455
## 2   SSC22015   0      F        423
## 3   SSC22015   1      M        492
## 4   SSC22015   1      F        479
## 5   SSC22015   2      M        465
## 6   SSC22015   2      F        453
## 7   SSC22015   3      M        478
## 8   SSC22015   3      F        497
## 9   SSC22015   4      M        527
## 10  SSC22015   4      F        438
## 11  SSC22015   5      M        434
## 12  SSC22015   5      F        413
## 13  SSC22015   6      M        415
## 14  SSC22015   6      F        404
## 15  SSC22015   7      M        396
## 16  SSC22015   7      F        396
## 17  SSC22015   8      M        409
## 18  SSC22015   8      F        358
## 19  SSC22015   9      M        354
## 20  SSC22015   9      F        341
## 21  SSC22015  10      M        371
## 22  SSC22015  10      F        352
## 23  SSC22015  11      M        329
## 24  SSC22015  11      F        329
## 25  SSC22015  12      M        303
## 26  SSC22015  12      F        274
## 27  SSC22015  13      M        305
## 28  SSC22015  13      F        292
## 29  SSC22015  14      M        295
## 30  SSC22015  14      F        255
## 31  SSC22015  15      M        249
## 32  SSC22015  15      F        290
## 33  SSC22015  16      M        284
## 34  SSC22015  16      F        261
## 35  SSC22015  17      M        286
## 36  SSC22015  17      F        265
## 37  SSC22015  18      M        258
## 38  SSC22015  18      F        286
## 39  SSC22015  19      M        282
## 40  SSC22015  19      F        259
## 41  SSC22015  20      M        254
## 42  SSC22015  20      F        243
## 43  SSC22015  21      M        263
## 44  SSC22015  21      F        325
## 45  SSC22015  22      M        306
## 46  SSC22015  22      F        340
## 47  SSC22015  23      M        311
## 48  SSC22015  23      F        338
## 49  SSC22015  24      M        316
## 50  SSC22015  24      F        363
## 51  SSC22015  25      M        302
## 52  SSC22015  25      F        411
## 53  SSC22015  26      M        372
## 54  SSC22015  26      F        424
## 55  SSC22015  27      M        320
## 56  SSC22015  27      F        390
## 57  SSC22015  28      M        356
## 58  SSC22015  28      F        447
## 59  SSC22015  29      M        380
## 60  SSC22015  29      F        411
## 61  SSC22015  30      M        455
## 62  SSC22015  30      F        482
## 63  SSC22015  31      M        428
## 64  SSC22015  31      F        478
## 65  SSC22015  32      M        414
## 66  SSC22015  32      F        482
## 67  SSC22015  33      M        410
## 68  SSC22015  33      F        406
## 69  SSC22015  34      M        396
## 70  SSC22015  34      F        459
## 71  SSC22015  35      M        413
## 72  SSC22015  35      F        391
## 73  SSC22015  36      M        339
## 74  SSC22015  36      F        382
## 75  SSC22015  37      M        348
## 76  SSC22015  37      F        386
## 77  SSC22015  38      M        364
## 78  SSC22015  38      F        313
## 79  SSC22015  39      M        325
## 80  SSC22015  39      F        335
## 81  SSC22015  40      M        294
## 82  SSC22015  40      F        294
## 83  SSC22015  41      M        305
## 84  SSC22015  41      F        312
## 85  SSC22015  42      M        297
## 86  SSC22015  42      F        291
## 87  SSC22015  43      M        294
## 88  SSC22015  43      F        312
## 89  SSC22015  44      M        347
## 90  SSC22015  44      F        354
## 91  SSC22015  45      M        293
## 92  SSC22015  45      F        319
## 93  SSC22015  46      M        275
## 94  SSC22015  46      F        311
## 95  SSC22015  47      M        264
## 96  SSC22015  47      F        294
## 97  SSC22015  48      M        271
## 98  SSC22015  48      F        265
## 99  SSC22015  49      M        248
## 100 SSC22015  49      F        271
## 101 SSC22015  50      M        218
## 102 SSC22015  50      F        227
## 103 SSC22015  51      M        211
## 104 SSC22015  51      F        261
## 105 SSC22015  52      M        262
## 106 SSC22015  52      F        218
## 107 SSC22015  53      M        238
## 108 SSC22015  53      F        274
## 109 SSC22015  54      M        194
## 110 SSC22015  54      F        217
## 111 SSC22015  55      M        175
## 112 SSC22015  55      F        212

#Grouping by region and age and then doing a sum and a cum sum
LargestRegion5.1 <- aggregate(x = LargestRegion2$population,
                              by = list(LargestRegion2$age),
                              FUN = sum)
LargestRegion5.1

##    Group.1   x
## 1        0 878
## 2        1 971
## 3        2 918
## 4        3 975
## 5        4 965
## 6        5 847
## 7        6 819
## 8        7 792
## 9        8 767
## 10       9 695
## 11      10 723
## 12      11 658
## 13      12 577
## 14      13 597
## 15      14 550
## 16      15 539
## 17      16 545
## 18      17 551
## 19      18 544
## 20      19 541
## 21      20 497
## 22      21 588
## 23      22 646
## 24      23 649
## 25      24 679
## 26      25 713
## 27      26 796
## 28      27 710
## 29      28 803
## 30      29 791
## 31      30 937
## 32      31 906
## 33      32 896
## 34      33 816
## 35      34 855
## 36      35 804
## 37      36 721
## 38      37 734
## 39      38 677
## 40      39 660
## 41      40 588
## 42      41 617
## 43      42 588
## 44      43 606
## 45      44 701
## 46      45 612
## 47      46 586
## 48      47 558
## 49      48 536
## 50      49 519
## 51      50 445
## 52      51 472
## 53      52 480
## 54      53 512
## 55      54 411
## 56      55 387

names(LargestRegion5.1)[1] <- "age" #Rename column to age
names(LargestRegion5.1)[2] <- "population" #Rename column to population
LargestRegion5.1

##    age population
## 1    0        878
## 2    1        971
## 3    2        918
## 4    3        975
## 5    4        965
## 6    5        847
## 7    6        819
## 8    7        792
## 9    8        767
## 10   9        695
## 11  10        723
## 12  11        658
## 13  12        577
## 14  13        597
## 15  14        550
## 16  15        539
## 17  16        545
## 18  17        551
## 19  18        544
## 20  19        541
## 21  20        497
## 22  21        588
## 23  22        646
## 24  23        649
## 25  24        679
## 26  25        713
## 27  26        796
## 28  27        710
## 29  28        803
## 30  29        791
## 31  30        937
## 32  31        906
## 33  32        896
## 34  33        816
## 35  34        855
## 36  35        804
## 37  36        721
## 38  37        734
## 39  38        677
## 40  39        660
## 41  40        588
## 42  41        617
## 43  42        588
## 44  43        606
## 45  44        701
## 46  45        612
## 47  46        586
## 48  47        558
## 49  48        536
## 50  49        519
## 51  50        445
## 52  51        472
## 53  52        480
## 54  53        512
## 55  54        411
## 56  55        387

# 5.1 Plot cumulative distribution for the regions with the most people. 
#This graph is symmetric it has no skew, which we can see because the mean and median are equal
ggplot(data=LargestRegion5.1, aes(x=age,y=cumsum(population))) + #Plot the cum sum for the largest region
  geom_point(aes(x=age,y=cumsum(population))) + #Use geom_point as our plotting method
  geom_line() + #Add in a geom_line between points
  ggtitle("Cumulative Distribution of people by age in the SSC22015 region") + #Add a title
  geom_vline(aes(xintercept=median(age, na.rm=T)), color="red", linetype="dashed", size=1) + #Add median line
  geom_vline(aes(xintercept=mean(age, na.rm=T)), color="blue", linetype="dashed", size=.5) + #Add mean line
  scale_x_continuous(breaks=seq(0, 56, 2)) +# x axis Tick marks from 0-56, every 2
  scale_y_continuous(breaks=seq(0, 40000, 5000)) + #Set y axis tick marks from 0 to 40000 every 5000
  theme_fivethirtyeight() + #Use fivethirtyeight theme
  theme(plot.title = element_text(hjust=.5, size=14)) + #Centre title
  theme(axis.title = element_text()) + ylab("Cumulative Sum of the number people") + xlab("Age in years") #Label axis'

# 5.2 Plot the cumulative distribution for males and females on the same plot. 
#We can see there are more cumulative females than cumulative males at every interval
ggplot(data=LargestRegion2, aes(x=age,y=cumsum(population),color=gender)) + #Plot the cum sum for the largest region
  geom_point(aes(x=age,y=cumsum(population), alpha= 0.4)) + #Use geom_point as our plotting method
  geom_line() + #Add in a geom_line between points
  xlab("Age in Years") + ylab("Cumulative Sum of the number of people") + #Add in axis labels
  ggtitle("Cumulative Distribution of People by Age and Gender in the SSC22015 Region") + #Add title
  scale_x_continuous(breaks=seq(0, 56, 2)) +# x axis Tick marks from 0-56, every 2
  scale_y_continuous(breaks=seq(0, 40000, 5000)) + #Set y axis tick marks from 0 to 40000 every 5000
  theme(plot.title = element_text(hjust=.5), #Centre title  
        panel.border = element_rect(linetype = "dashed", fill = NA), #add dashed border, change panel background,
        plot.background = element_rect("skyblue"), 
        panel.background = element_rect(fill='#c5d7c0'))#change plot background

Question 6:

Plot age ratio as a function of the population of the region (remove any with zero young or old)

#Question 6 & 7
#Making a new data frame for these two questions
unique_region_totals <- Assignment1Dataset %>%
  group_by(region) %>%
  summarize(sum_population = sum(population))
unique_region_totals

## # A tibble: 500 x 2
##    region   sum_population
##    <chr>             <int>
##  1 SSC20005             33
##  2 SSC20012            425
##  3 SSC20018            100
##  4 SSC20027           1137
##  5 SSC20029             51
##  6 SSC20048            924
##  7 SSC20062            359
##  8 SSC20076           5821
##  9 SSC20079           4978
## 10 SSC20099              3
## # ... with 490 more rows

#Vector of unique regions
VecUniqueRegions <- unique_region_totals$region
VecUniqueRegions

##   [1] "SSC20005" "SSC20012" "SSC20018" "SSC20027" "SSC20029" "SSC20048"
##   [7] "SSC20062" "SSC20076" "SSC20079" "SSC20099" "SSC20101" "SSC20106"
##  [13] "SSC20107" "SSC20127" "SSC20135" "SSC20140" "SSC20151" "SSC20161"
##  [19] "SSC20163" "SSC20167" "SSC20170" "SSC20173" "SSC20177" "SSC20179"
##  [25] "SSC20190" "SSC20191" "SSC20196" "SSC20200" "SSC20201" "SSC20204"
##  [31] "SSC20205" "SSC20211" "SSC20241" "SSC20249" "SSC20257" "SSC20266"
##  [37] "SSC20269" "SSC20275" "SSC20276" "SSC20282" "SSC20293" "SSC20305"
##  [43] "SSC20306" "SSC20312" "SSC20313" "SSC20325" "SSC20330" "SSC20337"
##  [49] "SSC20343" "SSC20345" "SSC20346" "SSC20352" "SSC20353" "SSC20355"
##  [55] "SSC20356" "SSC20360" "SSC20361" "SSC20366" "SSC20367" "SSC20373"
##  [61] "SSC20378" "SSC20383" "SSC20391" "SSC20392" "SSC20393" "SSC20398"
##  [67] "SSC20401" "SSC20407" "SSC20415" "SSC20417" "SSC20422" "SSC20423"
##  [73] "SSC20433" "SSC20446" "SSC20452" "SSC20453" "SSC20473" "SSC20477"
##  [79] "SSC20491" "SSC20492" "SSC20500" "SSC20502" "SSC20503" "SSC20506"
##  [85] "SSC20514" "SSC20515" "SSC20516" "SSC20518" "SSC20519" "SSC20522"
##  [91] "SSC20523" "SSC20525" "SSC20534" "SSC20536" "SSC20551" "SSC20556"
##  [97] "SSC20558" "SSC20564" "SSC20567" "SSC20571" "SSC20577" "SSC20578"
## [103] "SSC20579" "SSC20583" "SSC20605" "SSC20615" "SSC20617" "SSC20620"
## [109] "SSC20628" "SSC20656" "SSC20660" "SSC20668" "SSC20691" "SSC20701"
## [115] "SSC20706" "SSC20718" "SSC20725" "SSC20739" "SSC20742" "SSC20747"
## [121] "SSC20770" "SSC20773" "SSC20774" "SSC20787" "SSC20796" "SSC20798"
## [127] "SSC20799" "SSC20810" "SSC20814" "SSC20817" "SSC20822" "SSC20827"
## [133] "SSC20830" "SSC20836" "SSC20837" "SSC20852" "SSC20858" "SSC20864"
## [139] "SSC20865" "SSC20870" "SSC20879" "SSC20883" "SSC20890" "SSC20894"
## [145] "SSC20911" "SSC20912" "SSC20934" "SSC20943" "SSC20948" "SSC20953"
## [151] "SSC20957" "SSC20975" "SSC20977" "SSC20986" "SSC20989" "SSC20995"
## [157] "SSC20999" "SSC21004" "SSC21011" "SSC21012" "SSC21013" "SSC21014"
## [163] "SSC21020" "SSC21026" "SSC21032" "SSC21034" "SSC21037" "SSC21040"
## [169] "SSC21049" "SSC21055" "SSC21061" "SSC21062" "SSC21077" "SSC21088"
## [175] "SSC21092" "SSC21101" "SSC21102" "SSC21113" "SSC21120" "SSC21121"
## [181] "SSC21125" "SSC21126" "SSC21136" "SSC21137" "SSC21141" "SSC21143"
## [187] "SSC21144" "SSC21152" "SSC21169" "SSC21170" "SSC21175" "SSC21178"
## [193] "SSC21184" "SSC21189" "SSC21190" "SSC21191" "SSC21193" "SSC21200"
## [199] "SSC21207" "SSC21210" "SSC21211" "SSC21214" "SSC21218" "SSC21233"
## [205] "SSC21238" "SSC21244" "SSC21251" "SSC21256" "SSC21260" "SSC21264"
## [211] "SSC21267" "SSC21281" "SSC21283" "SSC21286" "SSC21294" "SSC21304"
## [217] "SSC21305" "SSC21308" "SSC21316" "SSC21320" "SSC21324" "SSC21329"
## [223] "SSC21332" "SSC21342" "SSC21347" "SSC21348" "SSC21350" "SSC21351"
## [229] "SSC21367" "SSC21369" "SSC21374" "SSC21380" "SSC21387" "SSC21405"
## [235] "SSC21408" "SSC21412" "SSC21416" "SSC21417" "SSC21421" "SSC21426"
## [241] "SSC21440" "SSC21441" "SSC21457" "SSC21459" "SSC21461" "SSC21462"
## [247] "SSC21475" "SSC21479" "SSC21494" "SSC21505" "SSC21511" "SSC21514"
## [253] "SSC21517" "SSC21534" "SSC21536" "SSC21544" "SSC21547" "SSC21561"
## [259] "SSC21562" "SSC21567" "SSC21575" "SSC21585" "SSC21590" "SSC21594"
## [265] "SSC21597" "SSC21601" "SSC21602" "SSC21620" "SSC21630" "SSC21638"
## [271] "SSC21639" "SSC21654" "SSC21660" "SSC21666" "SSC21671" "SSC21674"
## [277] "SSC21678" "SSC21681" "SSC21683" "SSC21690" "SSC21691" "SSC21701"
## [283] "SSC21702" "SSC21719" "SSC21730" "SSC21732" "SSC21734" "SSC21736"
## [289] "SSC21741" "SSC21743" "SSC21755" "SSC21760" "SSC21769" "SSC21771"
## [295] "SSC21778" "SSC21779" "SSC21784" "SSC21798" "SSC21801" "SSC21802"
## [301] "SSC21803" "SSC21805" "SSC21808" "SSC21812" "SSC21817" "SSC21820"
## [307] "SSC21823" "SSC21830" "SSC21839" "SSC21848" "SSC21849" "SSC21858"
## [313] "SSC21862" "SSC21863" "SSC21888" "SSC21889" "SSC21890" "SSC21892"
## [319] "SSC21899" "SSC21900" "SSC21902" "SSC21905" "SSC21907" "SSC21914"
## [325] "SSC21915" "SSC21916" "SSC21918" "SSC21919" "SSC21928" "SSC21932"
## [331] "SSC21939" "SSC21945" "SSC21946" "SSC21947" "SSC21950" "SSC21951"
## [337] "SSC21956" "SSC21960" "SSC21963" "SSC21979" "SSC21988" "SSC22003"
## [343] "SSC22007" "SSC22011" "SSC22012" "SSC22015" "SSC22019" "SSC22021"
## [349] "SSC22028" "SSC22029" "SSC22030" "SSC22039" "SSC22043" "SSC22046"
## [355] "SSC22052" "SSC22053" "SSC22059" "SSC22065" "SSC22070" "SSC22072"
## [361] "SSC22075" "SSC22076" "SSC22082" "SSC22084" "SSC22085" "SSC22086"
## [367] "SSC22089" "SSC22096" "SSC22101" "SSC22106" "SSC22110" "SSC22124"
## [373] "SSC22125" "SSC22133" "SSC22134" "SSC22139" "SSC22157" "SSC22168"
## [379] "SSC22170" "SSC22175" "SSC22180" "SSC22185" "SSC22190" "SSC22193"
## [385] "SSC22200" "SSC22209" "SSC22224" "SSC22227" "SSC22232" "SSC22236"
## [391] "SSC22237" "SSC22239" "SSC22254" "SSC22263" "SSC22265" "SSC22273"
## [397] "SSC22274" "SSC22281" "SSC22283" "SSC22284" "SSC22288" "SSC22296"
## [403] "SSC22309" "SSC22323" "SSC22327" "SSC22330" "SSC22331" "SSC22333"
## [409] "SSC22338" "SSC22340" "SSC22342" "SSC22344" "SSC22349" "SSC22356"
## [415] "SSC22366" "SSC22367" "SSC22371" "SSC22373" "SSC22386" "SSC22396"
## [421] "SSC22398" "SSC22424" "SSC22442" "SSC22453" "SSC22467" "SSC22468"
## [427] "SSC22470" "SSC22476" "SSC22482" "SSC22489" "SSC22490" "SSC22495"
## [433] "SSC22500" "SSC22505" "SSC22513" "SSC22517" "SSC22521" "SSC22523"
## [439] "SSC22525" "SSC22532" "SSC22534" "SSC22556" "SSC22561" "SSC22569"
## [445] "SSC22586" "SSC22597" "SSC22598" "SSC22605" "SSC22606" "SSC22609"
## [451] "SSC22616" "SSC22621" "SSC22650" "SSC22652" "SSC22653" "SSC22659"
## [457] "SSC22660" "SSC22696" "SSC22698" "SSC22701" "SSC22702" "SSC22703"
## [463] "SSC22706" "SSC22708" "SSC22710" "SSC22717" "SSC22719" "SSC22729"
## [469] "SSC22734" "SSC22737" "SSC22744" "SSC22747" "SSC22752" "SSC22755"
## [475] "SSC22761" "SSC22764" "SSC22772" "SSC22773" "SSC22779" "SSC22800"
## [481] "SSC22803" "SSC22809" "SSC22838" "SSC22840" "SSC22844" "SSC22846"
## [487] "SSC22849" "SSC22861" "SSC22864" "SSC22867" "SSC22869" "SSC22871"
## [493] "SSC22873" "SSC22877" "SSC22886" "SSC22910" "SSC22912" "SSC22915"
## [499] "SSC22918" "SSC22922"

#Vector of unique regions population totals
VecUniquePopTotals <- unique_region_totals$sum_population
VecUniquePopTotals

##   [1]    33   425   100  1137    51   924   359  5821  4978     3    38    17
##  [13]  2049     3  1826   270     3  2643   178    12   314  6666    24    15
##  [25]  3439  3168    18  9837    10    41     9    60    54    89     8    27
##  [37]   161    52     6   138    58  3391    57  9116  9046    29    21    60
##  [49]  3104    36     3   112    61    38    16  9802 11357    40   104   123
##  [61]     6     3   159   103   211     3    21  2615     6  3555     3    75
##  [73]    52     5   166  3735  1188   120   213 16705    39     3     3    29
##  [85]   190    23     3  3772  1282     7   194  6803  5402 15972     7   239
##  [97]  3363   198  9729  5016  1011  1790  7287   132    54    48   494  2306
## [109]     9    12 17442    14    58   427    18   148  6461   211  1082    45
## [121]   117 18540    12   171  2563   219    16   284    50  3981  1367   568
## [133]  1752    18   847    72  7866   239 17809     6 15608  1434    88     9
## [145] 19340  1055   862 12350   132    89    79    14    15     9    12   547
## [157]    69    32    65     3   210     5    54     3   130    37     3 17140
## [169]  1554     7   147   475   468     4     6    46    76   286    67  6281
## [181] 20939    61  6550     5   217 19180 11218   616    44  1288    22 13730
## [193] 14546     6     7     9   663    30     6  1908     6  2575    25   183
## [205]  2661    84    18   229   304   128    35    41     3   808    24  2386
## [217]    12  9818    21  2289    41  4978  5033    18  2391    18  1207   211
## [229]    12    52     9   249    22    18   176   127    12   102    88  1094
## [241]  2548   310    22   942    50    74    73   698    28   323   988   376
## [253]   538  1799   135  7282    70  3504    21     8   528    28    63    22
## [265]   289  1148    36    51    74   750    12   440    81  2960 22979  1213
## [277]    10    23    15    12   129     6    77    26 10770    28  4310    30
## [289]    15 16669   794   489    34  2545  1289    21    36   148   144    81
## [301]    55     6   265   245    40   526    69    61    40    59    15   201
## [313]    61   283    38     6    19    23   779   191    47   790    45    12
## [325]   307   216   292    37  1166   154    13    19     4    17    72  2823
## [337]   162    79    67    69  6048    36   120    20     3 37948   267     3
## [349]     3 13253  7776    71    15    98   208   109     9   204     3  1879
## [361]    60  1226   130     3    15   380    33    62    99 10596  1536    88
## [373]   337   130   159  1953     3   163    13     6    10    65    42     3
## [385]    19  5056    86   373    33  4177     3  1330  4060    12   833    12
## [397]    21     3    93   260   762   290  1933  1229    28    22    42 10937
## [409]   183   199     6   108     9    36    34     3    78    17   351    56
## [421]    73    34    12   152  1301     6    24  1412    16   714    57    30
## [433]    16    24   190     7   574   202    66    32  8174 17928    35 19274
## [445]   277     3   299     3     3    10     3    87   831    86    71   268
## [457]  2257    76     3    27     3     9    24  3838    12  4306     3   158
## [469]    18  1082 10893  4153 12544  2342  4074    21     3    12  1159    42
## [481]    57     3   205     3    32    69    15  1079     6     6   144    25
## [493]     3  3269    16   353   191    23    24    15

#6 
#Old to young ratio
OldToYoung <- Assignment1Dataset %>%
  group_by(region) %>%
  summarise(AgeRatio = sum(population[age>=40]) / sum(population[age<40]))

#Testing the vector works, it does
OldToYoung

## # A tibble: 500 x 2
##    region   AgeRatio
##    <chr>       <dbl>
##  1 SSC20005    1.06 
##  2 SSC20012    0.721
##  3 SSC20018    1    
##  4 SSC20027    0.484
##  5 SSC20029    0.308
##  6 SSC20048    0.520
##  7 SSC20062    0.453
##  8 SSC20076    0.495
##  9 SSC20079    0.559
## 10 SSC20099  Inf    
## # ... with 490 more rows

#Vector of Old to young age ratios
VecOldToYoung <- OldToYoung$AgeRatio
VecOldToYoung

##   [1] 1.0625000 0.7206478 1.0000000 0.4843342 0.3076923 0.5197368 0.4534413
##   [8] 0.4952479 0.5585473       Inf 0.9000000 0.0000000 0.5268256 0.0000000
##  [15] 0.4232268 0.9014085 0.0000000 0.5251010 0.8541667 1.0000000 0.6185567
##  [22] 0.3963134 3.0000000 0.2500000 0.4293433 0.4155496 1.0000000 0.3736908
##  [29] 2.3333333 0.4642857 2.0000000 0.4634146 0.7419355 1.6176471 1.0000000
##  [36] 8.0000000 0.7127660 1.1666667 1.0000000 0.7692308 0.4146341 0.3292826
##  [43] 1.8500000 0.2760358 0.4110123 0.2608696 1.3333333 0.7647059 0.4410399
##  [50] 0.0000000 0.0000000 0.6969697 0.6486486 1.0000000 0.2307692 0.2587646
##  [57] 0.3253589 1.6666667 0.9622642 0.7826087 0.0000000       Inf 0.5000000
##  [64] 0.4927536 0.9181818       Inf 0.1666667 0.4765669       Inf 0.4323127
##  [71] 0.0000000 0.9230769 1.0800000 0.0000000 0.3495935 0.3464311 0.5448635
##  [78] 0.3333333 0.6015038 0.1073914 1.6000000       Inf 0.0000000 2.2222222
##  [85] 0.3970588 0.6428571       Inf 0.4006684 0.2186312 0.7500000 0.6033058
##  [92] 0.3100327 0.4564573 0.4681496 0.0000000 0.6950355 0.3177900 0.6923077
##  [99] 0.2848653 0.4078024 0.6201923 0.2722104 0.2454281 0.6923077 2.6000000
## [106] 0.4545455 0.6745763 0.3588686 0.0000000 0.0000000 0.3126129 3.6666667
## [113] 0.8125000 0.4139073 1.0000000 0.6086957 0.4434763 0.4066667 0.5590778
## [120] 0.5000000 1.2500000 0.3109885 3.0000000 0.6285714 0.5023447 0.7244094
## [127] 4.3333333 0.6705882 1.2727273 0.3779855 0.4730603 0.7423313 0.3518519
## [134] 0.5000000 0.6011342 0.5319149 0.4211382 0.7835821 0.4027253       Inf
## [141] 0.4270824 0.3314763 0.7959184 0.0000000 0.4383460 0.5491924 0.6264151
## [148] 0.5228113 1.1639344 1.6969697 2.0384615 3.6666667 1.5000000 0.5000000
## [155] 3.0000000 0.6779141 0.5681818 1.0000000 0.5853659       Inf 0.6666667
## [162] 0.0000000 0.4210526 0.0000000 0.6666667 1.4666667       Inf 0.3017392
## [169] 0.3655536 1.3333333 0.4700000 0.4660494 0.7660377 0.0000000 1.0000000
## [176] 1.5555556 0.8095238 0.7125749 2.0454545 0.4593401 0.3548366 0.4878049
## [183] 0.3909535 0.0000000 0.5839416 0.2726428 0.3249085 0.4426230 1.2000000
## [190] 0.4342984 0.6923077 0.4225031 0.4389158       Inf 0.0000000 0.5000000
## [197] 0.4766147 1.0000000 0.0000000 0.3580071 0.0000000 0.4807361 0.5625000
## [204] 0.6636364 0.5275545 1.3333333 0.6363636 0.7480916 0.7471264 0.5421687
## [211] 0.6666667 0.6400000 0.0000000 0.6389452 1.6666667 0.3744240 0.0000000
## [218] 0.4859997 0.9090909 0.4758221 0.5769231 0.5279312 0.4290176 1.0000000
## [225] 0.5637672 2.0000000 0.5944518 0.8347826       Inf 1.1666667 0.5000000
## [232] 0.5090909 0.3750000 1.0000000 0.7087379 0.7162162 3.0000000 0.4366197
## [239] 0.7254902 0.5695839 0.5041322 0.7415730 1.2000000 0.5621891 0.2820513
## [246] 0.2982456 0.3773585 0.3194707 0.6470588 0.7459459 0.5782748 0.7488372
## [253] 0.5197740 0.4685714 0.5000000 0.3698269 0.9444444 0.3888228 6.0000000
## [260]       Inf 0.5667656 0.7500000 0.4000000 1.2000000 0.9395973 0.2573932
## [267] 0.3333333 0.8214286 0.2758621 0.5306122 1.0000000 0.6296296 1.3823529
## [274] 0.5554388 0.3822786 0.4614458 2.3333333 0.3529412 0.6666667 1.0000000
## [281] 1.1864407 0.0000000 0.7500000 1.1666667 0.4191593 1.8000000 0.4482527
## [288] 0.6666667 1.5000000 0.3773756 0.4981132 0.5377358 4.6666667 0.2931911
## [295] 0.3329886 0.4000000 0.2413793 1.1449275 0.5652174 0.9756098 0.9642857
## [302]       Inf 0.5317919 0.6554054 0.4285714 0.6036585 1.8750000 0.5641026
## [309] 3.4444444 1.3600000 1.5000000 0.4055944 0.6944444 0.3872549 0.3103448
## [316]       Inf 0.9000000 0.3529412 0.5769231 0.6608696 1.2380952 0.5076336
## [323] 0.6071429 0.3333333 0.6775956 0.5652174 0.8598726 0.4800000 0.5202086
## [330] 0.6739130 0.8571429 2.1666667       Inf 0.2142857 0.7142857 0.3743914
## [337] 1.0000000 1.1351351 0.9705882 0.4081633 0.4328358 1.2500000 0.5000000
## [344] 0.5384615       Inf 0.2938288 0.5523256       Inf 0.0000000 0.3653034
## [351] 0.4870912 0.8205128 1.1428571 1.3333333 0.5877863 1.3695652 0.5000000
## [358] 0.3161290       Inf 0.5776658 1.1428571 0.5578145 0.6883117       Inf
## [365] 1.5000000 0.4448669 2.3000000 1.5833333 1.0204082 0.2780123 0.3391456
## [372] 0.6603774 0.5747664 1.0312500 0.5742574 0.3821656       Inf 0.6138614
## [379] 3.3333333 0.0000000 1.5000000 0.9117647 0.8260870 0.0000000 0.5833333
## [386] 0.5284160 0.5636364 0.7511737 0.9411765 0.4438299       Inf 0.5833333
## [393] 0.4667630 3.0000000 0.4743363       Inf 1.3333333       Inf 0.4090909
## [400] 0.7333333 0.5000000 0.3679245 0.4414616 0.5796915 0.3333333 2.6666667
## [407] 0.9090909 0.2920260 0.7596154 0.8090909 1.0000000 1.2040816 0.0000000
## [414] 1.4000000 1.2666667       Inf 0.4181818 1.4285714 0.4686192 1.0740741
## [421] 0.5869565 0.8888889 3.0000000 0.6888889 0.5180863 0.0000000 0.1428571
## [428] 0.4831933 3.0000000 0.6264237 0.7272727 0.7647059 0.6000000 0.6000000
## [435] 1.0652174 1.3333333 0.5471698 0.9056604 0.8333333 2.2000000 0.4242899
## [442] 0.3759018 1.1875000 0.1811497 0.7643312       Inf 0.5333333       Inf
## [449] 0.0000000 0.6666667       Inf 1.2894737 0.5950096 0.4333333 0.6511628
## [456] 0.6645963 0.4311985 0.3818182 0.0000000 0.9285714       Inf 0.5000000
## [463] 0.0000000 0.4214815 0.0000000 0.3044532       Inf 0.6458333 1.0000000
## [470] 0.5239437 0.3940363 0.4198291 0.5033557 0.3736070 0.4503382 1.3333333
## [477]       Inf 0.0000000 0.5641026 2.2307692 0.5833333       Inf 0.8303571
## [484] 0.0000000 0.6000000 1.6538462 0.0000000 0.5240113       Inf 0.0000000
## [491] 0.4545455 0.5625000       Inf 0.3928419 1.6666667 0.6650943 0.5528455
## [498] 2.8333333 1.6666667 0.2500000

#Vector of Gender ratio Females per Male
#Question 7
gender_ratio <- Assignment1Dataset %>%
  group_by(region) %>%
  summarise(gender.ratio = sum(population[gender=="F"])/sum(population[gender=="M"]))

gender_ratio

## # A tibble: 500 x 2
##    region   gender.ratio
##    <chr>           <dbl>
##  1 SSC20005        1.54 
##  2 SSC20012        0.959
##  3 SSC20018        0.667
##  4 SSC20027        0.898
##  5 SSC20029        0.7  
##  6 SSC20048        0.941
##  7 SSC20062        0.899
##  8 SSC20076        0.991
##  9 SSC20079        1.01 
## 10 SSC20099        0    
## # ... with 490 more rows

#Vector for gender ratio
VecGenderRatio <- gender_ratio$gender.ratio
VecGenderRatio

##   [1] 1.5384615 0.9585253 0.6666667 0.8981636 0.7000000 0.9411765 0.8994709
##   [8] 0.9914471 1.0056406 0.0000000 0.9000000 1.1250000 0.9990244       Inf
##  [15] 1.0820981 0.8367347       Inf 1.0114155 1.5797101 3.0000000 1.2589928
##  [22] 0.9815696 0.3333333 1.5000000 0.9640206 1.0063331 1.0000000 1.0104231
##  [29] 0.6666667 1.5625000 2.0000000 1.3076923 0.8000000 0.8936170 1.0000000
##  [36] 1.2500000 1.3000000 1.0800000       Inf 1.4642857 0.7575758 1.0184524
##  [43] 0.7812500 0.9930039 1.0039876 1.4166667 1.3333333 0.9354839 1.0501982
##  [50] 1.4000000       Inf 0.8360656 0.5641026 0.5200000 0.7777778 1.0895331
##  [57] 1.0581733 1.5000000 0.5294118 0.9838710 1.0000000 0.0000000 1.0649351
##  [64] 1.1914894 0.8672566       Inf 0.7500000 1.0038314       Inf 1.0753065
##  [71] 0.0000000 0.3392857 0.8571429       Inf 1.2133333 0.9134221 1.0101523
##  [78] 0.8750000 1.1958763 1.1700442 0.8571429       Inf       Inf 0.5263158
##  [85] 1.4675325 0.4375000 0.0000000 1.0522307 1.2218371 0.0000000 0.9795918
##  [92] 0.9897631 1.0705251 1.0281905 1.3333333 0.7969925 1.0669945 0.6363636
##  [99] 0.8957521 1.0540541 1.0139442 1.0838184 1.0343384 1.2000000 0.9285714
## [106] 1.6666667 0.8364312 0.9675768 0.5000000 1.0000000 0.9974805 3.6666667
## [113] 0.9333333 0.9953271 1.0000000 0.7011494 1.0576433 0.9181818 0.9963100
## [120] 0.5000000 0.8571429 1.0488452 3.0000000 0.6442308 1.0686037 0.8099174
## [127] 4.3333333 1.0285714 1.1739130 1.0126390 0.9725830 0.8745875 1.1007194
## [134] 1.0000000 0.9882629 1.2500000 1.0683671 0.8968254 0.9834057 1.0000000
## [141] 1.0499081 0.9069149 1.0465116 2.0000000 0.9966963 0.9573284 0.9113082
## [148] 1.0289141 1.0952381 0.5892857 0.7555556 1.3333333 1.5000000 0.5000000
## [155] 0.0000000 1.0110294 0.5333333 0.7777778 0.7105263       Inf 1.0000000
## [162]       Inf 1.0769231       Inf 0.8840580 0.6818182       Inf 0.9284428
## [169] 1.0077519 1.3333333 1.2615385 1.1205357 0.9180328 0.0000000       Inf
## [176] 0.9166667 0.7272727 1.0428571 0.9142857 1.0392857 0.9534471 1.1785714
## [183] 1.0487957 0.0000000 1.3333333 1.0197978 1.0377838 0.9807074 1.4444444
## [190] 0.9138187 0.6923077 0.9775313 1.0341211 1.0000000 1.3333333 0.5000000
## [197] 0.9673591 1.0000000 1.0000000 1.0276302 1.0000000 1.0211931 0.9230769
## [204] 0.9468085 0.9992487 0.6470588 0.3846154 0.9572650 1.1111111 1.2857143
## [211] 0.5217391 1.7333333 0.0000000 1.0049628 0.6000000 0.9557377 3.0000000
## [218] 1.0143619 0.9090909 1.0096576 2.1538462 0.9793241 1.0253521 5.0000000
## [225] 1.0041911 0.5000000 0.8540707 1.2688172 0.3333333 1.4761905 0.0000000
## [232] 0.9606299 0.8333333 5.0000000 0.6603774 1.0483871 1.0000000 1.0816327
## [239] 0.4426230 1.1242718 1.0953947 0.7714286 2.6666667 1.0433839 1.5000000
## [246] 1.0000000 1.8076923 1.0650888 0.7500000 1.2123288 1.0932203 1.0215054
## [253] 0.9779412 1.0350679 1.1774194 0.9617457 1.0000000 1.0515222 1.3333333
## [260] 0.6000000 1.0307692 0.7500000 0.9090909 0.4666667 0.9794521 1.0140351
## [267] 1.4000000 1.2173913 0.6444444 1.2388060 0.3333333 0.7529880 0.8837209
## [274] 1.0584145 1.0400391 0.9820261 0.6666667 2.8333333 0.2500000 0.3333333
## [281] 0.9846154       Inf 0.7500000 1.6000000 1.0455840 0.8666667 0.9825207
## [288] 1.5000000 1.5000000 0.9936610 0.9178744 1.1077586 1.0000000 1.0182395
## [295] 0.9891975 2.5000000 0.8000000 1.0000000 1.0571429 0.6530612 1.2000000
## [302]       Inf 1.0542636 1.0940171 0.8181818 1.1646091 1.0294118 2.0500000
## [309] 1.0000000 1.0344828 0.6666667 1.0100000 2.5882353 0.9929577 2.1666667
## [316] 1.0000000 5.3333333 2.2857143 0.8861985 0.9489796 0.4242424 1.0680628
## [323] 2.4615385 1.0000000 1.2246377 0.9115044 0.9597315 0.8500000 1.0600707
## [330] 0.7906977 3.3333333 0.1875000 0.0000000 0.2142857 0.6363636 1.0294752
## [337] 0.8837209 1.3939394 0.8611111 3.6000000 0.9940653 1.0000000 1.0338983
## [344] 0.1764706       Inf 1.0352910 1.2250000 0.0000000       Inf 0.9914350
## [351] 0.9867144 0.9722222 0.8750000 0.7192982 0.8909091 0.9122807 0.5000000
## [358] 1.0198020 0.0000000 0.9883598 0.7142857 1.0331675 1.0634921 0.0000000
## [365] 0.2500000 0.7511521 0.7368421 0.8235294 0.8679245 1.0337812 1.2196532
## [372] 0.8723404 0.9479769 0.8309859 1.0921053 0.9687500 0.0000000 0.8953488
## [379] 3.3333333 1.0000000 1.5000000 1.4074074 0.9090909       Inf 2.1666667
## [386] 1.0288925 1.3888889 1.0053763 1.0625000 1.0425428       Inf 1.0182094
## [393] 1.0118930 1.0000000 0.8846154 1.0000000 2.5000000       Inf 1.5833333
## [400] 0.9696970 0.8361446 1.2307692 1.0673797 1.0725126 0.8666667 0.6923077
## [407] 0.6800000 1.0034805 1.3164557 0.8598131 0.0000000 1.7692308       Inf
## [414] 2.0000000 1.0000000       Inf 1.2285714 3.2500000 1.2356688 0.9310345
## [421] 1.2121212 0.7894737 1.0000000 0.9240506 1.1719533 1.0000000 0.3333333
## [428] 0.9263302 1.2857143 1.1250000 0.5833333 0.4285714 0.2307692 0.6000000
## [435] 0.9791667 0.0000000 1.0500000 1.0612245 1.5384615 2.5555556 1.1533193
## [442] 1.0372727 0.8421053 0.8692658 0.7870968 0.0000000 0.9415584 0.0000000
## [449] 0.0000000 0.0000000 0.0000000 0.8913043 1.0072464 1.0476190 0.8684211
## [456] 1.2521008 1.0026619 1.0540541       Inf 0.6875000 0.0000000 0.0000000
## [463] 1.6666667 0.9661885 3.0000000 1.0721848       Inf 0.6288660 0.5000000
## [470] 0.9049296 1.0337939 1.0337904 1.0144532 1.0472028 0.9662162 0.4000000
## [477]       Inf 1.0000000 0.9879931 0.5000000 2.1666667 0.0000000 1.3295455
## [484] 0.0000000 1.6666667 0.5681818 0.6666667 1.0281955 1.0000000 0.0000000
## [491] 0.7560976 0.5625000 0.0000000 0.8863243 0.7777778 1.0171429 0.9100000
## [498] 0.6428571 3.0000000 0.6666667

#Making a new df for our 6.1 and 7.1 plots
Plotsdf <- data.frame(VecUniqueRegions, VecUniquePopTotals, VecOldToYoung, VecGenderRatio)
View(Plotsdf)
head(Plotsdf)

##   VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1         SSC20005                 33     1.0625000      1.5384615
## 2         SSC20012                425     0.7206478      0.9585253
## 3         SSC20018                100     1.0000000      0.6666667
## 4         SSC20027               1137     0.4843342      0.8981636
## 5         SSC20029                 51     0.3076923      0.7000000
## 6         SSC20048                924     0.5197368      0.9411765

#Now that we've made a new data frame we can continue with Q6.1 & Q7.1
#6.1
#Checking what the min and max values are to make a filter, removing infinite and 0 values
sort(Plotsdf$VecOldToYoung)

##   [1] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##   [8] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [22] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [29] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [36] 0.0000000 0.1073914 0.1428571 0.1666667 0.1811497 0.2142857 0.2186312
##  [43] 0.2307692 0.2413793 0.2454281 0.2500000 0.2500000 0.2573932 0.2587646
##  [50] 0.2608696 0.2722104 0.2726428 0.2758621 0.2760358 0.2780123 0.2820513
##  [57] 0.2848653 0.2920260 0.2931911 0.2938288 0.2982456 0.3017392 0.3044532
##  [64] 0.3076923 0.3100327 0.3103448 0.3109885 0.3126129 0.3161290 0.3177900
##  [71] 0.3194707 0.3249085 0.3253589 0.3292826 0.3314763 0.3329886 0.3333333
##  [78] 0.3333333 0.3333333 0.3333333 0.3391456 0.3464311 0.3495935 0.3518519
##  [85] 0.3529412 0.3529412 0.3548366 0.3580071 0.3588686 0.3653034 0.3655536
##  [92] 0.3679245 0.3698269 0.3736070 0.3736908 0.3743914 0.3744240 0.3750000
##  [99] 0.3759018 0.3773585 0.3773756 0.3779855 0.3818182 0.3821656 0.3822786
## [106] 0.3872549 0.3888228 0.3909535 0.3928419 0.3940363 0.3963134 0.3970588
## [113] 0.4000000 0.4000000 0.4006684 0.4027253 0.4055944 0.4066667 0.4078024
## [120] 0.4081633 0.4090909 0.4110123 0.4139073 0.4146341 0.4155496 0.4181818
## [127] 0.4191593 0.4198291 0.4210526 0.4211382 0.4214815 0.4225031 0.4232268
## [134] 0.4242899 0.4270824 0.4285714 0.4290176 0.4293433 0.4311985 0.4323127
## [141] 0.4328358 0.4333333 0.4342984 0.4366197 0.4383460 0.4389158 0.4410399
## [148] 0.4414616 0.4426230 0.4434763 0.4438299 0.4448669 0.4482527 0.4503382
## [155] 0.4534413 0.4545455 0.4545455 0.4564573 0.4593401 0.4614458 0.4634146
## [162] 0.4642857 0.4660494 0.4667630 0.4681496 0.4685714 0.4686192 0.4700000
## [169] 0.4730603 0.4743363 0.4758221 0.4765669 0.4766147 0.4800000 0.4807361
## [176] 0.4831933 0.4843342 0.4859997 0.4870912 0.4878049 0.4927536 0.4952479
## [183] 0.4981132 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000
## [190] 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5023447 0.5033557
## [197] 0.5041322 0.5076336 0.5090909 0.5180863 0.5197368 0.5197740 0.5202086
## [204] 0.5228113 0.5239437 0.5240113 0.5251010 0.5268256 0.5275545 0.5279312
## [211] 0.5284160 0.5306122 0.5317919 0.5319149 0.5333333 0.5377358 0.5384615
## [218] 0.5421687 0.5448635 0.5471698 0.5491924 0.5523256 0.5528455 0.5554388
## [225] 0.5578145 0.5585473 0.5590778 0.5621891 0.5625000 0.5625000 0.5636364
## [232] 0.5637672 0.5641026 0.5641026 0.5652174 0.5652174 0.5667656 0.5681818
## [239] 0.5695839 0.5742574 0.5747664 0.5769231 0.5769231 0.5776658 0.5782748
## [246] 0.5796915 0.5833333 0.5833333 0.5833333 0.5839416 0.5853659 0.5869565
## [253] 0.5877863 0.5944518 0.5950096 0.6000000 0.6000000 0.6000000 0.6011342
## [260] 0.6015038 0.6033058 0.6036585 0.6071429 0.6086957 0.6138614 0.6185567
## [267] 0.6201923 0.6264151 0.6264237 0.6285714 0.6296296 0.6363636 0.6389452
## [274] 0.6400000 0.6428571 0.6458333 0.6470588 0.6486486 0.6511628 0.6554054
## [281] 0.6603774 0.6608696 0.6636364 0.6645963 0.6650943 0.6666667 0.6666667
## [288] 0.6666667 0.6666667 0.6666667 0.6666667 0.6705882 0.6739130 0.6745763
## [295] 0.6775956 0.6779141 0.6883117 0.6888889 0.6923077 0.6923077 0.6923077
## [302] 0.6944444 0.6950355 0.6969697 0.7087379 0.7125749 0.7127660 0.7142857
## [309] 0.7162162 0.7206478 0.7244094 0.7254902 0.7272727 0.7333333 0.7415730
## [316] 0.7419355 0.7423313 0.7459459 0.7471264 0.7480916 0.7488372 0.7500000
## [323] 0.7500000 0.7500000 0.7511737 0.7596154 0.7643312 0.7647059 0.7647059
## [330] 0.7660377 0.7692308 0.7826087 0.7835821 0.7959184 0.8090909 0.8095238
## [337] 0.8125000 0.8205128 0.8214286 0.8260870 0.8303571 0.8333333 0.8347826
## [344] 0.8541667 0.8571429 0.8598726 0.8888889 0.9000000 0.9000000 0.9014085
## [351] 0.9056604 0.9090909 0.9090909 0.9117647 0.9181818 0.9230769 0.9285714
## [358] 0.9395973 0.9411765 0.9444444 0.9622642 0.9642857 0.9705882 0.9756098
## [365] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [372] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [379] 1.0000000 1.0000000 1.0000000 1.0204082 1.0312500 1.0625000 1.0652174
## [386] 1.0740741 1.0800000 1.1351351 1.1428571 1.1428571 1.1449275 1.1639344
## [393] 1.1666667 1.1666667 1.1666667 1.1864407 1.1875000 1.2000000 1.2000000
## [400] 1.2000000 1.2040816 1.2380952 1.2500000 1.2500000 1.2666667 1.2727273
## [407] 1.2894737 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333
## [414] 1.3333333 1.3600000 1.3695652 1.3823529 1.4000000 1.4285714 1.4666667
## [421] 1.5000000 1.5000000 1.5000000 1.5000000 1.5000000 1.5555556 1.5833333
## [428] 1.6000000 1.6176471 1.6538462 1.6666667 1.6666667 1.6666667 1.6666667
## [435] 1.6969697 1.8000000 1.8500000 1.8750000 2.0000000 2.0000000 2.0384615
## [442] 2.0454545 2.1666667 2.2000000 2.2222222 2.2307692 2.3000000 2.3333333
## [449] 2.3333333 2.6000000 2.6666667 2.8333333 3.0000000 3.0000000 3.0000000
## [456] 3.0000000 3.0000000 3.0000000 3.0000000 3.3333333 3.4444444 3.6666667
## [463] 3.6666667 4.3333333 4.6666667 6.0000000 8.0000000       Inf       Inf
## [470]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [477]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [484]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [491]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [498]       Inf       Inf       Inf

#The min excluding 0 is .107 and the max excluding infinite is 8.0
PlotOldToYoung <- filter(Plotsdf,VecOldToYoung>0.1 & VecOldToYoung <8.5)

head(PlotOldToYoung)

##   VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1         SSC20005                 33     1.0625000      1.5384615
## 2         SSC20012                425     0.7206478      0.9585253
## 3         SSC20018                100     1.0000000      0.6666667
## 4         SSC20027               1137     0.4843342      0.8981636
## 5         SSC20029                 51     0.3076923      0.7000000
## 6         SSC20048                924     0.5197368      0.9411765

#Calculating pearsons correlation coefficient
cor.test(x=PlotOldToYoung$VecUniquePopTotals, y=PlotOldToYoung$VecOldToYoung)

## 
##  Pearson's product-moment correlation
## 
## data:  PlotOldToYoung$VecUniquePopTotals and PlotOldToYoung$VecOldToYoung
## t = -5.2121, df = 429, p-value = 2.908e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3308645 -0.1531083
## sample estimates:
##        cor 
## -0.2440351

#We can see there is a low negative linear relationship


#6.1
ggplot(data=PlotOldToYoung,aes(x=VecUniquePopTotals,y=VecOldToYoung))+ #Create basic plot of gender ratio by region
  geom_point(size=0.5, position="jitter") + #Change point size and jitter points to avoid them hitting each other
  geom_hline(aes(yintercept=mean(PlotOldToYoung$VecOldToYoung, na.rm=T)), color="blue", linetype="dashed", size=1) +
  #Add a blue mean line for the y axis
  geom_hline(aes(yintercept=median(PlotOldToYoung$VecOldToYoung, na.rm=T)), color="red", linetype="dashed", size=1) +
  ##Add a red median line for the y axis
  geom_smooth(method="lm")+ #Add linear regression line
  coord_cartesian(ylim = c(0.1,9.4)) + #Setting the view frame, this still includes the lowest & highest values
  scale_y_continuous(trans = 'log2') + #Setting the y axis scale to log2 but maintain raw values
  scale_x_continuous(trans = 'log2') + #Setting the x axis scale to log2 but maintain raw values
  labs(y = "People 40 and over per Person Under 40", x = "Region Total Population") + #Label axis'
  geom_text(label=PlotOldToYoung$VecUniqueRegions,size=2, alpha= 0.5) + #Label each dot with its region name in text
  ggtitle("Age Ratio (People 40 and over per person Under 40) for each region as a function of its Population")+ #Add title
  theme(panel.background = element_rect(fill="#F2CF59",color="pink"),
        plot.background = element_rect("cornflowerblue"),#Change plot background colour
        plot.title = element_text(hjust= 0.5, size=11), #Centre title, change title size
        panel.border = element_rect(linetype = "dashed", fill = NA)) #Changed panel colour, add dashed border

# 6.1 Plot the ratio of old to young people using 40 years old as a cut off.

Question 7:

Plot gender ratio as a function of the population of the region (remove any with zero M or F)

#7
sort(Plotsdf$VecGenderRatio, decreasing = F)

##   [1] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##   [8] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [15] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [22] 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
##  [29] 0.1764706 0.1875000 0.2142857 0.2307692 0.2500000 0.2500000 0.3333333
##  [36] 0.3333333 0.3333333 0.3333333 0.3333333 0.3392857 0.3846154 0.4000000
##  [43] 0.4242424 0.4285714 0.4375000 0.4426230 0.4666667 0.5000000 0.5000000
##  [50] 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5000000 0.5200000
##  [57] 0.5217391 0.5263158 0.5294118 0.5333333 0.5625000 0.5641026 0.5681818
##  [64] 0.5833333 0.5892857 0.6000000 0.6000000 0.6000000 0.6288660 0.6363636
##  [71] 0.6363636 0.6428571 0.6442308 0.6444444 0.6470588 0.6530612 0.6603774
##  [78] 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6666667 0.6800000
##  [85] 0.6818182 0.6875000 0.6923077 0.6923077 0.7000000 0.7011494 0.7105263
##  [92] 0.7142857 0.7192982 0.7272727 0.7368421 0.7500000 0.7500000 0.7500000
##  [99] 0.7500000 0.7511521 0.7529880 0.7555556 0.7560976 0.7575758 0.7714286
## [106] 0.7777778 0.7777778 0.7777778 0.7812500 0.7870968 0.7894737 0.7906977
## [113] 0.7969925 0.8000000 0.8000000 0.8099174 0.8181818 0.8235294 0.8309859
## [120] 0.8333333 0.8360656 0.8361446 0.8364312 0.8367347 0.8421053 0.8500000
## [127] 0.8540707 0.8571429 0.8571429 0.8571429 0.8598131 0.8611111 0.8666667
## [134] 0.8666667 0.8672566 0.8679245 0.8684211 0.8692658 0.8723404 0.8745875
## [141] 0.8750000 0.8750000 0.8837209 0.8837209 0.8840580 0.8846154 0.8861985
## [148] 0.8863243 0.8909091 0.8913043 0.8936170 0.8953488 0.8957521 0.8968254
## [155] 0.8981636 0.8994709 0.9000000 0.9049296 0.9069149 0.9090909 0.9090909
## [162] 0.9090909 0.9100000 0.9113082 0.9115044 0.9122807 0.9134221 0.9138187
## [169] 0.9142857 0.9166667 0.9178744 0.9180328 0.9181818 0.9230769 0.9240506
## [176] 0.9263302 0.9284428 0.9285714 0.9310345 0.9333333 0.9354839 0.9411765
## [183] 0.9415584 0.9468085 0.9479769 0.9489796 0.9534471 0.9557377 0.9572650
## [190] 0.9573284 0.9585253 0.9597315 0.9606299 0.9617457 0.9640206 0.9661885
## [197] 0.9662162 0.9673591 0.9675768 0.9687500 0.9696970 0.9722222 0.9725830
## [204] 0.9775313 0.9779412 0.9791667 0.9793241 0.9794521 0.9795918 0.9807074
## [211] 0.9815696 0.9820261 0.9825207 0.9834057 0.9838710 0.9846154 0.9867144
## [218] 0.9879931 0.9882629 0.9883598 0.9891975 0.9897631 0.9914350 0.9914471
## [225] 0.9929577 0.9930039 0.9936610 0.9940653 0.9953271 0.9963100 0.9966963
## [232] 0.9974805 0.9990244 0.9992487 1.0000000 1.0000000 1.0000000 1.0000000
## [239] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [246] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [253] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [260] 1.0000000 1.0000000 1.0000000 1.0000000 1.0026619 1.0034805 1.0038314
## [267] 1.0039876 1.0041911 1.0049628 1.0053763 1.0056406 1.0063331 1.0072464
## [274] 1.0077519 1.0096576 1.0100000 1.0101523 1.0104231 1.0110294 1.0114155
## [281] 1.0118930 1.0126390 1.0139442 1.0140351 1.0143619 1.0144532 1.0171429
## [288] 1.0182094 1.0182395 1.0184524 1.0197978 1.0198020 1.0211931 1.0215054
## [295] 1.0253521 1.0276302 1.0281905 1.0281955 1.0285714 1.0288925 1.0289141
## [302] 1.0294118 1.0294752 1.0307692 1.0331675 1.0337812 1.0337904 1.0337939
## [309] 1.0338983 1.0341211 1.0343384 1.0344828 1.0350679 1.0352910 1.0372727
## [316] 1.0377838 1.0392857 1.0400391 1.0425428 1.0428571 1.0433839 1.0455840
## [323] 1.0465116 1.0472028 1.0476190 1.0483871 1.0487957 1.0488452 1.0499081
## [330] 1.0500000 1.0501982 1.0515222 1.0522307 1.0540541 1.0540541 1.0542636
## [337] 1.0571429 1.0576433 1.0581733 1.0584145 1.0600707 1.0612245 1.0625000
## [344] 1.0634921 1.0649351 1.0650888 1.0669945 1.0673797 1.0680628 1.0683671
## [351] 1.0686037 1.0705251 1.0721848 1.0725126 1.0753065 1.0769231 1.0800000
## [358] 1.0816327 1.0820981 1.0838184 1.0895331 1.0921053 1.0932203 1.0940171
## [365] 1.0952381 1.0953947 1.1007194 1.1077586 1.1111111 1.1205357 1.1242718
## [372] 1.1250000 1.1250000 1.1533193 1.1646091 1.1700442 1.1719533 1.1739130
## [379] 1.1774194 1.1785714 1.1914894 1.1958763 1.2000000 1.2000000 1.2121212
## [386] 1.2123288 1.2133333 1.2173913 1.2196532 1.2218371 1.2246377 1.2250000
## [393] 1.2285714 1.2307692 1.2356688 1.2388060 1.2500000 1.2500000 1.2521008
## [400] 1.2589928 1.2615385 1.2688172 1.2857143 1.2857143 1.3000000 1.3076923
## [407] 1.3164557 1.3295455 1.3333333 1.3333333 1.3333333 1.3333333 1.3333333
## [414] 1.3333333 1.3333333 1.3888889 1.3939394 1.4000000 1.4000000 1.4074074
## [421] 1.4166667 1.4444444 1.4642857 1.4675325 1.4761905 1.5000000 1.5000000
## [428] 1.5000000 1.5000000 1.5000000 1.5000000 1.5000000 1.5384615 1.5384615
## [435] 1.5625000 1.5797101 1.5833333 1.6000000 1.6666667 1.6666667 1.6666667
## [442] 1.7333333 1.7692308 1.8076923 2.0000000 2.0000000 2.0000000 2.0500000
## [449] 2.1538462 2.1666667 2.1666667 2.1666667 2.2857143 2.4615385 2.5000000
## [456] 2.5000000 2.5555556 2.5882353 2.6666667 2.8333333 3.0000000 3.0000000
## [463] 3.0000000 3.0000000 3.0000000 3.2500000 3.3333333 3.3333333 3.6000000
## [470] 3.6666667 4.3333333 5.0000000 5.0000000 5.3333333       Inf       Inf
## [477]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [484]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [491]       Inf       Inf       Inf       Inf       Inf       Inf       Inf
## [498]       Inf       Inf       Inf

#Lowest is 0.176 to highest which is 5.33

head(Plotsdf)

##   VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1         SSC20005                 33     1.0625000      1.5384615
## 2         SSC20012                425     0.7206478      0.9585253
## 3         SSC20018                100     1.0000000      0.6666667
## 4         SSC20027               1137     0.4843342      0.8981636
## 5         SSC20029                 51     0.3076923      0.7000000
## 6         SSC20048                924     0.5197368      0.9411765

#Filtering out the regions with 0 males or 0 females
PlotGenRatio <- filter(Plotsdf,VecGenderRatio>0.1 & VecGenderRatio <5.5)
head(PlotGenRatio)

##   VecUniqueRegions VecUniquePopTotals VecOldToYoung VecGenderRatio
## 1         SSC20005                 33     1.0625000      1.5384615
## 2         SSC20012                425     0.7206478      0.9585253
## 3         SSC20018                100     1.0000000      0.6666667
## 4         SSC20027               1137     0.4843342      0.8981636
## 5         SSC20029                 51     0.3076923      0.7000000
## 6         SSC20048                924     0.5197368      0.9411765

#Amended this by getting the pearsons correlation
cor.test(x=PlotGenRatio$VecUniquePopTotals, y=PlotGenRatio$VecGenderRatio)

## 
##  Pearson's product-moment correlation
## 
## data:  PlotGenRatio$VecUniquePopTotals and PlotGenRatio$VecGenderRatio
## t = -1.2368, df = 444, p-value = 0.2168
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.15062780  0.03444502
## sample estimates:
##         cor 
## -0.05859485

#We can see there is no linear relationship between the two variables

# 7.1 Plot the gender ratio as a function of the population of the region. 
#There is no relationship between the x and y variables in this graph
ggplot(data=PlotGenRatio,aes(x=VecUniquePopTotals,y=VecGenderRatio))+ #Create basic plot of gender ratio by region
  geom_point(size=0.5, position="jitter") + #Change point size and jitter points to avoid them hitting each other
  geom_hline(aes(yintercept=mean(PlotGenRatio$VecGenderRatio, na.rm=T)), color="blue", linetype="dashed", size=1) +
  geom_hline(aes(yintercept=median(PlotGenRatio$VecGenderRatio, na.rm=T)), color="red", linetype="dashed", size=1) +
  coord_cartesian(ylim = c(0.2,5.5)) + #Setting the view frame, this still includes the lowest & highest values
  scale_y_continuous(trans = 'log2') + #Setting the y axis scale to log2 but maintain raw values
  scale_x_continuous(trans = 'log2') + #Setting the x axis scale to log2 but maintain raw values
  labs(y = "Females per Male Gender Ratio", x = "Region Total Population") + #Label axis'
  geom_text(label=PlotGenRatio$VecUniqueRegions,size=2, alpha= 0.5) + #Label each dot with its region name in text
  ggtitle("Gender Ratio (Females per Male) for each region as a function of its Population ")+ #Add title
  theme(panel.background = element_rect(fill="#f4edca",color="pink"),plot.background = element_rect("#46dbdf"),
        plot.title = element_text(hjust= 0.5), #Centre title
        panel.border = element_rect(linetype = "dashed", fill = NA)) + #Change panel and plot background colours, Centre title
  geom_smooth(method='loess', formula= y~x, na.rm = T, colour="black", size=0.9) #Use loess for non-linear model

Question 8:

Slice by a single age group, e.g. 20-30 year olds. Find the top 2 regions and the percentage for male and female. (Requires an inner join from dplyr). Find the top 2 regions and the percentage for male and female. Either ones with the largest population of the selected demographic, or the largest percentage is also fine as long as their justification is logical.

#8
#Make a vector of the age column from our looped data frame
VecEight.One <- df$age
head(VecEight.One,10)

##  [1] 0 0 0 0 0 0 0 0 0 0

#Check the mean and median values to get an idea of where to begin our search
mean(VecEight.One)

## [1] 27.76707

median(VecEight.One)

## [1] 28

# Input Parameters
# Modify here to iterate with different ages, I already did it and found 21 + or - 2 is the best
ideal.age = 21
span = 2
###########################

min.age = ideal.age - span
max.age = ideal.age + span
# 1) Filter by target age
# Target.Age is a data frame with the individuals with the range of age of interest.
Target.Age = Assignment1Dataset %>% filter(age >= min.age & age <= max.age )

# 2) Filter by gender
Male.Targets <- Target.Age %>% filter(gender == 'M')
Female.Targets <- Target.Age %>% filter(gender == 'F')

# 3) Get the amount of people per region by summing the population column
# Male.Region.Target is a data frame with the region and the number of people in the region
Male.Region.Target =   aggregate(x = Male.Targets$population,
                                 by = list(Male.Targets$region),
                                 FUN = sum)
# rename the columns for readability
colnames(Male.Region.Target) = c("region","total.males")

# apply the same to females
Female.Region.Target =  aggregate(x = Female.Targets$population,
                                  by = list(Female.Targets$region),
                                  FUN = sum)
colnames(Female.Region.Target) = c("region","total.females")

# 4) Merge by Region Males and Females using the region ID as an anchor
Target.Cohort = inner_join(Male.Region.Target, Female.Region.Target, by="region")

# 5) Create a new column that sums the total males and females to bu used to sort and select the most
# There are abundant regions with the cohort of interest.
# Adding a column to the data frame
Target.Cohort$Total = Target.Cohort$total.males + Target.Cohort$total.females

# 6) Order the data frame based on Total and select the top 2 regions
# 8.2) The print shows the top 2 regions with the most numbers of males and females for the range of age selected
# These regions will maximize the audience of the product
print(Target.Cohort[order(Target.Cohort$Total, decreasing = T),][1:2,] ) # order by decreasing order & select the top two rows [1:2,]

##       region total.males total.females Total
## 80  SSC20492        2499          3434  5933
## 346 SSC22015        1416          1505  2921

Question 9:

You will need to make a sample of ages from the cumulative distribution calculated for the most populous region. Select N samples of 1000 age values each from the cumulative distribution for the region with the most people. Then output a plot after summing 1, 10, 100 and 1000 random samples.

# Question 9 Demonstrate the following
# 9.1) The central limit theorem by selecting N samples of 100 age values each from the cumulative distribution
# for the region with the most people (Q4d).

#################
# Function ageFrom, this function determines the age of random number
# in the mapping space from 1 to cumsum(population). i.e for values of i_ran 1 to 878, the output age
# is going to be 0
# Inputs:
# i_rand - integer value assumed to be from 1 to max of cumsum(population)
# df     - data.frame continuing the Largest region
# Output:
# age corresponding to the cumsum
#############################################
ageFrom <- function(i_rand, df){
  id <- max(1, which.max(df$cumsum > i_rand)-1)
  if(i_rand==max(df$cumsum)) id <- length(df$cumsum)
  return(df$age[id])
}

#################
# Function sample.ages, loops from 1 to N to contain sampling every time Total random ages
# this function calls ageFrom. Results are returned as a matrix of size TotalxN
# Inputs:
# Largest.Region - data.frame continuing the Largest region it needs to have a column cumsum
# N    - the number samples contingin Total measurements to collect
# Total - the number of measurements to take, 100 as default
# Output:
# mtx, is a matrix containing the N samples
#############################################
sample.ages = function(Largest.Region, N, Total){
  
  # Generate a matrix for the sampling results
  # the rows contain the 100 ages
  # and the cols are the number of samples from 1 to N
  mtx = matrix(0, nrow = Total, ncol = N)
  for(i in 1:N){
    # this takes 100 numbers from 1 to the total of the cumulative distribution
    # this simulates taking a random sample of 100 people
    # i_rand is a vector of size 100
    i_rand = round( runif(Total, 1, max(Largest.Region$cumsum)) )
    # select for a every i_rand number check which age category corresponds
    # using the ageFrom function, this will return 100 ages
    # store in a column as [,i]
    mtx[,i] = sapply(i_rand, ageFrom, df = Largest.Region)
  }
  
  return(mtx)
  
}


# Most crowded region
Largest.Region =  Assignment1Dataset %>% filter (region=="SSC22015")
Largest.Region$cumsum = cumsum(Largest.Region$population)

# N is the number of samples, each sample contains 100 age values from the cumulative distribution
N =  1000
Total = 100

# Take N samples by calling sample.ages
# N.Samples is a matrix with Total number of rows and N columns
N.Samples = sample.ages(Largest.Region, N, Total)

# Population mean mu, this is the value of the mean of the population
mu = weighted.mean(Largest.Region$age, Largest.Region$population)

# the apply function will take the mean of the first 10 columns of N.Samples, apply only works on matrices
x.bar = mean(apply(N.Samples[,1:10], 2, mean)) # mean of sample means, this by CLT should be similar to mu


# 9.1) CLT if we take N samples of size 100, we would expect that the means of the N samples are Normally
# distributed around the population mean mu
# plot 9.1 using ggplot
# use stack on matrices to make the ggplot data type happy
merged.data = stack(as.data.frame(N.Samples[,1:10]))
ggplot(merged.data) +
  geom_boxplot(aes(x = ind, y = values, fill=ind))+
  labs(title="CLT Using 10 Samples of 100 Observations: Population (red) vs Sample (blue) means",
       x ="Sample (N)", y = "Age Distribution (years)") +
  geom_hline(yintercept = mu, col="red", size=1)+ # Plot the population mean mu
  geom_hline(yintercept = x.bar, col="blue",linetype="dotted", size=1) + # Plot the means of the 10 samples, CLT stats they shold be quite similar
  theme(plot.title = element_text(hjust=.5, size=12), panel.border = element_rect(linetype = "dashed", fill = NA),
        plot.background = element_rect("skyblue"), panel.background = element_rect(fill='#c5d7c0'))

# 9.2) Add another sample of 100 ages drawn from the same sample.
# The effect of adding leads to a smoother histogram.
x1 = N.Samples[,1] ## First Sample
x2 = N.Samples[,2] ## Second Sample
x = x1 + x2


# gglplot
#Need to make titles smaller
# syntax for vectors, ylab is based on the probability not in the counts, that will make histograms comparable
p1 = ggplot() + aes(x1) +  geom_histogram(aes(y = ..density..), binwidth=density(x1)$bw) +
  geom_density(fill="red", alpha = 0.1) + # This plots the density curve based on your hist data
  labs(title="First Sample N=1",
       x ="N=1", y = "Density") +
  theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))

p2 = ggplot() + aes(x2) +  geom_histogram(aes(y = ..density..), binwidth=density(x2)$bw) +
  geom_density(fill="blue", alpha = 0.1) + # This plots the density curve based on your hist data
  labs(title="Second Sample N=2",
       x ="N=2", y = "Density") +
  theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))

p3 = ggplot() + aes(x) +  geom_histogram(aes(y = ..density..), binwidth=density(x)$bw) +
  geom_density(fill="green", alpha = 0.1) + # This plots the density curve based on your hist data
  labs(title="Sum of Samples N=1 + N=2",
       x ="N = N1 + N2", y = "Density") +
  theme(plot.title = element_text(size=11, face= "bold", hjust=0.5))
plot_grid(p1, p2, p3, nrow = 1) # This arranges your plots in a canvas, one row to plot all

# 9.3) Repeat this process for N = 1 (the original sample), 10, 100 and 1000 samples.
# Already generated in a previous step
print(dim(N.Samples))

## [1]  100 1000

# 9.4) 9.4 Plot the resulting distribution along with a normal distribution;
# using the mean and standard deviation after each addition.
par(mfrow = c(2, 2)) # tell the layout, 2 row s and 2 cols



## For N=10
## instead of doing a for loop we can use apply
## sum all the rows of from 1 to 100 cols
# this returns a vector of size 100 with the total sum.
X.10 = apply(N.Samples[,1:10],1,sum)
# For N=100
X.100 = apply(N.Samples[,1:100],1,sum)
# For N=1000, all the matrix
X.1000 = apply(N.Samples,1,sum)

# ggplot
df.Orig = data.frame(sample = N.Samples[,1])
Orig = ggplot(df.Orig, aes(x = sample)) +
  geom_histogram(aes(y =..density..),
                 colour = "black",
                 fill = "white") +
  stat_function(fun = dnorm, args = list(mean = mean(df.Orig$sample), sd = sd(df.Orig$sample)), size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
  labs(title="N=1 vs Normal(mean(N=1), sd(N=1))",
       x ="sample", y = "Density") +
  theme(plot.title = element_text(size=10, face= "bold", hjust=0.5))

df.Sample.10 = data.frame(sample = X.10)
Sample.10 = ggplot(df.Sample.10, aes(x = sample)) +
  geom_histogram(aes(y =..density..),
                 colour = "black",
                 fill = "white") +
  stat_function(fun = dnorm, args = list(mean = mean(df.Sample.10$sample), sd = sd(df.Sample.10$sample)), col="blue", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
  labs(title="N10 = { N=1 + ...+ N=10} vs Normal(mean(N), sd(N)",
       x ="sample", y = "Density") +
  theme(plot.title = element_text(size=10, face="bold", hjust=0.5))

df.Sample.100 = data.frame(sample = X.100)
Sample.100 = ggplot(df.Sample.100, aes(x = sample)) +
  geom_histogram(aes(y =..density..),
                 colour = "black",
                 fill = "white") +
  stat_function(fun = dnorm, args = list(mean = mean(df.Sample.100$sample), sd = sd(df.Sample.100$sample)), col="red", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
  labs(title="N100 = { N=1 + ...+ N=100} vs Normal(mean(N), sd(N)",
       x ="sample", y = "Density") +
  theme(plot.title = element_text(size=10, face= "bold", hjust=0.5))

df.Sample.1000 = data.frame(sample = X.1000)
Sample.1000 = ggplot(df.Sample.1000, aes(x = sample)) +
  geom_histogram(aes(y =..density..),
                 colour = "black",
                 fill = "white") +
  stat_function(fun = dnorm, args = list(mean = mean(df.Sample.1000$sample), sd = sd(df.Sample.1000$sample)), col="green", size=1) +# this will overlay a normal dist based on your data, N[,1] in this case
  labs(title="N100 = { N=1 + ...+ N=100} vs Normal(mean(N), sd(N)",
       x ="sample", y = "Density") +
  theme(plot.title = element_text(size=10, face="bold", hjust=.5))

plot_grid(Orig, Sample.10 ,Sample.100, Sample.1000, nrow = 2, ncol=2)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Note that the distributions are normal and by increasing N the variation reduces

Data Analytics 1

Chris Berry

20/09/2020