library(tidyverse)
library(lubridate)
library(readxl)
theme_set(theme_bw())
library(imputeTS) # time series imputation
library(naniar)   # for missing data visualisation
library(knitr) # for rmarkdown rendering
source('time_impute.R', echo = T) # time series wrapper imputation function
##
## > time_impute = function(df, ..., cols_to_impute = c("Value",
## + "lowerCI", "upperCI"), year_column = "year") {
## + dots = as.character(ensyms(. .... [TRUNCATED]
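The echoed source of time_impute() is truncated above, so purely as a hypothetical sketch (the real time_impute.R may well differ): the wrapper presumably groups by the id columns passed in `...`, orders each group by the year column and interpolates the requested value columns with imputeTS::na_interpolation(), leaving groups with fewer than two observed values untouched. The real function also reports missingness before/after and the elapsed time, as its output later in this document shows.

# Hypothetical sketch only -- not the actual contents of time_impute.R
# time_impute_sketch = function(df, ..., cols_to_impute = c("Value", "lowerCI", "upperCI"),
#                               year_column = "year") {
#   df %>%
#     group_by(...) %>%                                     # id columns, e.g. areaCode, cohort
#     arrange(.data[[year_column]], .by_group = TRUE) %>%   # imputation assumes ordered series
#     mutate(across(all_of(cols_to_impute),
#                   ~ if (sum(!is.na(.x)) >= 2) imputeTS::na_interpolation(.x) else .x)) %>%
#     ungroup()
# }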
source('updateCodes.R', echo = T)
##
## > updateCodes = function(df) {
## + if (any(df$areaName == "Shepway")) {
## + df[df$areaName == "Shepway", "areaName"] = "Folkestone and Hythe"
## .... [TRUNCATED]
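updateCodes() is also truncated; judging from its effect later in this document (old Suffolk, Somerset and Dorset districts ending up under their post-2019 successors), it presumably continues with further rename/recode rules in the same pattern. A hypothetical fragment, not the actual file contents:

# if (any(df$areaName == "Suffolk Coastal")) {
#   df[df$areaName == "Suffolk Coastal", "areaName"] = "East Suffolk"
#   df[df$areaName == "East Suffolk", "areaCode"] = "E07000244"
# }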
# deaths = read.csv('OriginalData/deathrecords2020.csv') # caveat: only 2020
# deaths = read.csv('OriginalData/DEATHS02010_2019.csv') # caveat: only by wider regions (around 13)
# -- not using for now as only 2020 deaths
lifeexpect = read.csv('OriginalData/lifeexpectancies.csv')
# population = read_xlsx('OriginalData/popestimates.xlsx',
# sheet = 'Dataset',
# range = cell_rows(c(3, 121433)))
population = read.csv('OriginalData/POPULATION2018_2019.csv')
suicides = read.csv('OriginalData/suicide2018.csv')
wellbeing = read.csv('OriginalData/wellbeing.csv')
work = read.csv('OriginalData/working.csv')
work_ts = read.csv('OriginalData/working_time_series.csv')
gdp = read_xlsx('OriginalData/regionalgrossdomesticproductgdplocalauthorities.xlsx',
sheet = 'Table 5',
range = cell_rows(c(2, 384)))
unemployment = read_xls('OriginalData/unemployment.xls',
range = 'A3:HA375', sheet = 3)
Here I take data cleaning on a case-by-case basis: tidying and imputing one data set at a time, independently of the others. A couple of comments apply to all the ONS datasets: first, they all appear to have redundant (duplicate) columns; second, for some of them the data corresponds to year intervals while for others it is available for every year.
# deaths table: some columns are duplicate so we remove those, we also change the week strings to week numbers
# str(deaths)
#
# deaths = deaths %>%
# select(-c(Data.Marking,calendar.years,week, cause.of.death, place.of.death, registration.or.occurrence)) %>%
# rename(Deaths = V4_1, year = time, areaCode = admin.geography,
# areaName = geography, deathCause = causeofdeath,
# deathPlace = placeofdeath, week = week.number) %>%
# mutate(week = as.integer(str_replace(week, 'week-', '')))
str(lifeexpect)
## 'data.frame': 15768 obs. of 12 variables:
## $ V4_3 : num 78.1 56.1 60.9 65.3 NA ...
## $ lower.confidence.limit : chr "77.44868" "54.45824" "59.15845" "63.54933" ...
## $ upper.confidence.limit : chr "78.65818" "57.69662" "62.65689" "67.0574" ...
## $ Data_Marking : chr "" "" "" "" ...
## $ two.year.intervals : chr "2013-15" "2013-15" "2013-15" "2013-15" ...
## $ time : chr "2013-15" "2013-15" "2013-15" "2013-15" ...
## $ admin.geography : chr "E06000003" "E06000009" "E08000022" "E06000007" ...
## $ geography : chr "Redcar and Cleveland" "Blackpool" "North Tyneside" "Warrington" ...
## $ life.expectancy.variable: chr "life-expectancy" "disability-free-life-expectancy" "healthy-life-expectancy" "disability-free-life-expectancy" ...
## $ lifeexpectancyvariable : chr "Life expectancy" "Disability-free life expectancy" "Healthy life expectancy" "Disability-free life expectancy" ...
## $ birth.cohort : chr "birth-males" "birth-males" "birth-males" "birth-males" ...
## $ birthcohort : chr "Males at birth" "Males at birth" "Males at birth" "Males at birth" ...
As there are many duplicated columns, we delete those that are either duplicated or empty, give better names to the ones we keep, and convert the numeric variables that are stored as strings.
lifeexpect = lifeexpect %>%
select(-c(Data_Marking,
two.year.intervals,
life.expectancy.variable,
birth.cohort)) %>%
rename(Value = V4_3,
lowerCI = lower.confidence.limit,
upperCI = upper.confidence.limit,
year = time,
areaCode = admin.geography,
areaName = geography,
LEvariable = lifeexpectancyvariable,
cohort = birthcohort) %>%
mutate(lowerCI = as.numeric(lowerCI),
upperCI = as.numeric(upperCI))
The life expectancy information is available in the year intervals r{unique(lifeexpect$year)}. To be consistent with the datasets that are available for every year, we turn the intervals into single years by taking the upper value of each interval as the year column, e.g. 2013-15 becomes 2015, 2014-16 becomes 2016, and so forth.
lifeexpect$year = as.numeric(lapply(strsplit(lifeexpect$year, '-'), '[[',1))+2
Now we check missing data in the Value column by cohort, type of life expectancy variable and year
kable(table(is.na(lifeexpect$Value), lifeexpect$cohort)) # all cohorts with same amount of data
| | Females at age 65 | Females at birth | Males at age 65 | Males at birth |
|---|---|---|---|---|
| FALSE | 2505 | 2505 | 2505 | 2505 |
| TRUE | 1437 | 1437 | 1437 | 1437 |
kable(table(is.na(lifeexpect$Value), lifeexpect$LEvariable)) # lifeexpectancy with the least missing data
| | Disability-free life expectancy | Healthy life expectancy | Life expectancy |
|---|---|---|---|
| FALSE | 2796 | 2796 | 4428 |
| TRUE | 2460 | 2460 | 828 |
kable(table(is.na(lifeexpect$Value), lifeexpect$year)) # 2017 with most missing data
| | 2015 | 2016 | 2017 |
|---|---|---|---|
| FALSE | 3576 | 3624 | 2820 |
| TRUE | 1680 | 1632 | 2436 |
lifeexpect = lifeexpect %>%
mutate(LEvariable = ifelse(LEvariable == 'Disability-free life expectancy',
'DFLE',
ifelse(LEvariable == 'Healthy life expectancy',
'HLE',
'LE')),
cohort = ifelse(cohort == 'Females at age 65',
'FemAt65',
ifelse(cohort == 'Males at age 65',
'MaleAt65',
ifelse(cohort == 'Females at birth',
'FemAtBirth',
'MaleAtBirth'))))
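Instead of the nested ifelse() calls above, the same recoding could be written with dplyr::case_when(), which reads more easily (a sketch, not run here since the recoding has already been applied):

# lifeexpect %>%
#   mutate(LEvariable = case_when(LEvariable == 'Disability-free life expectancy' ~ 'DFLE',
#                                 LEvariable == 'Healthy life expectancy' ~ 'HLE',
#                                 TRUE ~ 'LE'),
#          cohort = case_when(cohort == 'Females at age 65' ~ 'FemAt65',
#                             cohort == 'Males at age 65' ~ 'MaleAt65',
#                             cohort == 'Females at birth' ~ 'FemAtBirth',
#                             TRUE ~ 'MaleAtBirth'))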
Here we wish to know how many entries, defined by unique combinations of the local authority code (areaCode), the life expectancy (LE) variable (LEvariable, e.g. healthy LE, disability-free LE…) and cohort (cohort, e.g. male at birth, female at 65…), have no available values (i.e. all values missing) across all available years.
count = 0
combs = 0
for (reg in unique(lifeexpect$areaCode))
for (var in unique(lifeexpect$LEvariable))
for(coh in unique(lifeexpect$cohort))
{
combs = combs +1
if (all(is.na(lifeexpect %>% filter(areaCode == reg, LEvariable == var, cohort == coh) %>% select(Value)))){
count = count+1
}
}
cat(paste(count, 'combinations out of', combs, 'combinations have all missing entries'))
## 1632 combinations out of 5256 combinations have all missing entries
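The same tally can be obtained without the triple loop, since every areaCode/LEvariable/cohort combination appears in the data; a sketch that should reproduce the same 1632 out of 5256:

# lifeexpect %>%
#   group_by(areaCode, LEvariable, cohort) %>%
#   summarise(allMissing = all(is.na(Value))) %>%
#   ungroup() %>%
#   summarise(count = sum(allMissing), combs = n())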
lifeexpect %>%
group_by(areaCode,
LEvariable,
cohort) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProprortionOfMissingEntries = paste0(round(mean(is.na(Value)),3)*100,' %')) %>%
group_by(NumberOfMissingEntries,
ProprortionOfMissingEntries) %>%
summarise(Count = n()) %>%
kable()
| NumberOfMissingEntries | ProprortionOfMissingEntries | Count |
|---|---|---|
| 0 | 0 % | 2772 |
| 1 | 33.3 % | 852 |
| 3 | 100 % | 1632 |
First we order the data by year, as the time series imputation assumes the data is ordered.
lifeexpect = lifeexpect %>% arrange(areaCode, LEvariable, cohort, year)
lifeexpect = time_impute(lifeexpect, areaCode, LEvariable, cohort,
year_column = 'year')
## [1] "Missingness before imputation (%):"
## Value lowerCI upperCI
## 0.3645358 0.3645358 0.3645358
## Time difference of 1.286631 mins
## [1] "Missingness after imputation (%):"
## Value lowerCI upperCI
## 0.3105023 0.3645358 0.3645358
lifeexpect %>%
group_by(areaCode,
LEvariable,
cohort) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
group_by(NumberOfMissingEntries,
ProportionOfMissingEntries) %>%
summarise(Count = n()) %>%
kable()
| NumberOfMissingEntries | ProportionOfMissingEntries | Count |
|---|---|---|
| 0 | 0 | 3624 |
| 3 | 1 | 1632 |
We finally filter for the latest year (in this case 2017) and pivot the data to get a tidy format. We also keep a copy of the tidy time-series data (just in case)
lifeexpect_time = lifeexpect %>%
pivot_wider(names_from = c(LEvariable, cohort),
values_from = c(Value, lowerCI, upperCI))
lifeexpect = lifeexpect_time %>%
filter(year == 2017)
str(lifeexpect)
## tibble [438 × 39] (S3: tbl_df/tbl/data.frame)
## $ year : num [1:438] 2017 2017 2017 2017 2017 ...
## $ areaCode : chr [1:438] "E06000001" "E06000002" "E06000003" "E06000004" ...
## $ areaName : chr [1:438] "Hartlepool" "Middlesbrough" "Redcar and Cleveland" "Stockton-on-Tees" ...
## $ Value_DFLE_FemAt65 : num [1:438] 7.42 7.45 9.28 9.63 10.04 ...
## $ Value_DFLE_FemAtBirth : num [1:438] 56.2 58 61.2 61.5 60.9 ...
## $ Value_DFLE_MaleAt65 : num [1:438] 7.56 8.13 9.33 7.58 9.44 ...
## $ Value_DFLE_MaleAtBirth : num [1:438] 55.1 57.6 59.3 57.3 60.9 ...
## $ Value_HLE_FemAt65 : num [1:438] 8.22 8.31 11 10.26 10.47 ...
## $ Value_HLE_FemAtBirth : num [1:438] 57 57.6 61.4 60.9 64.3 ...
## $ Value_HLE_MaleAt65 : num [1:438] 7.67 8.82 9.89 8.15 9.21 ...
## $ Value_HLE_MaleAtBirth : num [1:438] 57.4 58.1 58.8 56.6 60.7 ...
## $ Value_LE_FemAt65 : num [1:438] 19.9 18.9 20.3 19.9 20.3 ...
## $ Value_LE_FemAtBirth : num [1:438] 81.4 79.9 81.5 81.4 82.3 ...
## $ Value_LE_MaleAt65 : num [1:438] 17.1 16.5 18 18.1 18.2 ...
## $ Value_LE_MaleAtBirth : num [1:438] 76.1 75.7 77.7 78.1 78.4 ...
## $ lowerCI_DFLE_FemAt65 : num [1:438] 6.22 6.14 7.9 8.08 8.61 ...
## $ lowerCI_DFLE_FemAtBirth : num [1:438] 54.3 56.1 59.4 59.5 59 ...
## $ lowerCI_DFLE_MaleAt65 : num [1:438] 6.39 6.71 8.14 5.94 8.02 ...
## $ lowerCI_DFLE_MaleAtBirth: num [1:438] 53.1 55.7 57.3 55.1 59 ...
## $ lowerCI_HLE_FemAt65 : num [1:438] 7.1 7.14 9.71 8.77 9.04 ...
## $ lowerCI_HLE_FemAtBirth : num [1:438] 55.3 55.8 59.6 58.8 62.4 ...
## $ lowerCI_HLE_MaleAt65 : num [1:438] 6.64 7.46 8.81 6.49 7.9 ...
## $ lowerCI_HLE_MaleAtBirth : num [1:438] 55.8 56.2 56.8 54.4 58.9 ...
## $ lowerCI_LE_FemAt65 : num [1:438] 19.4 18.5 19.9 19.6 19.9 ...
## $ lowerCI_LE_FemAtBirth : num [1:438] 80.7 79.3 81 80.9 81.7 ...
## $ lowerCI_LE_MaleAt65 : num [1:438] 16.7 16.1 17.6 17.8 17.8 ...
## $ lowerCI_LE_MaleAtBirth : num [1:438] 75.3 75.1 77.1 77.6 77.7 ...
## $ upperCI_DFLE_FemAt65 : num [1:438] 8.62 8.76 10.66 11.18 11.47 ...
## $ upperCI_DFLE_FemAtBirth : num [1:438] 58 59.8 63.1 63.5 62.9 ...
## $ upperCI_DFLE_MaleAt65 : num [1:438] 8.73 9.56 10.52 9.22 10.85 ...
## $ upperCI_DFLE_MaleAtBirth: num [1:438] 57 59.5 61.2 59.5 62.8 ...
## $ upperCI_HLE_FemAt65 : num [1:438] 9.35 9.48 12.28 11.75 11.9 ...
## $ upperCI_HLE_FemAtBirth : num [1:438] 58.8 59.4 63.3 62.9 66.1 ...
## $ upperCI_HLE_MaleAt65 : num [1:438] 8.7 10.18 10.98 9.81 10.52 ...
## $ upperCI_HLE_MaleAtBirth : num [1:438] 59 59.9 60.7 58.9 62.5 ...
## $ upperCI_LE_FemAt65 : num [1:438] 20.3 19.3 20.6 20.3 20.7 ...
## $ upperCI_LE_FemAtBirth : num [1:438] 82 80.4 82.1 81.8 82.9 ...
## $ upperCI_LE_MaleAt65 : num [1:438] 17.6 16.9 18.3 18.4 18.7 ...
## $ upperCI_LE_MaleAtBirth : num [1:438] 76.8 76.3 78.3 78.6 79.1 ...
vis_miss(lifeexpect) + coord_flip()
We notice that the DFLE and HLE variables have a lot of missingness, hence we only keep the LE variables
lifeexpect_time = lifeexpect_time %>% select(year, areaCode, areaName, contains('_LE_'))
lifeexpect = lifeexpect %>% select(year, areaCode, areaName, contains('_LE_'))
Pass through custom function to update old codes, then combine the entries with matching area name and area code
lifeexpect = updateCodes(lifeexpect)
combs_dup = lifeexpect %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
lifeexpect[lifeexpect$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(2,3,6,7)])
## # A tibble: 6 x 4
## areaCode areaName Value_LE_MaleAt65 Value_LE_MaleAtBirth
## <chr> <chr> <dbl> <dbl>
## 1 E07000246 Somerset West and Taunton 18.9 79.4
## 2 E07000246 Somerset West and Taunton 20.0 81.0
## 3 E07000245 West Suffolk 19.5 80.6
## 4 E07000245 West Suffolk 20.3 81.6
## 5 E07000244 East Suffolk 20.2 81.8
## 6 E07000244 East Suffolk 19.1 79.0
This loop combines duplicated entries by taking their mean
for (row in which(combs_dup$duplicat==T)){
entry = combs_dup[row,]
lifeexpect[(lifeexpect$areaCode == entry$areaCode &
lifeexpect$areaName == entry$areaName &
lifeexpect$year == entry$year),
!colnames(lifeexpect) %in% c('areaCode','areaName', 'year')] =
t(apply(lifeexpect[(lifeexpect$areaCode == entry$areaCode &
lifeexpect$areaName == entry$areaName &
lifeexpect$year == entry$year),
!colnames(lifeexpect) %in% c('areaCode','areaName', 'year')],2,mean))
}
print(
lifeexpect[lifeexpect$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(2,3,6,7)])
## # A tibble: 6 x 4
## areaCode areaName Value_LE_MaleAt65 Value_LE_MaleAtBirth
## <chr> <chr> <dbl> <dbl>
## 1 E07000246 Somerset West and Taunton 19.5 80.2
## 2 E07000246 Somerset West and Taunton 19.5 80.2
## 3 E07000245 West Suffolk 19.9 81.1
## 4 E07000245 West Suffolk 19.9 81.1
## 5 E07000244 East Suffolk 19.7 80.4
## 6 E07000244 East Suffolk 19.7 80.4
Finally, select unique rows as we have generated duplicated entries in the previous step
lifeexpect = lifeexpect %>% distinct()
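The mean-combination loop plus distinct() (a pattern repeated for the other tables below) could also be written as a single grouped summarise, e.g. (a sketch, not run here):

# lifeexpect %>%
#   group_by(areaCode, areaName, year) %>%
#   summarise(across(everything(), mean)) %>%   # averages every remaining (numeric) column
#   ungroup()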
lifeexpect_time = updateCodes(lifeexpect_time)
combs_dup = lifeexpect_time %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
lifeexpect_time[lifeexpect_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(3,6,7)])
## # A tibble: 18 x 3
## areaName Value_LE_MaleAt65 Value_LE_MaleAtBirth
## <chr> <dbl> <dbl>
## 1 Somerset West and Taunton 18.9 79.6
## 2 Somerset West and Taunton 18.9 79.4
## 3 Somerset West and Taunton 18.9 79.4
## 4 Somerset West and Taunton 19.4 80.5
## 5 Somerset West and Taunton 20.0 81.0
## 6 Somerset West and Taunton 20.0 81.0
## 7 West Suffolk 19.2 80.5
## 8 West Suffolk 19.5 80.6
## 9 West Suffolk 19.5 80.6
## 10 West Suffolk 20.1 81.9
## 11 West Suffolk 20.3 81.6
## 12 West Suffolk 20.3 81.6
## 13 East Suffolk 19.8 81.4
## 14 East Suffolk 20.2 81.8
## 15 East Suffolk 20.2 81.8
## 16 East Suffolk 19.1 79.4
## 17 East Suffolk 19.1 79.0
## 18 East Suffolk 19.1 79.0
This loop combines duplicated entries by taking their mean
for (row in which(combs_dup$duplicat==T)){
entry = combs_dup[row,]
lifeexpect_time[(lifeexpect_time$areaCode == entry$areaCode &
lifeexpect_time$areaName == entry$areaName &
lifeexpect_time$year == entry$year),
!colnames(lifeexpect_time) %in% c('areaCode','areaName', 'year')] =
t(apply(lifeexpect_time[(lifeexpect_time$areaCode == entry$areaCode &
lifeexpect_time$areaName == entry$areaName &
lifeexpect_time$year == entry$year),
!colnames(lifeexpect_time) %in% c('areaCode','areaName', 'year')],2,mean))
}
print(
lifeexpect_time[lifeexpect_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(3,6,7)])
## # A tibble: 18 x 3
## areaName Value_LE_MaleAt65 Value_LE_MaleAtBirth
## <chr> <dbl> <dbl>
## 1 Somerset West and Taunton 19.2 80.1
## 2 Somerset West and Taunton 19.5 80.2
## 3 Somerset West and Taunton 19.5 80.2
## 4 Somerset West and Taunton 19.2 80.1
## 5 Somerset West and Taunton 19.5 80.2
## 6 Somerset West and Taunton 19.5 80.2
## 7 West Suffolk 19.7 81.2
## 8 West Suffolk 19.9 81.1
## 9 West Suffolk 19.9 81.1
## 10 West Suffolk 19.7 81.2
## 11 West Suffolk 19.9 81.1
## 12 West Suffolk 19.9 81.1
## 13 East Suffolk 19.5 80.4
## 14 East Suffolk 19.7 80.4
## 15 East Suffolk 19.7 80.4
## 16 East Suffolk 19.5 80.4
## 17 East Suffolk 19.7 80.4
## 18 East Suffolk 19.7 80.4
Finally, select unique rows as we have generated duplicated entries in the previous step
lifeexpect_time = lifeexpect_time %>% distinct()
Out of this dataset we will extract information regarding age statistics and population size
tail(population)
## v4_0 calendar.years time admin.geography geography
## 1897771 194462 2018 2018 E06000049 Cheshire East
## 1897772 56865 2018 2018 E07000112 Shepway
## 1897773 48444 2018 2018 E07000142 West Lindsey
## 1897774 42731 2018 2018 E07000236 Redditch
## 1897775 51607 2018 2018 E07000237 Worcester
## 1897776 48885 2018 2018 S12000030 Stirling
## mid.year.pop.sex sex mid.year.pop.age age
## 1897771 2 Female total Total
## 1897772 2 Female total Total
## 1897773 2 Female total Total
## 1897774 2 Female total Total
## 1897775 2 Female total Total
## 1897776 2 Female total Total
We notice below that the column containing the population estimates has no missing values, which is great
sum(is.na(population$v4_0)) # no missing values, thank god, no need for imputation
## [1] 0
population = population %>%
select(-c(calendar.years,
mid.year.pop.sex,
mid.year.pop.age))%>%
rename(areaName = geography,
areaCode = admin.geography,
Value = v4_0,
year = time)
In this case we update the codes before tidying because we are going to perform aggregations later
population = updateCodes(population)
Distribution of age coloured by gender for 9 random local authority districts
population %>%
filter(year == 2018) %>%
filter(sex != 'All',
age != 'Total',
areaName %in% sample(areaName, 9)) %>%
mutate(age = as.numeric(ifelse(age == '90+', 90, age))) %>%
ggplot(aes(x = age,
y = Value,
fill = sex))+
geom_col()+
facet_wrap(vars(areaName), scales = 'free')+
scale_x_continuous(breaks = seq(0,100, by = 20))
Check if there are any holes in the age records
unique_ages = data.matrix(unique(population %>%
filter(age != 'Total') %>%
mutate(age = as.numeric(ifelse(age == '90+', 90, age))) %>%
select(age)))
all(unique_ages %in% c(0:90))
## [1] TRUE
For population I can extract:

* age info:
  * mean
  * median
  * mode: when calculating the mode, sometimes, for some miracle of the universe, the frequency is tied between two age groups for a given sex (see Wrexham in the population df, which has 1015 males at both 46 and 47 y/o); in these cases I take the mean of the tied ages (i.e. 46.5 in the above example)
  * stdev
  * range (max and min): in practice all LADs have at least one 0 year old and one 90+ year old
* age bins: the number of people in each age bin
All of these are computed both stratified by gender and unstratified.
We define the age bins arbitrarily to represent: children and teens (0-16 y/o), young adults (17-29 y/o), adults (30-63 y/o), old (64-80) and very old (over 80).
To calculate statistics like the median and standard deviation I 'explode' the age column with the rep() function, repeating each age according to its count in the Value column, and then compute the statistic on the expanded vector
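A quick illustration of the rep() expansion, with toy numbers:

# rep(c(20, 21, 22), c(3, 1, 2)) expands to 20 20 20 21 22 22,
# so median() and sd() can be applied to the expanded vector directly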
age_time = population %>%
filter(age != 'Total' & sex != 'All') %>%
mutate(age = as.numeric(ifelse(age == '90+', 90, age))) %>%
group_by(year,
areaCode,
areaName,
sex) %>%
summarise(TotalPeople = sum(Value),
MeanAge = sum(age*Value)/TotalPeople,
MedianAge = median(rep(age,Value)),
ModeAge = mean(age[which(Value == max(Value))]),
stdevAge = sd(rep(age,Value)),
NumberOfTeens = sum(Value[which(age<17)]),
NumberOfYoungAdults = sum(Value[which(age>=17 & age<30)]),
NumberOfAdults = sum(Value[which(age>=30 & age<64)]),
NumberOfOld = sum(Value[which(age>=64 & age<81)]),
NumberOfVeryOld = sum(Value[which(age>80)]))
age = age_time %>% filter(year == 2018)
age_time = age_time %>%
pivot_wider(names_from = sex,
values_from = c(TotalPeople,
MeanAge,
MedianAge,
ModeAge,
stdevAge,
NumberOfTeens,
NumberOfYoungAdults,
NumberOfAdults,
NumberOfOld,
NumberOfVeryOld)
)
age = age %>%
pivot_wider(names_from = sex,
values_from = c(TotalPeople,
MeanAge,
MedianAge,
ModeAge,
stdevAge,
NumberOfTeens,
NumberOfYoungAdults,
NumberOfAdults,
NumberOfOld,
NumberOfVeryOld)
)
agetotal_time = population %>%
filter(age != 'Total' & sex != 'All') %>%
mutate(age = as.numeric(ifelse(age == '90+', 90, age))) %>%
group_by(year,
areaCode,
areaName) %>% # note we are not aggregating by sex
summarise(TotalPeople = sum(Value),
MeanAge = sum(age*Value)/TotalPeople,
MedianAge = median(rep(age,Value)),
ModeAge = mean(age[which(Value == max(Value))]),
stdevAge = sd(rep(age,Value)),
NumberOfTeens = sum(Value[which(age<17)]),
NumberOfYoungAdults = sum(Value[which(age>=17 & age<30)]),
NumberOfAdults = sum(Value[which(age>=30 & age<64)]),
NumberOfOld = sum(Value[which(age>=64 & age<81)]),
NumberOfVeryOld = sum(Value[which(age>80)]))
agetotal = agetotal_time %>% filter(year == 2018)
colnames(agetotal_time)[4:ncol(agetotal_time)] = paste0(
colnames(agetotal_time)[4:ncol(agetotal_time)], '_All')
colnames(agetotal)[4:ncol(agetotal)] = paste0(
colnames(agetotal)[4:ncol(agetotal)], '_All')
popclean_time = merge(age_time, agetotal_time, by = c('year','areaCode','areaName'))
popclean = merge(age, agetotal, by = c('year','areaCode','areaName'))
Pass through custom function to update old codes, then combine the entries with matching area name and area code
popclean = updateCodes(popclean)
combs_dup = popclean %>%
group_by(areaCode, areaName,year) %>% summarise(duplicat = n()>1)
# In this case luckily we had no bad entries
print(
popclean[popclean$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(3,6,7)])
## [1] areaName MeanAge_Female MeanAge_Male
## <0 rows> (or 0-length row.names)
popclean_time = updateCodes(popclean_time)
combs_dup = popclean_time %>%
group_by(areaCode, areaName,year) %>% summarise(duplicat = n()>1)
print(
popclean_time[popclean_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(3,6,7)])
## [1] areaName MeanAge_Female MeanAge_Male
## <0 rows> (or 0-length row.names)
str(suicides)
## 'data.frame': 6392 obs. of 5 variables:
## $ V4_0 : int 25 20 15 6 13 10 11 50 358 21 ...
## $ calendar.years : int 2015 2013 2005 2016 2010 2004 2012 2011 2006 2008 ...
## $ time : int 2015 2013 2005 2016 2010 2004 2012 2011 2006 2008 ...
## $ admin.geography: chr "E08000022" "E06000009" "E08000021" "E07000037" ...
## $ geography : chr "North Tyneside" "Blackpool" "Newcastle upon Tyne" "High Peak" ...
suicides = suicides %>%
select(-calendar.years) %>%
rename(Value = V4_0,
year = time,
areaCode = admin.geography,
areaName = geography)
sum(is.na(suicides$Value)) # no NAs good
## [1] 0
According to the command below, all locations have data available for all years
sum(is.na(suicides %>%
pivot_wider(names_from = year, values_from = Value)))
## [1] 0
suicides %>%
filter(areaName %in% sample(areaName, 9)) %>%
ggplot(aes(x = year, y = Value))+
geom_point()+
facet_wrap(vars(areaName))
suicides = updateCodes(suicides)
combs_dup = suicides %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
suicides[suicides$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(1)])
## integer(0)
suicides_time = suicides %>% rename(NumberOfSuicides = Value)
suicides = suicides_time %>% filter(year == 2018)
tail(wellbeing)
## V4_3 Data.Marking Lower.limit Upper.limit yyyy.yy time admin.geography
## 69115 2.99 2.73 3.25 2013-14 2013-14 S12000019
## 69116 3.02 2.79 3.26 2017-18 2017-18 S12000021
## 69117 2.69 2.44 2.94 2016-17 2016-17 S12000024
## 69118 2.33 2.11 2.55 2016-17 2016-17 S12000026
## 69119 3.12 2.88 3.35 2012-13 2012-13 S12000039
## 69120 2.07 1.61 2.53 2015-16 2015-16 N09000001
## geography wellbeing.measureofwellbeing
## 69115 Midlothian anxiety
## 69116 North Ayrshire anxiety
## 69117 Perth and Kinross anxiety
## 69118 Scottish Borders anxiety
## 69119 West Dunbartonshire anxiety
## 69120 Antrim and Newtownabbey anxiety
## allmeasuresofwellbeing wellbeing.estimate estimate
## 69115 Anxiety average-mean Average (mean)
## 69116 Anxiety average-mean Average (mean)
## 69117 Anxiety average-mean Average (mean)
## 69118 Anxiety average-mean Average (mean)
## 69119 Anxiety average-mean Average (mean)
## 69120 Anxiety average-mean Average (mean)
We observe quite a lot of data missingness here
sum(is.na(wellbeing$V4_3))
## [1] 15914
mean(is.na(wellbeing$V4_3))
## [1] 0.2302373
wellbeing = wellbeing %>%
select(-c(Data.Marking,
yyyy.yy,
wellbeing.measureofwellbeing,
wellbeing.estimate)) %>%
rename(Value = V4_3,
lowerCI = Lower.limit,
upperCI = Upper.limit,
year = time,
areaCode = admin.geography,
areaName = geography,
WellbeingMetric = allmeasuresofwellbeing,
EstimateBin = estimate) %>%
mutate(lowerCI = as.numeric(lowerCI),
upperCI = as.numeric(upperCI))
Same operation as for the lifeexpect table, though here the intervals are only two years long (e.g. 2014-15 becomes 2015)
wellbeing$year = as.numeric(lapply(strsplit(wellbeing$year, '-'), '[[',1))+1
The wellbeing data is very rich: it was gathered by running a survey with 4 self-assessed questions, each scored by the respondent on a scale from 0 to 10. The wellbeing table contains the average score for each category (i.e. r{paste(unique(wellbeing$wellbeing.measureofwellbeing), collapse= ', ')}) as well as the proportion of people in different bins (i.e. r{paste(unique(wellbeing$estimate), collapse= ', ')}). I believe the average score is more interpretable and more concise than the bins, though there are some caveats to this. Read more about the methodology behind these metrics on the ONS site
wellbeing = wellbeing %>% filter(EstimateBin == 'Average (mean)')
sum(is.na(wellbeing$Value)) # 117 missing values
## [1] 117
In the table below, a count of 8 means all entries are missing for all 8 years the variable was measured
wellbeing %>%
group_by(areaCode,
WellbeingMetric) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
group_by(NumberOfMissingEntries,
ProportionOfMissingEntries) %>%
summarise(Count = n()) %>%
kable()
| NumberOfMissingEntries | ProportionOfMissingEntries | Count |
|---|---|---|
| 0 | 0.000 | 1668 |
| 1 | 0.125 | 51 |
| 2 | 0.250 | 1 |
| 8 | 1.000 | 8 |
wellbeing %>%
group_by(year) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
kable()
| year | NumberOfMissingEntries | ProportionOfMissingEntries |
|---|---|---|
| 2012 | 52 | 0.0300926 |
| 2013 | 8 | 0.0046296 |
| 2014 | 8 | 0.0046296 |
| 2015 | 9 | 0.0052083 |
| 2016 | 8 | 0.0046296 |
| 2017 | 8 | 0.0046296 |
| 2018 | 12 | 0.0069444 |
| 2019 | 12 | 0.0069444 |
wellbeing %>%
filter(areaName %in% sample(areaName, 9)) %>%
ggplot(aes(x = year,
y = Value,
color = WellbeingMetric)) +
geom_point() +
facet_wrap(vars(areaName))
The LADs with all values missing are the City of London and the Isles of Scilly; these two often have no data available from the ONS.

#### Time-series imputation

Order the data before imputation (just in case)
wellbeing = wellbeing %>% arrange(areaCode, WellbeingMetric, year)
wellbeing = time_impute(wellbeing, areaCode, WellbeingMetric)
## [1] "Missingness before imputation (%):"
## Value lowerCI upperCI
## 0.008463542 0.008463542 0.008463542
## Time difference of 19.19313 secs
## [1] "Missingness after imputation (%):"
## Value lowerCI upperCI
## 0.004629630 0.008463542 0.008463542
wellbeing %>%
group_by(areaCode,
WellbeingMetric) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
group_by(NumberOfMissingEntries,
ProportionOfMissingEntries) %>%
summarise(Count = n()) %>%
kable()
| NumberOfMissingEntries | ProportionOfMissingEntries | Count |
|---|---|---|
| 0 | 0 | 1720 |
| 8 | 1 | 8 |
(Updated) missingness by year
wellbeing %>%
group_by(year) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value))) %>%
kable()
| year | NumberOfMissingEntries |
|---|---|
| 2012 | 8 |
| 2013 | 8 |
| 2014 | 8 |
| 2015 | 8 |
| 2016 | 8 |
| 2017 | 8 |
| 2018 | 8 |
| 2019 | 8 |
wellbeing_time = wellbeing %>%
mutate(Range = upperCI - lowerCI) %>%
select(Value,
Range,
year,
areaCode,
areaName,
WellbeingMetric) %>%
pivot_wider(names_from = WellbeingMetric,
values_from = c(Value, Range))
wellbeing_time = updateCodes(wellbeing_time)
combs_dup = wellbeing_time %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
wellbeing_time[wellbeing_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(1,2,3,4)] %>%
arrange(areaCode, areaName,year), n = 100)
## # A tibble: 48 x 4
## year areaCode areaName Value_Anxiety
## <dbl> <chr> <chr> <dbl>
## 1 2012 E07000244 East Suffolk 2.92
## 2 2012 E07000244 East Suffolk 3.27
## 3 2013 E07000244 East Suffolk 3.11
## 4 2013 E07000244 East Suffolk 2.51
## 5 2014 E07000244 East Suffolk 3.2
## 6 2014 E07000244 East Suffolk 2.96
## 7 2015 E07000244 East Suffolk 2.9
## 8 2015 E07000244 East Suffolk 3.13
## 9 2016 E07000244 East Suffolk 2.47
## 10 2016 E07000244 East Suffolk 2.81
## 11 2017 E07000244 East Suffolk 2.66
## 12 2017 E07000244 East Suffolk 2.78
## 13 2018 E07000244 East Suffolk 2.78
## 14 2018 E07000244 East Suffolk 2.69
## 15 2019 E07000244 East Suffolk 2.45
## 16 2019 E07000244 East Suffolk 3.1
## 17 2012 E07000245 West Suffolk 2.74
## 18 2012 E07000245 West Suffolk 3.18
## 19 2013 E07000245 West Suffolk 2.72
## 20 2013 E07000245 West Suffolk 3.1
## 21 2014 E07000245 West Suffolk 3.14
## 22 2014 E07000245 West Suffolk 2.58
## 23 2015 E07000245 West Suffolk 3.51
## 24 2015 E07000245 West Suffolk 2.47
## 25 2016 E07000245 West Suffolk 2.81
## 26 2016 E07000245 West Suffolk 3.12
## 27 2017 E07000245 West Suffolk 3.32
## 28 2017 E07000245 West Suffolk 2.29
## 29 2018 E07000245 West Suffolk 2.77
## 30 2018 E07000245 West Suffolk 2.63
## 31 2019 E07000245 West Suffolk 2.46
## 32 2019 E07000245 West Suffolk 2.23
## 33 2012 E07000246 Somerset West and Taunton 2.79
## 34 2012 E07000246 Somerset West and Taunton 3.31
## 35 2013 E07000246 Somerset West and Taunton 3.24
## 36 2013 E07000246 Somerset West and Taunton 3.08
## 37 2014 E07000246 Somerset West and Taunton 3.1
## 38 2014 E07000246 Somerset West and Taunton 2.19
## 39 2015 E07000246 Somerset West and Taunton 2.97
## 40 2015 E07000246 Somerset West and Taunton 2.55
## 41 2016 E07000246 Somerset West and Taunton 2.56
## 42 2016 E07000246 Somerset West and Taunton 2.92
## 43 2017 E07000246 Somerset West and Taunton 3.1
## 44 2017 E07000246 Somerset West and Taunton 2.3
## 45 2018 E07000246 Somerset West and Taunton 2.46
## 46 2018 E07000246 Somerset West and Taunton 2.67
## 47 2019 E07000246 Somerset West and Taunton 2.65
## 48 2019 E07000246 Somerset West and Taunton 2.67
Many duplicates now
This loop combines duplicated entries by taking their mean
for (row in which(combs_dup$duplicat==T)){
entry = combs_dup[row,]
wellbeing_time[(wellbeing_time$areaCode == entry$areaCode &
wellbeing_time$areaName == entry$areaName &
wellbeing_time$year == entry$year),
!colnames(wellbeing_time) %in% c('areaCode','areaName', 'year')] =
t(apply(wellbeing_time[(wellbeing_time$areaCode == entry$areaCode &
wellbeing_time$areaName == entry$areaName &
wellbeing_time$year == entry$year),
!colnames(wellbeing_time) %in% c('areaCode','areaName', 'year')],2,mean))
}
print(
wellbeing_time[wellbeing_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(1,2,3,4)])
## # A tibble: 48 x 4
## year areaCode areaName Value_Anxiety
## <dbl> <chr> <chr> <dbl>
## 1 2012 E07000246 Somerset West and Taunton 3.05
## 2 2013 E07000246 Somerset West and Taunton 3.16
## 3 2014 E07000246 Somerset West and Taunton 2.64
## 4 2015 E07000246 Somerset West and Taunton 2.76
## 5 2016 E07000246 Somerset West and Taunton 2.74
## 6 2017 E07000246 Somerset West and Taunton 2.7
## 7 2018 E07000246 Somerset West and Taunton 2.56
## 8 2019 E07000246 Somerset West and Taunton 2.66
## 9 2012 E07000246 Somerset West and Taunton 3.05
## 10 2013 E07000246 Somerset West and Taunton 3.16
## # … with 38 more rows
Finally, select unique rows as we have generated duplicated entries in the previous step
wellbeing_time = wellbeing_time %>% distinct()
wellbeing = wellbeing_time %>%
filter(year == 2019)
The ASHE work table contains a column called coefficient of variation (CV), an estimate of the variation within the sample that helps evaluate whether the sample is a good representation of the population; an explanation of the CV column can be found here. Moreover, in this section we have two tables, work and work_ts: the first contains data for 2019 and the second contains data from earlier years, which we use to impute missing 2019 values.
str(work)
## 'data.frame': 1002672 obs. of 17 variables:
## $ V4_2 : num 27.5 38.4 37.5 37.5 39.9 40.1 36 35 NA 20.3 ...
## $ Data.Marking : chr "" "" "" "" ...
## $ CV : chr "0.1" "1.7" "0.0" "0.1" ...
## $ calendar.years : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ time : int 2019 2019 2019 2019 2019 2019 2019 2019 2019 2019 ...
## $ admin.geography : chr "K02000001" "E06000002" "K02000001" "E92000001" ...
## $ geography : chr "United Kingdom" "Middlesbrough" "United Kingdom" "England" ...
## $ ashe.statistics : chr "25" "80" "70" "70" ...
## $ statistics : chr "25th percentile" "80th percentile" "70th percentile" "70th percentile" ...
## $ ashe.sex : chr "all" "all" "all" "all" ...
## $ sex : chr "All" "All" "All" "All" ...
## $ ashe.working.pattern : chr "all" "all" "all" "all" ...
## $ workingpattern : chr "All" "All" "All" "All" ...
## $ ashe.hours.and.earnings : chr "paid-hours-worked-basic" "paid-hours-worked-basic" "paid-hours-worked-basic" "paid-hours-worked-basic" ...
## $ hoursandearnings : chr "Paid hours worked - Basic" "Paid hours worked - Basic" "Paid hours worked - Basic" "Paid hours worked - Basic" ...
## $ ashe.workplace.or.residence: chr "workplace" "workplace" "workplace" "workplace" ...
## $ workplaceorresidence : chr "Workplace" "Workplace" "Workplace" "Workplace" ...
work = work %>%
select(-c(Data.Marking,
calendar.years,
ashe.working.pattern,
ashe.sex,
ashe.statistics,
ashe.workplace.or.residence,
ashe.hours.and.earnings,
ashe.working.pattern)) %>%
rename(Value = V4_2,
year = time,
areaCode = admin.geography,
areaName = geography,
CoefficientOfVariation = CV) %>%
mutate(CoefficientOfVariation = as.numeric(CoefficientOfVariation))
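As noted above, the CV expresses sampling variability relative to the estimate. Assuming the usual ASHE definition (standard error expressed as a percentage of the estimate), an approximate standard error could be recovered from it, e.g. (a sketch, not run here):

# work %>% mutate(approxSE = Value * CoefficientOfVariation / 100)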
unique(work$hoursandearnings)
## [1] "Paid hours worked - Basic" "Paid hours worked - Overtime"
## [3] "Weekly pay - Excluding overtime" "Basic pay - Including other pay"
## [5] "Weekly pay - Gross" "Overtime pay"
## [7] "Hourly pay - Gross" "Annual pay - Gross"
## [9] "Hourly pay - Excluding overtime" "Annual pay - Incentive"
## [11] "Paid hours worked - Total"
unique(work$workplaceorresidence)
## [1] "Workplace" "Residence"
unique(work$workingpattern)
## [1] "All" "Full-Time" "Part-Time"
unique(work$statistics)
## [1] "25th percentile" "80th percentile" "70th percentile" "90th percentile"
## [5] "30th percentile" "40th percentile" "60th percentile" "20th percentile"
## [9] "Median" "75th percentile" "10th percentile" "Mean"
More statistics are available than we need, as seen above; we select only the mean, median, and the 10th and 90th percentiles
work = work %>%
filter(statistics %in% c('Mean', 'Median', '10th percentile', '90th percentile'),
workplaceorresidence == 'Workplace')
From the summary below we see that hourly pay has the least missingness, so we'll go with that one
work %>%
group_by(hoursandearnings) %>%
summarise(NumberOfMissingEntries = mean(is.na(Value))) %>%
kable()
| hoursandearnings | NumberOfMissingEntries |
|---|---|
| Annual pay - Gross | 0.4504344 |
| Annual pay - Incentive | 0.9311480 |
| Basic pay - Including other pay | 0.3344523 |
| Hourly pay - Excluding overtime | 0.2819905 |
| Hourly pay - Gross | 0.2815298 |
| Overtime pay | 0.8890864 |
| Paid hours worked - Basic | 0.3138494 |
| Paid hours worked - Overtime | 0.8761190 |
| Paid hours worked - Total | 0.3143102 |
| Weekly pay - Excluding overtime | 0.3342549 |
| Weekly pay - Gross | 0.3350448 |
Now we evaluate the missingness of each earning metric per statistic, to see if some combination of these is missing at an unacceptable rate
work %>%
group_by(hoursandearnings, statistics) %>%
summarise(NumberOfMissingEntries = mean(is.na(Value))) %>%
ggplot(aes(x = hoursandearnings,
y = statistics,
fill = NumberOfMissingEntries,
label = round(NumberOfMissingEntries,2)))+
geom_tile()+
geom_text(color = 'white')+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
This last graph reveals that the 90th and 10th percentile statistics have too much missingness, hence I am dropping them; it also shows that hourly pay is the best covered metric. Based on this graph I decided to keep only gross hourly pay with the mean and median statistics.
work = work %>%
filter(hoursandearnings == 'Hourly pay - Gross',
statistics %in% c('Median', 'Mean'))
Given that the hoursandearnings and workplaceorresidence columns now each contain a single unique value, I drop them
work = work %>% select(-c(hoursandearnings, workplaceorresidence))
work_ts = work_ts %>%
select(-c(Data_Marking,
calendar.years,
ashe.working.pattern,
ashe.sex,
ashe.statistics,
ashe.workplace.or.residence,
ashe.hours.and.earnings,
ashe.working.pattern)) %>%
rename(Value = V4_2,
year = time,
areaCode = admin.geography,
areaName = geography,
CoefficientOfVariation = CV) %>%
mutate(CoefficientOfVariation = as.numeric(CoefficientOfVariation))
Filter the same way as for the work dataframe
work_ts = work_ts %>%
filter(statistics %in% c('Mean', 'Median', '10th percentile', '90th percentile'),
workplaceorresidence == 'Workplace')
work_ts = work_ts %>%
filter(hoursandearnings == 'Hourly pay - Gross',
statistics %in% c('Median', 'Mean'))
work_ts = work_ts %>% select(-c(hoursandearnings, workplaceorresidence))
unique(work$areaName[!(work$areaName %in% work_ts$areaName)])
## [1] "East Suffolk" "West Suffolk"
## [3] "Bournemouth, Christchurch and Poole" "Somerset West and Taunton"
Now merge work (data from 2019 only) and work_ts (2016 through 2018)
work = rbind(work, work_ts)
I believe not all areas have entries for all years; in theory there should be 4 entries (2016-19) per combination of areaCode, statistic, sex and working pattern
work %>%
group_by(areaCode,
statistics,
sex,
workingpattern) %>%
summarise(NumberOfYearsWithAvailableData = n()) %>% ## this takes the numbers
# of entries per unique combination of the variables in the group_by
group_by(NumberOfYearsWithAvailableData) %>%
summarise(Count = n()) %>%
kable()
| NumberOfYearsWithAvailableData | Count |
|---|---|
| 1 | 126 |
| 3 | 306 |
| 4 | 7470 |
As you can see in the table above, some combinations have only 3 available entries, so 3 missing entries represent 100%, whereas in other cases 4 missing entries represent 100%
work %>%
group_by(areaCode,
statistics,
sex,
workingpattern) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
group_by(NumberOfMissingEntries,
ProportionOfMissingEntries) %>%
summarise(Count = n()) %>%
kable()
| NumberOfMissingEntries | ProportionOfMissingEntries | Count |
|---|---|---|
| 0 | 0.0000000 | 7384 |
| 1 | 0.2500000 | 213 |
| 1 | 0.3333333 | 7 |
| 1 | 1.0000000 | 1 |
| 2 | 0.5000000 | 138 |
| 2 | 0.6666667 | 13 |
| 3 | 0.7500000 | 81 |
| 3 | 1.0000000 | 6 |
| 4 | 1.0000000 | 59 |
Missingness by year
work %>%
group_by(year) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
kable()
| year | NumberOfMissingEntries | ProportionOfMissingEntries |
|---|---|---|
| 2016 | 266 | 0.0342078 |
| 2017 | 258 | 0.0331790 |
| 2018 | 265 | 0.0340792 |
| 2019 | 231 | 0.0304107 |
Make sure data is ordered by year
work = work %>% arrange(areaCode, statistics, sex, workingpattern, year)
work = time_impute(work, cols_to_impute = c('Value', 'CoefficientOfVariation'),
areaCode, statistics, sex, workingpattern)
## [1] "Missingness before imputation (%):"
## Value CoefficientOfVariation
## 0.03298409 0.01442246
## Time difference of 1.956548 mins
## [1] "Missingness after imputation (%):"
## Value CoefficientOfVariation
## 0.008246023 0.014422455
work %>%
group_by(areaCode,
statistics,
sex,
workingpattern) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
group_by(NumberOfMissingEntries,
ProportionOfMissingEntries) %>%
summarise(Count = n())
## # A tibble: 4 x 3
## # Groups: NumberOfMissingEntries [4]
## NumberOfMissingEntries ProportionOfMissingEntries Count
## <int> <dbl> <int>
## 1 0 0 7836
## 2 1 1 1
## 3 3 1 6
## 4 4 1 59
work %>%
group_by(year) %>%
summarise(NumberOfMissingEntries = sum(is.na(Value)),
ProportionOfMissingEntries = mean(is.na(Value))) %>%
kable()
| year | NumberOfMissingEntries | ProportionOfMissingEntries |
|---|---|---|
| 2016 | 65 | 0.0083591 |
| 2017 | 65 | 0.0083591 |
| 2018 | 65 | 0.0083591 |
| 2019 | 60 | 0.0078989 |
Finally, we have to be careful with the entries whose records end in 2018; we need to characterise them
work %>%
group_by(areaName) %>%
summarise(latestAvailableYear = max(year)) %>%
group_by(latestAvailableYear) %>%
summarise(Count = n()) %>%
kable()
| latestAvailableYear | Count |
|---|---|
| 2018 | 14 |
| 2019 | 421 |
Note: interestingly, grouping by areaCode gives slightly different numbers than grouping by areaName
work %>%
group_by(areaCode) %>%
summarise(latestAvailableYear = max(year)) %>%
group_by(latestAvailableYear) %>%
summarise(Count = n()) %>%
kable()
| latestAvailableYear | Count |
|---|---|
| 2018 | 17 |
| 2019 | 422 |
Check count per number of available values
work %>%
group_by(areaCode,
areaName,
statistics,
sex,
workingpattern) %>%
summarise(NumberOfValues = n()) %>%
group_by(NumberOfValues) %>%
summarise(Count = n()) %>%
kable()
| NumberOfValues | Count |
|---|---|
| 1 | 126 |
| 3 | 306 |
| 4 | 7470 |
Check also which area names appear under more than one area code (and hence have more than one 'latest' year)
tst = work %>% group_by(areaCode, areaName) %>% summarise(latest = max(year))
tab = table(tst$areaName)
tab[tab>1]
##
## Dorset Glasgow City North Lanarkshire West Midlands
## 2 2 2 2
OK, so some codes have been updated, which is fine; for example, the Dorset code from 2018 corresponds to one that has since been deprecated. What we'll do now is, for each area name, choose a single area code (the one corresponding to the latest year)
work %>% group_by(areaCode, areaName) %>% summarise(year = max(year)) %>% filter(year == 2018)
## # A tibble: 17 x 3
## # Groups: areaCode [17]
## areaCode areaName year
## <chr> <chr> <int>
## 1 E06000028 Bournemouth 2018
## 2 E06000029 Poole 2018
## 3 E07000048 Christchurch 2018
## 4 E07000049 East Dorset 2018
## 5 E07000050 North Dorset 2018
## 6 E07000051 Purbeck 2018
## 7 E07000052 West Dorset 2018
## 8 E07000053 Weymouth and Portland 2018
## 9 E07000190 Taunton Deane 2018
## 10 E07000191 West Somerset 2018
## 11 E07000201 Forest Heath 2018
## 12 E07000204 St Edmundsbury 2018
## 13 E07000205 Suffolk Coastal 2018
## 14 E07000206 Waveney 2018
## 15 E10000009 Dorset 2018
## 16 S12000044 North Lanarkshire 2018
## 17 S12000046 Glasgow City 2018
It further appears that many of the area names whose latest figures come from 2018 correspond to now 'abolished' (i.e. inactive) authorities, see Suffolk Coastal or East Dorset. So I have not checked all 14 entries which have missing data; the robust way to do this would be filtering with the NSPL (unclear what I meant in this comment)
length(unique(work$areaName))
## [1] 435
length(unique(work$areaCode))
## [1] 439
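One way to implement the 'one area code per area name' rule described above, keeping for each name only the code seen in the latest year (a sketch, not run here):

# latest_codes = work %>%
#   group_by(areaName, areaCode) %>%
#   summarise(latest = max(year)) %>%
#   group_by(areaName) %>%
#   filter(latest == max(latest)) %>%   # keep only the code(s) seen in the latest year
#   select(areaName, areaCode)
# work = work %>% semi_join(latest_codes, by = c('areaName', 'areaCode'))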
A final problem, which is not really a problem: because the work table has data on regions (also UK-wide, England-wide…) as well as local authorities, if a region happens to have the same name as a LA then it will appear twice (e.g. West Midlands: E12000005 is the region code and E11000005 the LA code).

#### Gather final data
work_time = work %>%
rename(Pay = Value,
CoV = CoefficientOfVariation) %>%
pivot_wider(names_from = c(statistics,
workingpattern,
sex),
values_from = c(Pay,
CoV))
work_time = updateCodes(work_time)
combs_dup = work_time %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
work_time[work_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(1,2,3,4)] %>%
arrange(areaCode, areaName,year), n = 100)
## # A tibble: 21 x 4
## year areaCode areaName Pay_Mean_All_All
## <int> <chr> <chr> <dbl>
## 1 2016 E07000244 East Suffolk 15.1
## 2 2016 E07000244 East Suffolk 12.1
## 3 2017 E07000244 East Suffolk 15.1
## 4 2017 E07000244 East Suffolk 12.3
## 5 2018 E07000244 East Suffolk 14.1
## 6 2018 E07000244 East Suffolk 13.5
## 7 2019 E07000244 East Suffolk 15.0
## 8 2016 E07000245 West Suffolk 13.0
## 9 2016 E07000245 West Suffolk 13.8
## 10 2017 E07000245 West Suffolk 13.9
## 11 2017 E07000245 West Suffolk 13.7
## 12 2018 E07000245 West Suffolk 13.6
## 13 2018 E07000245 West Suffolk 14.4
## 14 2019 E07000245 West Suffolk 14.9
## 15 2016 E07000246 Somerset West and Taunton 14.0
## 16 2016 E07000246 Somerset West and Taunton 16.0
## 17 2017 E07000246 Somerset West and Taunton 13.7
## 18 2017 E07000246 Somerset West and Taunton 16.3
## 19 2018 E07000246 Somerset West and Taunton 13.9
## 20 2018 E07000246 Somerset West and Taunton 17.3
## 21 2019 E07000246 Somerset West and Taunton 15.9
Many duplicates now
This loop combines duplicated entries by taking their mean
for (row in which(combs_dup$duplicat==T)){
entry = combs_dup[row,]
work_time[(work_time$areaCode == entry$areaCode &
work_time$areaName == entry$areaName &
work_time$year == entry$year),
!colnames(work_time) %in% c('areaCode','areaName', 'year')] =
t(apply(work_time[(work_time$areaCode == entry$areaCode &
work_time$areaName == entry$areaName &
work_time$year == entry$year),
!colnames(work_time) %in% c('areaCode','areaName', 'year')],2,mean))
}
print(
work_time[work_time$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(1,2,3,4)])
## # A tibble: 21 x 4
## year areaCode areaName Pay_Mean_All_All
## <int> <chr> <chr> <dbl>
## 1 2016 E07000246 Somerset West and Taunton 15.0
## 2 2017 E07000246 Somerset West and Taunton 15.0
## 3 2018 E07000246 Somerset West and Taunton 15.6
## 4 2016 E07000246 Somerset West and Taunton 15.0
## 5 2017 E07000246 Somerset West and Taunton 15.0
## 6 2018 E07000246 Somerset West and Taunton 15.6
## 7 2016 E07000245 West Suffolk 13.4
## 8 2017 E07000245 West Suffolk 13.8
## 9 2018 E07000245 West Suffolk 14.0
## 10 2016 E07000245 West Suffolk 13.4
## # … with 11 more rows
Finally, select unique rows as we have generated duplicated entries in the previous step
work_time = work_time %>% distinct()
work = work_time %>% filter(year == 2019)
str(gdp)
## tibble [382 × 24] (S3: tbl_df/tbl/data.frame)
## $ NUTS1 Region: chr [1:382] "North East" "North East" "North East" "North East" ...
## $ LA code : chr [1:382] "E06000001" "E06000002" "E06000003" "E06000004" ...
## $ LA name : chr [1:382] "Hartlepool" "Middlesbrough" "Redcar and Cleveland" "Stockton-on-Tees" ...
## $ 1998 : num [1:382] 1022 1693 1799 2994 1634 ...
## $ 1999 : num [1:382] 1073 1799 2229 3105 1668 ...
## $ 2000 : num [1:382] 1081 1828 2131 3170 1796 ...
## $ 2001 : num [1:382] 1041 1837 2034 3194 1924 ...
## $ 2002 : num [1:382] 1102 1925 2102 3400 2067 ...
## $ 2003 : num [1:382] 1160 2007 2256 3681 2184 ...
## $ 2004 : num [1:382] 1203 2189 2280 3889 2345 ...
## $ 2005 : num [1:382] 1237 2303 2176 4046 2335 ...
## $ 2006 : num [1:382] 1299 2466 2200 4358 2387 ...
## $ 2007 : num [1:382] 1349 2564 2256 4518 2336 ...
## $ 2008 : num [1:382] 1428 2646 2339 4681 2407 ...
## $ 2009 : num [1:382] 1403 2666 2316 4845 2297 ...
## $ 2010 : num [1:382] 1456 2669 2620 4788 2450 ...
## $ 2011 : num [1:382] 1508 2733 2667 4817 2522 ...
## $ 2012 : num [1:382] 1496 2842 2792 4732 2547 ...
## $ 2013 : num [1:382] 1535 2728 2808 4847 2771 ...
## $ 2014 : num [1:382] 1557 2762 2544 5448 2937 ...
## $ 2015 : num [1:382] 1636 2894 2061 6121 3090 ...
## $ 2016 : num [1:382] 1731 3112 2005 6126 2916 ...
## $ 2017 : num [1:382] 1728 3215 2061 6040 3223 ...
## $ 20183 : num [1:382] 1732 3388 2159 5885 3076 ...
sum(is.na(gdp$`20183`)) ## good news (note the 2018 column was read in as `20183`)
## [1] 0
gdp = gdp %>% rename(Region = `NUTS1 Region`,
areaCode = `LA code`,
areaName = `LA name`) %>%
pivot_longer(-c(Region, areaCode, areaName),
names_to = 'year',
values_to = 'GDP(millionGBP)') %>%
mutate(year = as.numeric(year))
Fix the year format: the 2018 column name kept the footnote superscript from the Excel sheet, so it was read in as 20183
gdp$year = ifelse(gdp$year == 20183, 2018, gdp$year)
gdp = updateCodes(gdp)
combs_dup = gdp %>%
group_by(areaCode, areaName, year) %>% summarise(duplicat = n()>1)
print(
gdp[gdp$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(2,3,4)] %>%
arrange(areaCode, areaName,year), n = 100)
## # A tibble: 0 x 3
## # … with 3 variables: areaCode <chr>, areaName <chr>, year <dbl>
No duplicates this time (the table above has 0 rows). The loop below would combine duplicated entries by taking their sum (GDP being additive), but here it has nothing to do
for (row in which(combs_dup$duplicat==T)){
entry = combs_dup[row,]
gdp[(gdp$areaCode == entry$areaCode &
gdp$areaName == entry$areaName &
gdp$year == entry$year),
!colnames(gdp) %in% c('areaCode','areaName', 'year')] =
t(apply(gdp[(gdp$areaCode == entry$areaCode &
gdp$areaName == entry$areaName &
gdp$year == entry$year),
!colnames(gdp) %in% c('areaCode','areaName', 'year')],2,sum))
}
print(
gdp[gdp$areaCode %in%
combs_dup$areaCode[combs_dup$duplicat==T],c(2,3,4, 5)] %>%
arrange(areaCode, areaName,year), n = 100)
## # A tibble: 0 x 4
## # … with 4 variables: areaCode <chr>, areaName <chr>, year <dbl>,
## # `GDP(millionGBP)` <dbl>
Check what min and max years are
min(gdp$year)
## [1] 1998
max(gdp$year)
## [1] 2018
gdp_time = gdp
gdp = gdp_time %>% filter(year == max(year))
This one is easy to handle as it is almost tidy; we just have to choose the column we are interested in, which is the one with the unemployment rate in 2019. Because it is already 2019 data, we don't even need to update the LAD codes
str(unemployment)
## tibble [372 × 209] (S3: tbl_df/tbl/data.frame)
## $ UALAD : chr [1:372] NA NA "County Durham" "Darlington" ...
## $ ...2 : chr [1:372] NA NA "E06000047" "E06000005" ...
## $ ...3 : logi [1:372] NA NA NA NA NA NA ...
## $ 1996/97 : chr [1:372] "Rate" NA ":" "9.3000000000000007" ...
## $ ...5 : chr [1:372] "+/- (%)2" NA ":" "2" ...
## $ ...6 : logi [1:372] NA NA NA NA NA NA ...
## $ 1997/98 : chr [1:372] "Rate" NA ":" "8" ...
## $ ...8 : chr [1:372] "+/- (%)2" NA ":" "1.8" ...
## $ ...9 : logi [1:372] NA NA NA NA NA NA ...
## $ 1998/99 : chr [1:372] "Rate" NA ":" "7" ...
## $ ...11 : chr [1:372] "+/- (%)2" NA ":" "1.8" ...
## $ ...12 : logi [1:372] NA NA NA NA NA NA ...
## $ 1999/2000 : chr [1:372] "Rate" NA ":" "8.0999999999999996" ...
## $ ...14 : chr [1:372] "+/- (%)2" NA ":" "2" ...
## $ ...15 : logi [1:372] NA NA NA NA NA NA ...
## $ 2000/01 : chr [1:372] "Rate" NA ":" "6.9000000000000004" ...
## $ ...17 : chr [1:372] "+/- (%)2" NA ":" "1.1000000000000001" ...
## $ ...18 : logi [1:372] NA NA NA NA NA NA ...
## $ 2001/02 : chr [1:372] "Rate" NA ":" "6.5999999999999996" ...
## $ ...20 : chr [1:372] "+/- (%)2" NA ":" "1.1000000000000001" ...
## $ ...21 : logi [1:372] NA NA NA NA NA NA ...
## $ 2002/03 : chr [1:372] "Rate" NA ":" "5.5999999999999996" ...
## $ ...23 : chr [1:372] "+/- (%)2" NA ":" "1" ...
## $ ...24 : logi [1:372] NA NA NA NA NA NA ...
## $ 2003/04 : chr [1:372] "Rate" NA ":" "5" ...
## $ ...26 : chr [1:372] "+/- (%)2" NA ":" "0.90000000000000002" ...
## $ ...27 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2004 to Dec 2004: chr [1:372] "Rate" NA "4.7999999999999998" "4.4000000000000004" ...
## $ ...29 : chr [1:372] "+/- (%)2" NA ":" "0.80000000000000004" ...
## $ ...30 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2004 to Mar 2005: chr [1:372] "Rate" NA "4.7999999999999998" "4.5" ...
## $ ...32 : chr [1:372] "+/- (%)2" NA ":" "0.90000000000000002" ...
## $ ...33 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2004 to Jun 2005: chr [1:372] "Rate" NA "4.9000000000000004" "4.5999999999999996" ...
## $ ...35 : chr [1:372] "+/- (%)2" NA ":" "0.90000000000000002" ...
## $ ...36 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2004 to Sep 2005: chr [1:372] "Rate" NA "4.7999999999999998" "4.5999999999999996" ...
## $ ...38 : chr [1:372] "+/- (%)2" NA ":" "0.90000000000000002" ...
## $ ...39 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2005 to Dec 2005: chr [1:372] "Rate" NA "4.7000000000000002" "4.7999999999999998" ...
## $ ...41 : chr [1:372] "+/- (%)2" NA ":" "0.90000000000000002" ...
## $ ...42 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2005 to Mar 2006: chr [1:372] "Rate" NA "5.5" "4.90961113" ...
## $ ...44 : chr [1:372] "+/- (%)2" NA ":" "1.0438646008000001" ...
## $ ...45 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2005 to Jun 2006: chr [1:372] "Rate" NA "5.5" "4.7195016299999999" ...
## $ ...47 : chr [1:372] "+/- (%)2" NA ":" "1.0602798555999999" ...
## $ ...48 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2005 to Sep 2006: chr [1:372] "Rate" NA "5.7999999999999998" "4.7410787299999999" ...
## $ ...50 : chr [1:372] "+/- (%)2" NA ":" "1.0497842712000001" ...
## $ ...51 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2006 to Dec 2006: chr [1:372] "Rate (%)" NA "5.9000000000000004" "5.1044561899999996" ...
## $ ...53 : chr [1:372] "+/- (%)2" NA ":" "1.0330758592" ...
## $ ...54 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2006 to Mar 2007: chr [1:372] "Rate (%)" NA "5.5999999999999996" "5.1906668099999997" ...
## $ ...56 : chr [1:372] "+/- (%)2" NA ":" "0.96572381080000003" ...
## $ ...57 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2006 to Jun 2007: chr [1:372] "Rate (%)" NA "5.4000000000000004" "5.5070090500000006" ...
## $ ...59 : chr [1:372] "+/- (%)2" NA ":" "0.88716959799999995" ...
## $ ...60 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2006 to Sep 2007: chr [1:372] "Rate (%)" NA "5.2999999999999998" "5.8317064100000007" ...
## $ ...62 : chr [1:372] "+/- (%)2" NA ":" "0.96383774200000005" ...
## $ ...63 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2007 to Dec 2007: chr [1:372] "Rate (%)" NA "5.0999999999999996" "5.7912505000000003" ...
## $ ...65 : chr [1:372] "+/- (%)2" NA ":" "1.0579039240000001" ...
## $ ...66 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2007 to Mar 2008: chr [1:372] "Rate (%)" NA "5.2000000000000002" "5.9475737200000003" ...
## $ ...68 : chr [1:372] "+/- (%)2" NA ":" "1.0874065888" ...
## $ ...69 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2007 to Jun 2008: chr [1:372] "Rate (%)" NA "5.5" "5.7331496" ...
## $ ...71 : chr [1:372] "+/- (%)2" NA ":" "1.1056296692000001" ...
## $ ...72 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2007 to Sep 2008: chr [1:372] "Rate (%)" NA "5.5999999999999996" "5.8950876999999995" ...
## $ ...74 : chr [1:372] "+/- (%)2" NA ":" "1.0622337992000002" ...
## $ ...75 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2008 to Dec 2008: chr [1:372] "Rate (%)" NA "6.2999999999999998" "6.3165477799999996" ...
## $ ...77 : chr [1:372] "+/- (%)2" NA ":" "1.1078058572" ...
## $ ...78 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2008 to Mar 2009: chr [1:372] "Rate (%)" NA "7" "6.9856878900000003" ...
## $ ...80 : chr [1:372] "+/- (%)2" NA ":" "1.1598450136" ...
## $ ...81 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2008 to Jun 2009: chr [1:372] "Rate (%)" NA "7.7999999999999998" "7.8599533799999994" ...
## $ ...83 : chr [1:372] "+/- (%)2" NA ":" "1.2499008984" ...
## $ ...84 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2008 to Sep 2009: chr [1:372] "Rate (%)" NA "8.1999999999999993" "8.3100855300000003" ...
## $ ...86 : chr [1:372] "+/- (%)2" NA ":" "1.269439472" ...
## $ ...87 : logi [1:372] NA NA NA NA NA NA ...
## $ Jan 2009 to Dec 2009: chr [1:372] "Rate (%)" NA "8.5999999999999996" "8.6434309099999993" ...
## $ ...89 : chr [1:372] "+/- (%)2" NA ":" "1.2252655800000001" ...
## $ ...90 : logi [1:372] NA NA NA NA NA NA ...
## $ Apr 2009 to Mar 2010: chr [1:372] "Rate (%)" NA "8.9000000000000004" "8.9020545699999989" ...
## $ ...92 : chr [1:372] "+/- (%)2" NA ":" "1.2535635112000001" ...
## $ ...93 : logi [1:372] NA NA NA NA NA NA ...
## $ Jul 2009 to Jun 2010: chr [1:372] "Rate (%)" NA "8.8000000000000007" "8.9156750099999993" ...
## $ ...95 : chr [1:372] "+/- (%)2" NA ":" "1.2760016303999999" ...
## $ ...96 : logi [1:372] NA NA NA NA NA NA ...
## $ Oct 2009 to Sep 2010: chr [1:372] "Rate (%)" NA "8.5" "8.508013909999999" ...
## $ ...98 : chr [1:372] "+/- (%)2" NA ":" "1.2405666531999999" ...
## $ ...99 : logi [1:372] NA NA NA NA NA NA ...
## [list output truncated]
unemployment = unemployment[3:nrow(unemployment), -seq(3, ncol(unemployment), 3)] # drop the two header rows and the empty spacer columns
The date format for this table is confusing, so I am not going to take care of the tidying yet
unemployment_time = unemployment[, c(1,2,seq(3, ncol(unemployment), 2))] # area name, area code and the Rate columns
colnames(unemployment_time)[1:2] = c('areaName', 'areaCode')
unemployment_time = unemployment_time %>% pivot_longer(-c(areaName, areaCode),
names_to = 'year',
values_to = 'UnempRate')
unemployment = unemployment[, c(1, 2, ncol(unemployment)-1)] # the last Rate column (the final column is its +/- margin)
colnames(unemployment) = c('areaName', 'areaCode', 'UnempRate')
unemployment$year = 2019
For each dataset the data that has been kept is that of the latest available year; we log what that year is for each dataframe
ONS_time = Reduce(function(x,y) merge(x = x, y = y,
by = c('areaCode', 'areaName', 'year'),
all = TRUE),
list(popclean_time, gdp_time, lifeexpect_time, suicides_time,
unemployment, wellbeing_time, work_time))
write.csv(ONS_time, 'CleanData/ONStime.csv')
## unemployment_time's year format is hard to interpret and would take too much time to preprocess, so the 2019-only unemployment table was used in the merge above instead
years_for_dfs = data.frame(df = character(),
year = integer(),
yearColumn = character())
for (obj in ls()){
if ('data.frame' %in% class(eval(parse(text = obj))) &
(obj != 'years_for_dfs' &
obj != 'tst' &
!grepl('_time', obj) &
obj != 'unemployment') &
obj != 'ONS_time'
){
years_for_dfs[nrow(years_for_dfs)+1, 'df'] = obj
curr_df = eval(parse(text = obj))
years_for_dfs[nrow(years_for_dfs), 'year'] =
ifelse('year' %in% colnames(curr_df),
curr_df$year, NA) # ifelse() returns a single value, so only the first entry of the year column is recorded
years_for_dfs[nrow(years_for_dfs), 'yearColumn'] =
ifelse('year' %in% colnames(curr_df),
'year', NA)
#remove year column
if ('year' %in% colnames(curr_df)){
curr_df = curr_df[,-which(colnames(curr_df) == 'year')]
}
eval(parse(text = paste(obj, ' = curr_df')))
}
}
kable(years_for_dfs)
| df | year | yearColumn |
|---|---|---|
| age | 2018 | year |
| agetotal | 2018 | year |
| combs_dup | 1998 | year |
| entry | 2018 | year |
| gdp | 2018 | year |
| lifeexpect | 2017 | year |
| popclean | 2018 | year |
| population | 2005 | year |
| suicides | 2018 | year |
| wellbeing | 2019 | year |
| work | 2019 | year |
| work_ts | 2018 | year |
remove(obj)
As can be seen in the function call below, we are performing outer joins (all = TRUE)
ONS = Reduce(function(x,y) merge(x = x, y = y,
by = c('areaCode', 'areaName'),
all = TRUE),
list(popclean, gdp, lifeexpect, suicides,
unemployment, wellbeing, work))
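Reduce() folds the merges pairwise, so each table is outer-joined onto the accumulated result in turn; a toy illustration of the folding:

# Reduce(`+`, list(1, 2, 3)) # equivalent to (1 + 2) + 3, i.e. 6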
Finally we can visualise the missingness in the final dataset, though you may need a magnifier!
vis_miss(ONS)+coord_flip()
write.csv(ONS, 'CleanData/ONS.csv')
After this, run:
rmarkdown::render('01_tidyingOriginalData.R',
output_format = c('html_document','pdf_document'))