Curated Indicators

In this example, we will curate Eurostat indicators. All indicators are national indicators, we are not doing geographical recoding with regional indicators.

library(indicators)
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
require(eurostat)
#> Loading required package: eurostat
#> Warning: package 'eurostat' was built under R version 4.0.5
require(dplyr)
#> Loading required package: dplyr
#> 
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#> 
#>     filter, lag
#> The following objects are masked from 'package:base':
#> 
#>     intersect, setdiff, setequal, union

Curation Process

Eurostat publishes data folders. We select only the indicators that are direcly relevant to our observatory, in this case, the music observatory.

  #cloud computer use
  isoc_cicci_use <- eurostat::get_eurostat(id = "isoc_cicci_use")
#> Table isoc_cicci_use cached at C:\Users\DANIEL~1\AppData\Local\Temp\RtmpMVfKKX/eurostat/isoc_cicci_use_date_code_FF.rds
  cloud_personal_raw <- eurostat::get_eurostat(id = "isoc_cicci_use") %>%
    filter ( .data$ind_type %in% c("STUD", "Y16_24", "IND_TOTAL"),
             .data$indic_is %in% c("I_CC", "I_CCS_CC", "I_CC_PAY"),
             .data$unit %in% c("PC_IND"))
#> Reading cache file C:\Users\DANIEL~1\AppData\Local\Temp\RtmpMVfKKX/eurostat/isoc_cicci_use_date_code_FF.rds
#> Table  isoc_cicci_use  read from cache file:  C:\Users\DANIEL~1\AppData\Local\Temp\RtmpMVfKKX/eurostat/isoc_cicci_use_date_code_FF.rds
  
  # Frequency of practicing of artistic activities by sex, age and educational attainment level[ilc_scp07]
  ilc_scp07 <- eurostat::get_eurostat(id = "ilc_scp07")
#> Table ilc_scp07 cached at C:\Users\DANIEL~1\AppData\Local\Temp\RtmpMVfKKX/eurostat/ilc_scp07_date_code_FF.rds
  artistic_activity_raw <- ilc_scp07 %>%
    filter ( .data$isced11 == "TOTAL",
             .data$age %in% c("Y_GE16", "Y16-24"))
  
  # Final consumption expenditure of households by consumption purpose (COICOP 3 digit)[nama_10_co3_p3]
  nama_10_co3_p3 <- eurostat::get_eurostat(id = "nama_10_co3_p3")
#> Table nama_10_co3_p3 cached at C:\Users\DANIEL~1\AppData\Local\Temp\RtmpMVfKKX/eurostat/nama_10_co3_p3_date_code_FF.rds

  household_consumption_raw <- nama_10_co3_p3  %>%
    filter (.data$unit == "PC_TOT",
            .data$coicop %in% c("CP09", "CP091"))

The preselection is important, because the imputation code contains many inefficiencies, and many unit tests. It is resource-intensive.

Tidy Indicators

The eurostat warehouse folder contains data in various formats. They are tidy (all observations are in rows, and all variables are in columns) but the columns vary product to product. To place it into our observatory database, we create a canonical form of the indicator with get_eurostat_indicator().

A similar function must be written to all major open data sources.

The id, for example, id = "isoc_cicci_use" is used to identify the relevant Eurostat dictionary to describe and label the data. If the preselected_indicators = NULL then it will try to download the id = "isoc_cicci_use" product from Eurostat. Because some Eurostat folders are very huge, it is unlikely that all functions will work on entire folders. So some curation is necessary, which we did above.

cloud_indicators <- get_eurostat_indicator(
  preselected_indicators = cloud_personal_raw,
  id = "isoc_cicci_use")

artistic_activity_indicators <- get_eurostat_indicator ( artistic_activity_raw, id = "ilc_scp07")

household_consumption_indicators<- get_eurostat_indicator (  household_consumption_raw, id = "nama_10_co3_p3")

The function returns a list with three tables.

Indicator values

  artistic_activity_indicators$indicator %>% 
  head() %>%
  select ( .data$geo, .data$time, .data$value, .data$estimate, .data$description_indicator )
#> # A tibble: 6 x 5
#>   geo     time       value estimate description_indicator                       
#>   <chr>   <date>     <dbl> <chr>    <chr>                                       
#> 1 EU27_2~ 2015-01-01   5.5 actual   Frequency of practicing of artistic activit~
#> 2 EU28    2015-01-01   6   actual   Frequency of practicing of artistic activit~
#> 3 EU27_2~ 2015-01-01   6   actual   Frequency of practicing of artistic activit~
#> 4 EA19    2015-01-01   5.1 actual   Frequency of practicing of artistic activit~
#> 5 EA18    2015-01-01   5   actual   Frequency of practicing of artistic activit~
#> 6 BE      2015-01-01   2.7 actual   Frequency of practicing of artistic activit~

Indicator labelling

The following labels were found in the curated data (we do not reproduce the entire Eurostat dictionary, only the items that we actually use.)

  artistic_activity_indicators$labelling 
#> # A tibble: 145 x 3
#>    var_label             var_code var_name
#>    <chr>                 <chr>    <chr>   
#>  1 [percentage]          PC       unit    
#>  2 Once a month          1M       frequenc
#>  3 All ISCED 2011 levels TOTAL    isced11 
#>  4 From 16 to 24 years   Y16-24   age     
#>  5 Females               F        sex     
#>  6 Once a month          1M       frequenc
#>  7 All ISCED 2011 levels TOTAL    isced11 
#>  8 From 16 to 24 years   Y16-24   age     
#>  9 Males                 M        sex     
#> 10 Once a month          1M       frequenc
#> # ... with 135 more rows

Indicator metadata

The metadata table contains many important information about the indicator. Some of this data will be later used to identify which indicators need to be re-freshed from source, and re-processed.

  artistic_activity_indicators$metadata %>%
      filter (.data$indicator_code == unique(.data$indicator_code[1])) %>%
      select ( all_of(c("indicator_code", "actual", "missing", "data_start", "last_update_data_source")))
#> # A tibble: 1 x 5
#>   indicator_code                  actual missing data_start last_update_data_so~
#>   <chr>                            <int>   <dbl> <chr>      <date>              
#> 1 eurostat_ilc_scp_07_1_m_total_~     37       0 2015       2019-03-20

Imputation

Currently we have some imputation functions that we apply on our example indicators.

This is a very resource intensive step, may take long.

  1. Approximate missing values withing time series with na_approx();
  2. Next observation carry forward for old missing values na_nocb();
  3. Forecast the time series ahead indicator_forecast();
  4. If the forecast did not work, try last observation carry forward na_locf().

The backcasting is not yet implemented.

# We will estimate the missing values with variuos imputation methods.
indicators_to_impute <- cloud_indicators$indicator %>%
    bind_rows ( artistic_activity_indicators$indicator ) %>%
    bind_rows ( household_consumption_indicators$indicator)

# We will updated the estimation columns after imputation.
metadata_to_update <- cloud_indicators$metadata %>%
    bind_rows ( artistic_activity_indicators$metadata )  %>%
    bind_rows ( household_consumption_indicators$metadta)

# We need the labels, too, but we won't do anything with them.
labelling_bind <- cloud_indicators$labelling %>%
    bind_rows ( artistic_activity_indicators$labelling )  %>%
    bind_rows ( household_consumption_indicators$labelling )

And then carry out the imputation. In case you have many indicators, maybe it is safer to do them subgroup by subgroup. Later, when we will anyway re-process what is changed at the source, this may not be a problem.

imp <- impute_indicators ( indic = indicators_to_impute )

Update the metadata

This is not yet unit-tested, and does not seem to work properly. Basically we count the number of approximated, next observation carry forward, forecasted, and last observation carry forward estimates.

updated_metadata <- update_metadata(imp, metadata = metadata_to_update )

They should show up here, but it suspicious that some are gone missing:

set.seed(2021)
updated_metadata %>%
  sample_n (12) %>%
  select ( all_of ( c("indicator_code", "actual", "missing", "nocb", "locf", "approximate", "forecast")))
#> # A tibble: 12 x 7
#>    indicator_code                actual missing  nocb  locf approximate forecast
#>    <chr>                          <int>   <dbl> <dbl> <dbl>       <dbl>    <dbl>
#>  1 eurostat_isoc_cicci_use_i_cc~     36       1     0     0           0        0
#>  2 eurostat_ilc_scp_07_sev_m_to~     37       0     0     0           0        0
#>  3 eurostat_ilc_scp_07_sev_m_to~     37       0     0     0           0        0
#>  4 eurostat_ilc_scp_07_1_m_tota~     37       0     0     0           0        0
#>  5 eurostat_isoc_cicci_use_i_cc~     37       0     0     0           0        0
#>  6 eurostat_ilc_scp_07_week_tot~     37       0     0     0           0        0
#>  7 eurostat_ilc_scp_07_week_tot~     37       0     0     0           0        0
#>  8 eurostat_isoc_cicci_use_i_cc~     36       1     0     0           0        0
#>  9 eurostat_ilc_scp_07_ge_1_y_t~     37       0     0     0           0        0
#> 10 eurostat_ilc_scp_07_week_tot~     37       0     0     0           0        0
#> 11 eurostat_ilc_scp_07_day_tota~     37       0     0     0           0        0
#> 12 eurostat_isoc_cicci_use_i_cc~    266      35     0     0           0        0

Map to Observatory

At last, we create keywords. The keywords help us placing the indicators and their metadata in the long-form documentation. At least four keywords must be used. The first keyword, “music”, identifies the music observatory, the “economy” the Music economy pillar, “Demand” is the first top-level division in the pillar, and “PCR”is the second. Any further keywords, if they exist, are added as a concatenated list to the keyword table, divided by __.

keywords <- add_keywords (artistic_activity_indicators$metadata, list( "music", "economy", "supply", "potential_supply")) %>%
  bind_rows ( add_keywords (cloud_indicators$metadata, list( "music", "economy", "demand", "pcr")) ) %>%
  bind_rows ( add_keywords (household_consumption_indicators$metadata, list( "music", "economy", "demand", "general")) ) %>%
  select ( all_of(c("indicator_code", "keyword_1", "keyword_2", "keyword_3", "keyword_4", "further_keywords")))

set.seed(2021) #fixed pseudo-random selection
keywords %>%
  sample_n(12)
#> # A tibble: 12 x 6
#>    indicator_code      keyword_1 keyword_2 keyword_3 keyword_4  further_keywords
#>    <chr>               <chr>     <chr>     <chr>     <chr>      <chr>           
#>  1 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>            
#>  2 eurostat_isoc_cicc~ music     economy   demand    pcr        <NA>            
#>  3 eurostat_isoc_cicc~ music     economy   demand    pcr        <NA>            
#>  4 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>            
#>  5 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>            
#>  6 eurostat_nama_10_c~ music     economy   demand    general    <NA>            
#>  7 eurostat_isoc_cicc~ music     economy   demand    pcr        <NA>            
#>  8 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>            
#>  9 eurostat_isoc_cicc~ music     economy   demand    pcr        <NA>            
#> 10 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>            
#> 11 eurostat_isoc_cicc~ music     economy   demand    pcr        <NA>            
#> 12 eurostat_ilc_scp_0~ music     economy   supply    potential~ <NA>
not_included_path <- ifelse ( dir.exists("data-raw"), 
                              yes = file.path("data-raw", "dmo.db"), 
                              no = file.path("..", "data-raw", "dmo.db"))
create_database ( 
  indicator_tables = imp,
  metadata_tables = updated_metadata,
  labelling_table = labelling_bind,
  keywords_table = keywords,
  db_path = not_included_path
  )