Introduction

This week, we will look at how to get data from public data sources into a format readable by R. Much of this class will focus on how to work with the Eurostat, World Bank, and OECD data portals. The first part of this file revises the procedures for producing summary statistics, and the graphs that we looked at last week.

# Loading Libraries and Converting Data. Note that you only need to install once
# For subsequent weeks, you can run the 'library(readr)' code only.
# Remember, if installing a package for the firs time, use install.packages("name"). 
library(readr)
library(tidyverse)
library(ggplot2)

# We also need to install the new packages we will use this week.
library(haven)
library(Hmisc)
library(pastecs)
library(vtable)
library(lubridate)

# Next we set our working directory. You should ensure that this is the same
# for each week of class, and store all your R files and datasets in the same
# folder. Replace the C:/ line below with the address for your folder. 
setwd("C:/Users/eflaherty/iCloudDrive/Teaching/so_859_2025/so859_data_script_2025")
# Want to see what files are in your directory?
dir("C:/Users/eflaherty/iCloudDrive/Teaching/so_859_2025/so859_data_script_2025")
##  [1] "crim_hom_soff_spreadsheet.xlsx"                     
##  [2] "crim_off_cat__custom_15547764_page_spreadsheet.xlsx"
##  [3] "ESS11_codebook_csv.html"                            
##  [4] "ESS11_codebook_dta.html"                            
##  [5] "ess11_ireland.csv"                                  
##  [6] "ess11_ireland.dta"                                  
##  [7] "eurostat_sa_rate_male.xlsx"                         
##  [8] "eurostat_sa_rate_tot.csv"                           
##  [9] "eurostat_sa_rate_tot.xlsx"                          
## [10] "rsconnect"                                          
## [11] "selection_code_so859.R"                             
## [12] "so859_country_2025.csv"                             
## [13] "week2_so859_2025.R"                                 
## [14] "week2_week3_so859_2025.html"                        
## [15] "week2_week3_so859_2025.R"                           
## [16] "week2_week3_so859_2025.Rmd"                         
## [17] "week4_so859_2025.R"                                 
## [18] "week4_so859_2025.Rmd"
# Finally, assign our first dataset to the object so859_country
so859_country <- read_csv("so859_country_2025.csv")

Basic Summary Statistics Revised

# Let's continue from last week and look at some easier ways to call summary
# statistics. First, let's review the standard method, then add some additional
# columns.

# Remember we can take a basic line like this...
summarise(so859_country, life=mean(life, na.rm = TRUE))
## # A tibble: 1 × 1
##    life
##   <dbl>
## 1  78.8
# ...and add more detail using this. What additional statistics have we asked for?
so859_country %>% 
  group_by(eu) %>% 
  summarise(
    gini_mean = mean(gini, na.rm = TRUE),
    top1_mean = mean(top1, na.rm = TRUE),
    gini_med  = median(gini, na.rm = TRUE),
    top1_med  = median(top1, na.rm = TRUE),
    gini_sd   =   sd(gini, na.rm = TRUE),
    top1_sd   =   sd(top1, na.rm = TRUE)
  )
## # A tibble: 2 × 7
##   eu     gini_mean top1_mean gini_med top1_med gini_sd top1_sd
##   <chr>      <dbl>     <dbl>    <dbl>    <dbl>   <dbl>   <dbl>
## 1 EU          31.8      9.89     31.8      9.5    3.65    2.44
## 2 Non-EU      41.5     14.9      42.2     13.1   11.0     5.96
# Here is another selection we might want to make in future using a process
# called subsetting. We will create an object with our original variable set,
# but only for EU countries. We will create another for Non-EU so you can see
# how it works. 
eu <- so859_country[ which(so859_country$eu=='EU'), ]
NonEU <- so859_country[ which(so859_country$eu=='Non-EU'), ]

# Now we call call specific summary statistics for either group.
describe(eu$life, exclude.missing = TRUE)
## eu$life 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##       28        0       28        1    79.56    79.66    3.084    74.53 
##      .10      .25      .50      .75      .90      .95 
##    74.89    77.41    80.87    81.49    82.28    82.46 
## 
## lowest : 74.322  74.4805 74.6146 75.0122 75.5683
## highest: 82.2049 82.2732 82.2927 82.5439 82.8317
describe(NonEU$life, exclude.missing = TRUE)
## NonEU$life 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##       19        0       19        1    77.57    78.77    6.632    67.67 
##      .10      .25      .50      .75      .90      .95 
##    68.88    75.39    79.31    82.22    82.55    82.99 
## 
## 61.981 (1, 0.053), 68.302 (1, 0.053), 69.025 (1, 0.053), 71.1683 (1, 0.053),
## 75.284 (1, 0.053), 75.498 (1, 0.053), 76.092 (1, 0.053), 76.933 (1, 0.053),
## 78.6902 (1, 0.053), 79.315 (1, 0.053), 81.4568 (1, 0.053), 82.0244 (1, 0.053),
## 82.0512 (1, 0.053), 82.129 (1, 0.053), 82.3049 (1, 0.053), 82.4 (1, 0.053),
## 82.4683 (1, 0.053), 82.8976 (1, 0.053), 83.7939 (1, 0.053)
## 
## For the frequency table, variable is rounded to the nearest 0
describe(eu$gini, exclude.missing = TRUE)
## eu$gini 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##       27        1       24    0.999    31.76     31.8     4.28    26.08 
##      .10      .25      .50      .75      .90      .95 
##    26.86    28.70    31.80    34.80    36.08    37.04 
## 
## lowest : 25.4 25.9 26.5 27.1 27.7, highest: 35.5 35.9 36   36.2 37.4
describe(NonEU$gini, exclude.missing = TRUE)
## NonEU$gini 
##        n  missing distinct     Info     Mean  pMedian      Gmd      .05 
##       10        9       10        1    41.51     41.5    12.88    27.64 
##      .10      .25      .50      .75      .90      .95 
##    27.77    33.65    42.20    46.63    52.47    57.73 
##                                                             
## Value      27.5 27.8 32.3 37.7 41.5 42.9 43.4 47.7 51.3 63.0
## Frequency     1    1    1    1    1    1    1    1    1    1
## Proportion  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1
## 
## For the frequency table, variable is rounded to the nearest 0
# The easiest for our purposes is sumtable, from the vtable package

sumtable(so859_country, vars = c('gini','top1', 'murder', 'union', 'ls'), 
        group='eu', digits = 3, add.median = TRUE, title = 'Country Data Summary 
        Statistics (by EU/Non-EU, 2021)')
Country Data Summary Statistics (by EU/Non-EU, 2021)
eu
EU
Non-EU
Variable N Mean SD Median N Mean SD Median
gini 27 31.8 3.65 31.8 10 41.5 11 42.2
top1 14 9.89 2.44 9.5 16 14.9 5.96 13.1
murder 28 1.04 0.944 0.707 11 2.3 3.4 1.11
union 28 26.9 19.1 19.7 17 27.8 23.3 17.9
ls 28 51.8 5.66 52.1 11 53.9 7.51 56.3

Data Visualisation: Histograms and Boxplots

# Let's revise our histogram command, and introduce another visulisation for
# ratio data (the boxplot). 

# Here is our 'full' histogram from last week.

ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "black", fill = "darkgray", bins = 30) +
  theme_classic() +  
  labs(x = "Gini Income Inequality", y = "Density", subtitle = 
         "by development level",
       title = "Global Inequality 2018-2021",
       caption = "Source: World Bank Databank.")

# And our version split across level of development:
ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "black", fill = "darkgray", bins = 30) +
  theme_classic() +  
  labs(x = "Gini Income Inequality", y = "Density", subtitle = 
         "by development level",
       title = "Global Inequality 2018-2021",
       caption = "Source: World Bank Databank.") +
  facet_grid(.~un_developed)

# How would you generate summary statistics for inequality (gini or top1)
# by level of development (un_developed)?

sumtable(so859_country, vars = c('gini','top1', 'ls'), 
        group='un_developed', digits = 4, add.median = TRUE, 
        title = 'Country Data Summary 
        Statistics (by Developed/Developing, 2021)')
Country Data Summary Statistics (by Developed/Developing, 2021)
un_developed
developed
developing
Variable N Mean SD Median N Mean SD Median
gini 31 31.83 3.981 31.8 6 47.67 8.816 45.55
top1 21 10.07 2.327 9.6 9 18.43 5.594 20.2
ls 36 52.78 5.689 53.71 3 47.84 11.48 45.63
# Refer to the lecture slides for interpretation of a boxplot. Here is the basic
# boxplot command for one variable (gini).

ggplot(so859_country$data, aes(y=so859_country$top1))+
       geom_boxplot()

# Notice how the structure of the command is almost identical to the one that
# produces a histogram? 

ggplot(so859_country, aes(gini)) +
  geom_histogram()

# The only difference is we add more 'things' to the code to make it read 
# better, or add detail. Let's use the same grammar (code) to add more detail
# to our boxplot:

ggplot(so859_country$data, aes(y=so859_country$top1))+
  geom_boxplot() +
       ylab("% total income to 1%'") +
       xlab("Income Share of Top 1%")+
       labs(title = "Top 1% Income Share (2018-2022)", 
            subtitle = "Data from World Bank")

# It's not a very informative graph, so let's modify it a bit by adding an
# x-variable.

ggplot(so859_country$data, aes(y=so859_country$top1, x=so859_country$un_developed))+
  geom_boxplot() +
  ylab("% total income to 1%'") +
  xlab("Income Share of Top 1%")+
  labs(title = "Top 1% Income Share (2018-2022)", 
       subtitle = "Data from World Bank")

# How would we use this to draw a boxplot of EU vs Non-EU unemployment?

Importing data from public sources

# Once you have your spreadsheet set up and saved in .csv format (completed in
# class, but also available from the module Moodle page) you can import
# it using the same function we used above. Create an object with a suitable name
# and assign the spreadsheet to that object.

euro_crim <- read_csv("eurostat_sa_rate_tot.csv")

# We can look at detailed information about the dataset using the 'str' command:
str(euro_crim)
## spc_tbl_ [15 × 42] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ time                  : num [1:15] 2008 2009 2010 2011 2012 ...
##  $ Belgium               : num [1:15] NA NA NA NA NA NA NA 4.41 4.14 3.86 ...
##  $ Bulgaria              : num [1:15] 2.15 2.26 1.54 1.75 1.82 1.8 1.41 1.08 1.44 1.14 ...
##  $ Czechia               : num [1:15] 3.55 3.45 3.52 2.97 0.07 0.14 0.16 0.09 0.13 0.21 ...
##  $ Denmark               : num [1:15] 3.32 3.36 4.37 3.67 3.66 3.82 4.9 5.39 5.29 5.51 ...
##  $ Germany               : num [1:15] 9.79 8.78 8.26 7.63 7.93 7.6 7.73 7.57 7.17 3.32 ...
##  $ Estonia               : num [1:15] NA NA 2.33 2.18 1.66 2.58 0.76 2.59 0.84 2.66 ...
##  $ Ireland               : num [1:15] NA 2.65 3.12 2.28 1.57 NA NA NA NA NA ...
##  $ Greece                : logi [1:15] NA NA NA NA NA NA ...
##  $ Spain                 : num [1:15] NA NA NA NA NA NA 1.05 0.98 0.9 0.89 ...
##  $ France                : num [1:15] 9.49 9.37 8.49 8.2 90.1 8.18 7.62 7.77 7.83 8.03 ...
##  $ Croatia               : num [1:15] 5.43 5.68 5.37 5.69 5.5 3.99 4.52 4.52 0.72 0.77 ...
##  $ Italy                 : logi [1:15] NA NA NA NA NA NA ...
##  $ Cyprus                : num [1:15] 3.22 4.02 1.71 3.45 17.52 ...
##  $ Latvia                : num [1:15] 1.23 1.34 1.13 1.06 1.61 1.33 1.55 1.96 1.12 2.05 ...
##  $ Lithuania             : num [1:15] 3.14 3.2 4.07 5.93 5.29 3.63 4.99 5.61 4.6 4.88 ...
##  $ Luxembourg            : num [1:15] NA NA NA NA NA NA NA 6.75 8.85 5.93 ...
##  $ Hungary               : num [1:15] 1.03 1.28 1.2 2.43 2.64 2.13 0.56 0.65 0.8 0.66 ...
##  $ Malta                 : num [1:15] NA NA NA NA NA NA NA NA NA NA ...
##  $ Netherlands           : num [1:15] 7.49 6.46 6.35 5.65 5.4 1.3 1.21 0.96 1.04 1.13 ...
##  $ Austria               : num [1:15] 6.56 6.31 2.48 2.73 2.55 1.3 1.45 1.47 1.53 2.38 ...
##  $ Poland                : num [1:15] 0.74 0.82 0.67 0.58 0.67 0.38 0.36 0.33 0.39 0.38 ...
##  $ Portugal              : num [1:15] 2.57 2.86 3.17 3.12 3.76 3.6 3.05 4.23 3.94 4 ...
##  $ Romania               : num [1:15] 0.89 0.73 0.6 0.88 1.12 1.18 0.02 0.09 0.11 0.17 ...
##  $ Slovenia              : num [1:15] 5.47 4.18 4.3 3.51 3.45 1.07 0.92 0.78 0.39 0.63 ...
##  $ Slovakia              : num [1:15] 0.54 0.72 0.41 0.41 0.43 0.5 0.35 0.33 0.29 0.33 ...
##  $ Finland               : num [1:15] 7.96 7.06 6.86 7.01 6.7 6.23 7.25 6.69 6.49 6.56 ...
##  $ Sweden                : num [1:15] 6 5.99 6.51 6.26 5.55 5.27 4.69 4.49 5.25 5.75 ...
##  $ Iceland               : num [1:15] NA NA NA NA NA ...
##  $ Liechtenstein         : logi [1:15] NA NA NA NA NA NA ...
##  $ Norway                : num [1:15] 5.45 4.96 6.32 5.14 5.7 4.97 4.93 5.05 NA 5.31 ...
##  $ Switzerland           : num [1:15] 9.17 9.05 8.07 6.91 7 7.54 7.29 7.43 6.82 7.46 ...
##  $ England and Wales     : num [1:15] 7.54 7.35 8.43 8.59 8.13 8.01 8.9 9.69 NA NA ...
##  $ Scotland              : num [1:15] 3.51 3.03 3.05 2.86 3.84 4.42 4.42 5.15 4.94 5.58 ...
##  $ Northern Ireland      : num [1:15] 29.8 28.3 33.5 33.8 37.8 ...
##  $ Bosnia and Herzegovina: num [1:15] 0 0.05 0.62 2.03 1.51 NA NA NA NA NA ...
##  $ Montenegro            : num [1:15] 7.96 7.13 1.94 2.74 2.1 4.67 0.8 2.73 1.77 1.61 ...
##  $ North Macedonia       : logi [1:15] NA NA NA NA NA NA ...
##  $ Albania               : num [1:15] NA NA NA NA NA NA 0.86 1.07 1.46 1.04 ...
##  $ Serbia                : num [1:15] 1.45 1.32 0.73 0.95 1 1.16 1.32 0.91 1.13 1.49 ...
##  $ Turkey                : num [1:15] NA 3.74 3.05 3.16 4.02 ...
##  $ Kosovo                : num [1:15] 2.04 2.52 2.04 2.51 NA NA NA NA 3.27 NA ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   time = col_double(),
##   ..   Belgium = col_double(),
##   ..   Bulgaria = col_double(),
##   ..   Czechia = col_double(),
##   ..   Denmark = col_double(),
##   ..   Germany = col_double(),
##   ..   Estonia = col_double(),
##   ..   Ireland = col_double(),
##   ..   Greece = col_logical(),
##   ..   Spain = col_double(),
##   ..   France = col_double(),
##   ..   Croatia = col_double(),
##   ..   Italy = col_logical(),
##   ..   Cyprus = col_double(),
##   ..   Latvia = col_double(),
##   ..   Lithuania = col_double(),
##   ..   Luxembourg = col_double(),
##   ..   Hungary = col_double(),
##   ..   Malta = col_double(),
##   ..   Netherlands = col_double(),
##   ..   Austria = col_double(),
##   ..   Poland = col_double(),
##   ..   Portugal = col_double(),
##   ..   Romania = col_double(),
##   ..   Slovenia = col_double(),
##   ..   Slovakia = col_double(),
##   ..   Finland = col_double(),
##   ..   Sweden = col_double(),
##   ..   Iceland = col_double(),
##   ..   Liechtenstein = col_logical(),
##   ..   Norway = col_double(),
##   ..   Switzerland = col_double(),
##   ..   `England and Wales` = col_double(),
##   ..   Scotland = col_double(),
##   ..   `Northern Ireland` = col_double(),
##   ..   `Bosnia and Herzegovina` = col_double(),
##   ..   Montenegro = col_double(),
##   ..   `North Macedonia` = col_logical(),
##   ..   Albania = col_double(),
##   ..   Serbia = col_double(),
##   ..   Turkey = col_double(),
##   ..   Kosovo = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Let's take a look at a new kind of visualisation for a specific type of data.
# Below is a command to produce a time series plot for the Germany column:

ggplot(euro_crim, aes(x=time, y=Germany)) +
  geom_line() + 
  labs(x = "Male SA Convictions", y = "per 100,000", 
       title = "Germany 2008-2022", subtitle = "Source: Eurostat")

# How would you draw a plot for Denmark?