SO859 Quantitative Methods and Data Visualization (Weeks 2-3)

Introduction
Importing and Exploring Data
Basic Summary Statistics
First Visualizations

Introduction

These are R Markdown pages. They will accompany your R script file each week, and will show you the expected output from each command or procedure. We start each session by loading in the required packages for each week, and setting our working directory.

# Loading Libraries and Converting Data. Note that you only need to install once
# For subsequent weeks, you can run the 'library(readr)' code only.
# Remember, if installing a package for the first time, use install.packages("name")
library(readr)
library(tidyverse)
library(ggplot2)

# Next we set our working directory. You should ensure that this is the same
# for each week of class, and store all your R files and datasets in the same
# folder. Replace the C:/ line below with the address for your folder.
# The folder must be the one that holds so859_country_2025.csv, because the
# import step further down reads that file by name from the working directory.
setwd("C:/Users/eflaherty/iCloudDrive/Teaching/so_859_2025/so859_data_script_2025")
# Want to see what files are in your directory? Calling dir() with no
# arguments lists the contents of the current working directory.
dir()

## [1] "rsconnect"                   "so859_country_2025.csv"     
## [3] "week2_so859_2025.R"          "week2_week3_so859_2025.html"
## [5] "week2_week3_so859_2025.Rmd"

Importing and Exploring Data

# Read the country-level spreadsheet (a CSV file stored in our working
# directory) and keep it in an object called so859_country.
so859_country <- read_csv(file = "so859_country_2025.csv")

# Print the object to take a first look at what we have imported.
print(so859_country)

## # A tibble: 47 × 28
##    cntry     cntry_code unemp_nat unemp_ilo  gini pop_grow gni_grow gnipc_grow
##    <chr>     <chr>          <dbl>     <dbl> <dbl>    <dbl>    <dbl>      <dbl>
##  1 Australia AUS              6.1       6.1  NA        1.5      3          1.5
##  2 Austria   AUT              5.7       5.7  30.5      1.1      0.2       -0.9
##  3 Belgium   BEL              8.5       8.5  27.7      0.6      0.9        0.3
##  4 Brazil    BRA             11.6       8.4  51.3      0.9     -3.7       -4.5
##  5 Bulgaria  BGR              9.1       9.1  37.4     -0.6      2.2        2.9
##  6 Canada    CAN              6.9       6.9  NA        0.8      1.1        0.3
##  7 Chile     CHL              6.7       6.5  47.7      0.8      2.9        2  
##  8 China     CHN              4.1       4.6  NA        0.5      6.4        5.8
##  9 Croatia   HRV             16.2      16.2  31.1     -0.8      3.8        4.7
## 10 Cyprus    CYP             14.9      14.9  34        0.8      6.8        7.1
## # ℹ 37 more rows
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## #   healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## #   service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## #   credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## #   un_developed <chr>, union <dbl>, murder <dbl>

# To view the data grid:
View(so859_country)

# To preview just the first six rows of the dataset:
head(so859_country)

## # A tibble: 6 × 28
##   cntry     cntry_code unemp_nat unemp_ilo  gini pop_grow gni_grow gnipc_grow
##   <chr>     <chr>          <dbl>     <dbl> <dbl>    <dbl>    <dbl>      <dbl>
## 1 Australia AUS              6.1       6.1  NA        1.5      3          1.5
## 2 Austria   AUT              5.7       5.7  30.5      1.1      0.2       -0.9
## 3 Belgium   BEL              8.5       8.5  27.7      0.6      0.9        0.3
## 4 Brazil    BRA             11.6       8.4  51.3      0.9     -3.7       -4.5
## 5 Bulgaria  BGR              9.1       9.1  37.4     -0.6      2.2        2.9
## 6 Canada    CAN              6.9       6.9  NA        0.8      1.1        0.3
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## #   healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## #   service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## #   credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## #   un_developed <chr>, union <dbl>, murder <dbl>

# To list the names of the variables (columns) in our dataset:
colnames(so859_country)

##  [1] "cntry"        "cntry_code"   "unemp_nat"    "unemp_ilo"    "gini"        
##  [6] "pop_grow"     "gni_grow"     "gnipc_grow"   "gdppc_grow"   "gdp_grow"    
## [11] "gni_pc"       "healthex_gdp" "healthex_exp" "fdiin_gdp"    "tech_exp"    
## [16] "service_emp"  "trade"        "rd_exp"       "rd_people"    "life"        
## [21] "credit"       "stocks"       "top1"         "ls"           "eu"          
## [26] "un_developed" "union"        "murder"

# To view the class of object we are dealing with (i.e. what kind of data
# structure R has stored our dataset as):
class(so859_country)

## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

# Use the $ (dollar sign) operator to pick out a single variable from the
# dataset. Here we check the class of the eu variable on its own.
class(so859_country$eu)

## [1] "character"

# And to view basic summaries of every variable in one go (for numeric
# variables: minimum, quartiles, mean, maximum and the number of NA's):
summary(so859_country)

##     cntry            cntry_code          unemp_nat      unemp_ilo     
##  Length:47          Length:47          Min.   : 3.4   Min.   : 3.300  
##  Class :character   Class :character   1st Qu.: 5.3   1st Qu.: 5.300  
##  Mode  :character   Mode  :character   Median : 6.8   Median : 6.800  
##                                        Mean   : 8.5   Mean   : 8.321  
##                                        3rd Qu.: 9.9   3rd Qu.: 9.400  
##                                        Max.   :25.2   Max.   :25.200  
##                                        NA's   :1                      
##       gini          pop_grow          gni_grow        gnipc_grow    
##  Min.   :25.40   Min.   :-0.9000   Min.   :-3.700   Min.   :-4.500  
##  1st Qu.:29.20   1st Qu.: 0.0000   1st Qu.: 1.250   1st Qu.: 0.650  
##  Median :32.70   Median : 0.7000   Median : 2.800   Median : 2.000  
##  Mean   :34.39   Mean   : 0.5894   Mean   : 3.011   Mean   : 2.423  
##  3rd Qu.:36.20   3rd Qu.: 1.1000   3rd Qu.: 3.850   3rd Qu.: 3.650  
##  Max.   :63.00   Max.   : 2.4000   Max.   :16.500   Max.   :15.400  
##  NA's   :10                                                         
##    gdppc_grow        gdp_grow          gni_pc       healthex_gdp  
##  Min.   :-4.400   Min.   :-3.500   Min.   : 1739   Min.   :1.000  
##  1st Qu.: 0.900   1st Qu.: 1.500   1st Qu.:14079   1st Qu.:4.200  
##  Median : 2.100   Median : 2.400   Median :24939   Median :5.700  
##  Mean   : 2.589   Mean   : 3.179   Mean   :31198   Mean   :5.787  
##  3rd Qu.: 3.600   3rd Qu.: 3.850   3rd Qu.:46907   3rd Qu.:7.800  
##  Max.   :24.400   Max.   :25.600   Max.   :94382   Max.   :9.400  
##                                                    NA's   :2      
##   healthex_exp    fdiin_gdp         tech_exp       service_emp   
##  Min.   : 3.4   Min.   :-5.300   Min.   : 2.160   Min.   :31.74  
##  1st Qu.:10.7   1st Qu.: 0.650   1st Qu.: 7.583   1st Qu.:64.42  
##  Median :12.9   Median : 2.100   Median :13.353   Median :70.46  
##  Mean   :13.9   Mean   : 6.089   Mean   :13.760   Mean   :69.08  
##  3rd Qu.:16.6   3rd Qu.: 4.000   3rd Qu.:17.887   3rd Qu.:77.88  
##  Max.   :25.2   Max.   :81.000   Max.   :30.197   Max.   :86.43  
##  NA's   :2                                                       
##      trade            rd_exp         rd_people           life      
##  Min.   : 26.95   Min.   :0.3829   Min.   : 216.2   Min.   :61.98  
##  1st Qu.: 57.94   1st Qu.:0.9757   1st Qu.:2016.4   1st Qu.:76.33  
##  Median : 82.66   Median :1.3529   Median :3651.6   Median :80.78  
##  Mean   :101.13   Mean   :1.7311   Mean   :3549.8   Mean   :78.75  
##  3rd Qu.:128.44   3rd Qu.:2.2527   3rd Qu.:4739.1   3rd Qu.:82.04  
##  Max.   :410.17   Max.   :4.2690   Max.   :7457.8   Max.   :83.79  
##                   NA's   :1        NA's   :5                       
##      credit           stocks              top1             ls       
##  Min.   : 29.90   Min.   :  0.1665   Min.   : 6.30   Min.   :35.56  
##  1st Qu.: 54.09   1st Qu.:  6.4409   1st Qu.: 8.80   1st Qu.:48.39  
##  Median : 88.53   Median : 20.2944   Median :11.20   Median :52.88  
##  Mean   : 95.74   Mean   : 38.6952   Mean   :12.58   Mean   :52.40  
##  3rd Qu.:131.71   3rd Qu.: 66.0871   3rd Qu.:13.82   3rd Qu.:56.39  
##  Max.   :246.09   Max.   :223.6490   Max.   :23.70   Max.   :65.39  
##  NA's   :1        NA's   :9          NA's   :17      NA's   :8      
##       eu            un_developed           union          murder       
##  Length:47          Length:47          Min.   : 4.5   Min.   : 0.1920  
##  Class :character   Class :character   1st Qu.:12.7   1st Qu.: 0.5605  
##  Mode  :character   Mode  :character   Median :19.6   Median : 0.7320  
##                                        Mean   :27.2   Mean   : 1.3938  
##                                        3rd Qu.:32.8   3rd Qu.: 1.1500  
##                                        Max.   :91.6   Max.   :11.0000  
##                                        NA's   :2      NA's   :8

# One thing we will do early on is convert our data to a tibble. This won't
# matter much for the immediate weeks ahead, but will make things easier later
# on.
# Note that as_tibble() returns a new object rather than changing the original,
# so we assign the result back to so859_country to keep the conversion.
so859_country <- as_tibble(so859_country)

# Print the converted dataset to confirm that it is a tibble.
so859_country

## # A tibble: 47 × 28
##    cntry     cntry_code unemp_nat unemp_ilo  gini pop_grow gni_grow gnipc_grow
##    <chr>     <chr>          <dbl>     <dbl> <dbl>    <dbl>    <dbl>      <dbl>
##  1 Australia AUS              6.1       6.1  NA        1.5      3          1.5
##  2 Austria   AUT              5.7       5.7  30.5      1.1      0.2       -0.9
##  3 Belgium   BEL              8.5       8.5  27.7      0.6      0.9        0.3
##  4 Brazil    BRA             11.6       8.4  51.3      0.9     -3.7       -4.5
##  5 Bulgaria  BGR              9.1       9.1  37.4     -0.6      2.2        2.9
##  6 Canada    CAN              6.9       6.9  NA        0.8      1.1        0.3
##  7 Chile     CHL              6.7       6.5  47.7      0.8      2.9        2  
##  8 China     CHN              4.1       4.6  NA        0.5      6.4        5.8
##  9 Croatia   HRV             16.2      16.2  31.1     -0.8      3.8        4.7
## 10 Cyprus    CYP             14.9      14.9  34        0.8      6.8        7.1
## # ℹ 37 more rows
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## #   healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## #   service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## #   credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## #   un_developed <chr>, union <dbl>, murder <dbl>

Basic Summary Statistics

# Our first practical command: calculate the mean of the gini and top1
# variables across all countries. na.rm = TRUE tells R to leave out missing
# values when working out each mean.
so859_country %>%
  summarise(
    gini = mean(gini, na.rm = TRUE),
    top1 = mean(top1, na.rm = TRUE)
  )

## # A tibble: 1 × 2
##    gini  top1
##   <dbl> <dbl>
## 1  34.4  12.6

# This is fine, but what if we want to see whether inequality differs between
# EU and non-EU countries? For that we can use group_by().

# Step one: store the eu variable as a factor, so that R treats each distinct
# value in it as a category. Factors can be ranked (ordered) or unranked.
so859_country[["eu"]] <- as.factor(so859_country[["eu"]])

# Step two: declare eu as the grouping variable, then repeat the same summary
# so that we get one row of means for each group.
so859_country %>%
  group_by(eu) %>%
  summarise(
    gini = mean(gini, na.rm = TRUE),
    top1 = mean(top1, na.rm = TRUE)
  )

## # A tibble: 2 × 3
##   eu      gini  top1
##   <fct>  <dbl> <dbl>
## 1 EU      31.8  9.89
## 2 Non-EU  41.5 14.9

First Visualizations

# A very common and useful data visualisation package is ggplot2. Here, we will
# work with it to produce a customised graph.
# If you ever need help, look up the package's help page:
help(ggplot2)

# Let's answer a question with a visual: how is inequality (gini) distributed
# across our sample of countries? We begin with a default histogram, with the
# y-axis showing density.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram()

# That looks a bit off....so we add a black outline and grey fill to the bars,
# set the number of bins ourselves, and switch to a cleaner theme.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram(colour = "#000000", fill = "darkgray", bins = 60) +
  theme_classic()

# What happens if we change the number of bins? Here we halve it to 30.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram(colour = "#000000", fill = "darkgray", bins = 30) +
  theme_classic()

# Let's produce one with full annotation: axis labels, a title, a caption and
# a vertical line marking the mean of gini.
# gini contains missing values, so the mean needs na.rm = TRUE; without it
# mean() returns NA and no line is drawn. Line thickness is set with linewidth,
# because size is deprecated for lines from ggplot2 3.4.0 onwards.
ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "#000000", fill = "darkgray", bins = 30) +
  geom_vline(
    aes(xintercept = mean(gini, na.rm = TRUE)),
    color = "#000000", linewidth = 0.75
  ) +
  theme_classic() +
  labs(x = "Gini Income Inequality", y = "Density",
       title = "Global Inequality 2018-2021",
       caption = "Source: World Bank Databank.")

# Finally, a quick way to split our graph by subgroups is to use facet_grid.
# Here facet_grid(. ~ eu) draws one panel per category of the eu variable,
# side by side.
# As before, gini contains missing values, so the mean needs na.rm = TRUE;
# without it mean() returns NA and no line is drawn. Line thickness is set with
# linewidth, because size is deprecated for lines from ggplot2 3.4.0 onwards.
ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "#000000", fill = "darkgray", bins = 30) +
  geom_vline(
    aes(xintercept = mean(gini, na.rm = TRUE)),
    color = "#000000", linewidth = 0.75
  ) +
  theme_classic() +
  labs(x = "Gini Income Inequality", y = "Density",
       title = "Global Inequality 2018-2021",
       caption = "Source: World Bank Databank.") +
  facet_grid(. ~ eu)

SO859 Quantitative Methods and Data Visualization (Weeks 2-3)

Dr Eoin Flaherty, Maynooth University

18-2-2025

Introduction

Importing and Exploring Data

Basic Summary Statistics

First Visualizations