These are R Markdown pages. They will accompany your R script file each week, and will show you the expected output from each command or procedure. We start each session by loading in the required packages for each week, and setting our working directory.
# Loading Libraries and Converting Data. Note that you only need to install once
# For subsequent weeks, you can run the 'library(readr)' code only.
# Remember, if installing a package for the first time, use install.packages("name")
library(readr)
library(tidyverse)
library(ggplot2)
# Next we set our working directory. You should ensure that this is the same
# for each week of class, and store all your R files and datasets in the same
# folder. Replace the C:/ line below with the address for your folder.
# This must be the folder that holds so859_country_2025.csv, because the import
# below refers to the file by name only.
setwd("C:/Users/eflaherty/iCloudDrive/Teaching/so_859_2025/so859_data_script_2025")
# Want to see what files are in your directory? Called with no argument, dir()
# lists the current working directory.
dir()
## [1] "rsconnect" "so859_country_2025.csv"
## [3] "week2_so859_2025.R" "week2_week3_so859_2025.html"
## [5] "week2_week3_so859_2025.Rmd"
# Let's import some basic country data from a spreadsheet. The file name is
# relative, so it is looked up in the working directory.
so859_country <- read_csv(file = "so859_country_2025.csv")
# At this point, let's inspect our data further by printing the object.
print(so859_country)
## # A tibble: 47 × 28
## cntry cntry_code unemp_nat unemp_ilo gini pop_grow gni_grow gnipc_grow
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia AUS 6.1 6.1 NA 1.5 3 1.5
## 2 Austria AUT 5.7 5.7 30.5 1.1 0.2 -0.9
## 3 Belgium BEL 8.5 8.5 27.7 0.6 0.9 0.3
## 4 Brazil BRA 11.6 8.4 51.3 0.9 -3.7 -4.5
## 5 Bulgaria BGR 9.1 9.1 37.4 -0.6 2.2 2.9
## 6 Canada CAN 6.9 6.9 NA 0.8 1.1 0.3
## 7 Chile CHL 6.7 6.5 47.7 0.8 2.9 2
## 8 China CHN 4.1 4.6 NA 0.5 6.4 5.8
## 9 Croatia HRV 16.2 16.2 31.1 -0.8 3.8 4.7
## 10 Cyprus CYP 14.9 14.9 34 0.8 6.8 7.1
## # ℹ 37 more rows
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## # healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## # service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## # credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## # un_developed <chr>, union <dbl>, murder <dbl>
# To view the data grid in a spreadsheet-style viewer. View() can fail outside
# an interactive session (for example when the script is run in batch mode),
# so we only call it when R is being used interactively.
if (interactive()) {
  View(so859_country)
}
# To preview the first six rows of the dataset:
head(so859_country)
## # A tibble: 6 × 28
## cntry cntry_code unemp_nat unemp_ilo gini pop_grow gni_grow gnipc_grow
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia AUS 6.1 6.1 NA 1.5 3 1.5
## 2 Austria AUT 5.7 5.7 30.5 1.1 0.2 -0.9
## 3 Belgium BEL 8.5 8.5 27.7 0.6 0.9 0.3
## 4 Brazil BRA 11.6 8.4 51.3 0.9 -3.7 -4.5
## 5 Bulgaria BGR 9.1 9.1 37.4 -0.6 2.2 2.9
## 6 Canada CAN 6.9 6.9 NA 0.8 1.1 0.3
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## # healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## # service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## # credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## # un_developed <chr>, union <dbl>, murder <dbl>
# To list the variables in our dataset (the names of its columns)
names(so859_country)
## [1] "cntry" "cntry_code" "unemp_nat" "unemp_ilo" "gini"
## [6] "pop_grow" "gni_grow" "gnipc_grow" "gdppc_grow" "gdp_grow"
## [11] "gni_pc" "healthex_gdp" "healthex_exp" "fdiin_gdp" "tech_exp"
## [16] "service_emp" "trade" "rd_exp" "rd_people" "life"
## [21] "credit" "stocks" "top1" "ls" "eu"
## [26] "un_developed" "union" "murder"
# To check what class of object the dataset is stored as:
class(x = so859_country)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
# Put the $ dollar sign between the dataset name and a variable name when you
# want to inspect or limit a command to that one variable.
class(x = so859_country$eu)
## [1] "character"
# And to view basic summaries of every variable in the dataset:
summary(object = so859_country)
## cntry cntry_code unemp_nat unemp_ilo
## Length:47 Length:47 Min. : 3.4 Min. : 3.300
## Class :character Class :character 1st Qu.: 5.3 1st Qu.: 5.300
## Mode :character Mode :character Median : 6.8 Median : 6.800
## Mean : 8.5 Mean : 8.321
## 3rd Qu.: 9.9 3rd Qu.: 9.400
## Max. :25.2 Max. :25.200
## NA's :1
## gini pop_grow gni_grow gnipc_grow
## Min. :25.40 Min. :-0.9000 Min. :-3.700 Min. :-4.500
## 1st Qu.:29.20 1st Qu.: 0.0000 1st Qu.: 1.250 1st Qu.: 0.650
## Median :32.70 Median : 0.7000 Median : 2.800 Median : 2.000
## Mean :34.39 Mean : 0.5894 Mean : 3.011 Mean : 2.423
## 3rd Qu.:36.20 3rd Qu.: 1.1000 3rd Qu.: 3.850 3rd Qu.: 3.650
## Max. :63.00 Max. : 2.4000 Max. :16.500 Max. :15.400
## NA's :10
## gdppc_grow gdp_grow gni_pc healthex_gdp
## Min. :-4.400 Min. :-3.500 Min. : 1739 Min. :1.000
## 1st Qu.: 0.900 1st Qu.: 1.500 1st Qu.:14079 1st Qu.:4.200
## Median : 2.100 Median : 2.400 Median :24939 Median :5.700
## Mean : 2.589 Mean : 3.179 Mean :31198 Mean :5.787
## 3rd Qu.: 3.600 3rd Qu.: 3.850 3rd Qu.:46907 3rd Qu.:7.800
## Max. :24.400 Max. :25.600 Max. :94382 Max. :9.400
## NA's :2
## healthex_exp fdiin_gdp tech_exp service_emp
## Min. : 3.4 Min. :-5.300 Min. : 2.160 Min. :31.74
## 1st Qu.:10.7 1st Qu.: 0.650 1st Qu.: 7.583 1st Qu.:64.42
## Median :12.9 Median : 2.100 Median :13.353 Median :70.46
## Mean :13.9 Mean : 6.089 Mean :13.760 Mean :69.08
## 3rd Qu.:16.6 3rd Qu.: 4.000 3rd Qu.:17.887 3rd Qu.:77.88
## Max. :25.2 Max. :81.000 Max. :30.197 Max. :86.43
## NA's :2
## trade rd_exp rd_people life
## Min. : 26.95 Min. :0.3829 Min. : 216.2 Min. :61.98
## 1st Qu.: 57.94 1st Qu.:0.9757 1st Qu.:2016.4 1st Qu.:76.33
## Median : 82.66 Median :1.3529 Median :3651.6 Median :80.78
## Mean :101.13 Mean :1.7311 Mean :3549.8 Mean :78.75
## 3rd Qu.:128.44 3rd Qu.:2.2527 3rd Qu.:4739.1 3rd Qu.:82.04
## Max. :410.17 Max. :4.2690 Max. :7457.8 Max. :83.79
## NA's :1 NA's :5
## credit stocks top1 ls
## Min. : 29.90 Min. : 0.1665 Min. : 6.30 Min. :35.56
## 1st Qu.: 54.09 1st Qu.: 6.4409 1st Qu.: 8.80 1st Qu.:48.39
## Median : 88.53 Median : 20.2944 Median :11.20 Median :52.88
## Mean : 95.74 Mean : 38.6952 Mean :12.58 Mean :52.40
## 3rd Qu.:131.71 3rd Qu.: 66.0871 3rd Qu.:13.82 3rd Qu.:56.39
## Max. :246.09 Max. :223.6490 Max. :23.70 Max. :65.39
## NA's :1 NA's :9 NA's :17 NA's :8
## eu un_developed union murder
## Length:47 Length:47 Min. : 4.5 Min. : 0.1920
## Class :character Class :character 1st Qu.:12.7 1st Qu.: 0.5605
## Mode :character Mode :character Median :19.6 Median : 0.7320
## Mean :27.2 Mean : 1.3938
## 3rd Qu.:32.8 3rd Qu.: 1.1500
## Max. :91.6 Max. :11.0000
## NA's :2 NA's :8
# One thing we will do early on is convert our data to a tibble. This won't
# matter much for the immediate weeks ahead, but will make things easier later
# on. as_tibble() returns a new object rather than changing the data in place,
# so we assign the result back to keep the conversion, then print it.
so859_country <- as_tibble(so859_country)
so859_country
## # A tibble: 47 × 28
## cntry cntry_code unemp_nat unemp_ilo gini pop_grow gni_grow gnipc_grow
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia AUS 6.1 6.1 NA 1.5 3 1.5
## 2 Austria AUT 5.7 5.7 30.5 1.1 0.2 -0.9
## 3 Belgium BEL 8.5 8.5 27.7 0.6 0.9 0.3
## 4 Brazil BRA 11.6 8.4 51.3 0.9 -3.7 -4.5
## 5 Bulgaria BGR 9.1 9.1 37.4 -0.6 2.2 2.9
## 6 Canada CAN 6.9 6.9 NA 0.8 1.1 0.3
## 7 Chile CHL 6.7 6.5 47.7 0.8 2.9 2
## 8 China CHN 4.1 4.6 NA 0.5 6.4 5.8
## 9 Croatia HRV 16.2 16.2 31.1 -0.8 3.8 4.7
## 10 Cyprus CYP 14.9 14.9 34 0.8 6.8 7.1
## # ℹ 37 more rows
## # ℹ 20 more variables: gdppc_grow <dbl>, gdp_grow <dbl>, gni_pc <dbl>,
## # healthex_gdp <dbl>, healthex_exp <dbl>, fdiin_gdp <dbl>, tech_exp <dbl>,
## # service_emp <dbl>, trade <dbl>, rd_exp <dbl>, rd_people <dbl>, life <dbl>,
## # credit <dbl>, stocks <dbl>, top1 <dbl>, ls <dbl>, eu <chr>,
## # un_developed <chr>, union <dbl>, murder <dbl>
# Let's look at our first practical command. summarize() (also spelled
# summarise()) collapses the dataset into a single row holding the mean of
# each inequality measure; na.rm = TRUE drops missing values before averaging.
summarize(
  so859_country,
  gini = mean(gini, na.rm = TRUE),
  top1 = mean(top1, na.rm = TRUE)
)
## # A tibble: 1 × 2
## gini top1
## <dbl> <dbl>
## 1 34.4 12.6
# This is fine, but what if we want to see if inequality differs between EU and
# non-EU countries? We can do this by using group_by()
# First we store the eu variable as a factor, so that R treats each distinct
# value in it as a category. Categories can be ranked or unranked.
so859_country[["eu"]] <- as.factor(so859_country[["eu"]])
# Then, we can declare our groups, and re-run the summarize procedure. across()
# applies the same mean calculation to both inequality columns at once.
so859_country %>%
  group_by(eu) %>%
  summarize(
    across(c(gini, top1), function(x) mean(x, na.rm = TRUE))
  )
## # A tibble: 2 × 3
## eu gini top1
## <fct> <dbl> <dbl>
## 1 EU 31.8 9.89
## 2 Non-EU 41.5 14.9
# A very common and useful data visualisation package is ggplot2. Here, we will
# work with it to produce a customised graph.
# If you ever need help, look up the ggplot2 help topic:
help("ggplot2")
# Let's answer a question with a visual. What is the distribution
# of inequality across our sample of countries? We put gini on the x axis and
# the computed density on the y axis, then draw a histogram with the defaults.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram()
# That looks a bit off... let's modify it to make it more legible: black bar
# outlines, a gray fill, 60 bins and a plainer theme.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram(colour = "#000000", fill = "darkgray", bins = 60) +
  theme_classic()
# What happens if we change the number of bins? Here we use 30 instead of 60.
ggplot(
  data = so859_country,
  mapping = aes(x = gini, y = after_stat(density))
) +
  geom_histogram(colour = "#000000", fill = "darkgray", bins = 30) +
  theme_classic()
# Let's produce one with full annotation: axis labels, a title, a caption and
# a vertical line marking the mean Gini value.
# gini contains missing values, so the mean needs na.rm = TRUE; without it the
# mean is NA and no line is drawn. The thickness of the line is set with
# linewidth, which replaced size for lines in ggplot2 3.4.0.
ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "#000000", fill = "darkgray", bins = 30) +
  theme_classic() +
  geom_vline(
    aes(xintercept = mean(gini, na.rm = TRUE)),
    color = "#000000", linewidth = 0.75
  ) +
  labs(
    x = "Gini Income Inequality", y = "Density",
    title = "Global Inequality 2018-2021",
    caption = "Source: World Bank Databank."
  )
# Finally, a quick way to split our graph by subgroups is to use facet_grid.
# facet_grid(. ~ eu) draws one panel per level of eu, side by side.
# gini contains missing values, so the mean needs na.rm = TRUE; without it the
# mean is NA and no line is drawn. The thickness of the line is set with
# linewidth, which replaced size for lines in ggplot2 3.4.0.
# NOTE(review): the mean appears to be computed over the whole dataset, so both
# panels would show the overall mean rather than each group's own mean -
# confirm this is what is intended.
ggplot(so859_country, aes(gini, after_stat(density))) +
  geom_histogram(col = "#000000", fill = "darkgray", bins = 30) +
  theme_classic() +
  geom_vline(
    aes(xintercept = mean(gini, na.rm = TRUE)),
    color = "#000000", linewidth = 0.75
  ) +
  labs(
    x = "Gini Income Inequality", y = "Density",
    title = "Global Inequality 2018-2021",
    caption = "Source: World Bank Databank."
  ) +
  facet_grid(. ~ eu)