Session2

library(tidyverse) 
library(lubridate)
library(readxl)
library(esquisse)
library(janitor)
library(ggthemes)
library(ggrepel)
library(scales)
library(naniar)
library(viridis)
library(vtable)
library(gplots)
#install.packages("naniar")
emissions <- read.csv("emissions_dataset.csv")

# stop R from defaulting to exponents for large values
options(scipen = 1000000)

Missing Values

When dealing with large data sets it can be helpful to get an idea of how many values are absent. There are many ways to do this in R, but one way I like is utilizing the below code from the “naniar” package.

install the package and load the library:

install.package(“naniar”)

library(naniar)

gg_miss_var(emissions, show_pct = TRUE)

Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
use `guide = "none"` instead.

Basic Charting

Creating useful visualizations of our data is both art and science. To start, we’ll go over how to make basic versions of charts and follow with what the standard looks like for charts you will submit. For the homework this week we will accept the bare minimum as you’re still getting used to the tech. Later on we will expect higher quality outputs like appropriate labeling and titles.

This topic is incredibly broad and deep, and there is a massive amount of resources online dedicated to making some truly outrageous visualizations. You’re welcome to pursue this as far as your time and patience allows, but let’s first get the basics down.

This website is a great reference for how to make a large variety of charts and spruce them up. As you do the homework, we recommend referencing this site.

Bar charts

# open emissions dataset we already imported
emissions %>%
  # filter for three countries we'd like to see populations for
  filter(country_name %in% c("Angola","Argentina","Australia")) %>%
  # filter for the year we'd like to view
  filter(year == 2010) %>%
  
  ggplot(aes(x = country_name, y = population)) +
    geom_col()

emissions %>%
  group_by(em_dm) %>%
  na.omit() %>%
  summarize(avg = mean(population)) %>%
  
  ggplot(aes(x = em_dm, y = avg)) +
    geom_col()

###Line charts

emissions %>%
  filter(year == 2010:2019) %>%
  group_by(year) %>%
  summarize(avg = mean(territorial_co2)) %>%  
  
  ggplot(aes(x = year, y = avg)) +
    geom_line()

Scatter plots

emissions %>%
  na.omit() %>%
  
  ggplot(aes(x = gdp_usd_current_prices, y = territorial_co2)) +
    geom_point()

Bubble chart

emissions %>%
  # filter(em_dm == "Emerging Markets") %>%
  filter(year == 2010:2011) %>%
  na.omit() %>%
  
    # size tells it to scale the size of the bubbles based on the size of the population
    # color tells it to provide a color pattern based on the unique variables in the em_dm column
    ggplot(aes(x = gdp_usd_current_prices, y = territorial_co2, size = population, color = em_dm)) +
  
    # alpha tells it to provide a level of transparency for each bubble
    geom_point(alpha = .5)

Histograms

emissions %>%
  filter(em_dm == "Emerging Markets") %>%
  filter(year == 2019) %>%
  filter(territorial_co2 <1000) %>%
  
  ggplot(aes(x = territorial_co2)) +
  
    # binwith tells it how wide to make each column
    # fill tells it what color to make the column 
    # color tells it what color to make the edges of the column
    geom_histogram(binwidth = 10, fill = "#69b3a2", color = "black")

Boxplot

emissions %>%
  filter(year == 2010:2019) %>%
  
  ggplot(aes(x = em_dm, y = govt_expenditure_pct_gdp)) +
    geom_boxplot()

This version shows the distribution across the box plot through the addiont of the geom_jitter command.

emissions %>%
  filter(year == 2010:2019) %>%
  
  ggplot(aes(x = em_dm, y = govt_expenditure_pct_gdp, fill = em_dm)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha=0.6) +
    geom_jitter(color="black", size=0.4, alpha=0.9)

Violin plots

Allow you to compare the distribution of several groups by displaying their densities.

emissions %>%
  filter(year == 2010:2019) %>%
  
  ggplot(aes(x = em_dm, y = govt_expenditure_pct_gdp, fill = em_dm)) +
    geom_violin()

Heatmaps

We can use a heatmap to display correlations between different variables.

emissions2 <- emissions %>%
  na.omit() %>%
  select(-c(1:4)) %>%
  select(c(1:5))

heatmap.2(cor(emissions2), Rowv = FALSE, Colv = FALSE, dendrogram = "none", 
          cellnote = round(cor(emissions2),2), 
          notecol = "black", key = TRUE, trace = 'none', margins = c(10,10), cexRow = .7, cexCol = .7,
          main = "Emissions Comparison")

Making a nice graph

This will be a brief intro to some basic functions you’ll end up using continuously to make decent looking graphs.

I’m going to change the emissions data to long format for this one. See the next section for an explanation of how what this is and how it works.

emissions_long <- emissions %>%
    pivot_longer(cols = gdp_usd_current_prices:cumulative_co2_per_capita, names_to = "variable", values_to = "value")

We’re going to take the below graph and turn it into something a little nicer looking.

emissions_long %>%
  filter(year == 2019) %>%
 filter(country_name %in% c("Angola","Argentina","Australia")) %>%
  filter(variable %in% c("govt_expenditure_pct_gdp", "debt_pct_gdp")) %>%
  drop_na() %>%
  #group_by(em_dm) %>%
  #summarize(avg = mean(value)) %>%
  
  ggplot(aes(x = country_name, y = value, fill = variable)) +
  geom_col(position = "dodge")

emissions_long %>%
  filter(year == 2019) %>%
 filter(country_name %in% c("Angola","Argentina","Australia")) %>%
  filter(variable %in% c("govt_expenditure_pct_gdp", "debt_pct_gdp")) %>%
  drop_na() %>%
  #group_by(em_dm) %>%
  #summarize(avg = mean(value)) %>%
  
  ggplot(aes(x = country_name, y = (value/100), fill = variable)) +
  
  # position = "dodge" will place the two variables we're measuring for each country next to each other. 
  geom_col(position = "dodge") + 
  
  
  # manually fill the columns with your desired colors
  # change the label names to something easier to read
  scale_fill_manual(values = c("steelblue4",
                             "steelblue1"), labels =  c("Debt","Govt Expenditure")) +
                               
  # Alternatively if you don't feel like manually selecting but want a nicer color pattern, the below works nicely as well.
  #scale_fill_viridis(discrete = TRUE, name = "", labels = c("Debt","Govt Expenditure")) +
  
  labs(y = "Percent of GDP",
       title = "Debt vs goverment expenditure levels",
       subtitle = "Percentage of GDP",
       caption = "Source: I don't remember") +
  
  # ggplot and ggthemes have a nice selection of themes to help make the overall graph nicer looking. You can even make it look like the Economist or FT. Play around with it and see what you like best. Note that theme_x needs to come before this next part where we use the theme() function to clean some stuff. 
  theme_clean() +
  
  # drop the X axis as we can clearly see they're countries. 
  # drop legend title
  # move the caption at the bottom to the left side of the chart
  theme(axis.title.x = element_blank(),
        legend.title = element_blank(),
        plot.caption = element_text(hjust=-.1, face="italic")) +
  
  # the first part labels tells the function to convert the values to percentages (hence why I divide the value by 100 up above). Note that percent_format comes from the scales package.
  # breaks and seq() tells it to start at zero and go to 1.10 (again because it's in percentages) and to show in increments of 20%. 
  # expand drops the bar down to the axis line, rather than hovering an inch above it. Not sure why this is used on scale_y_continuous rather than scale_x_continuous. 
  scale_y_continuous(labels = percent_format(accuracy = 1), 
                     breaks = seq(0,1.10, by = .20 ), expand = c(0,0))

Pivot Wider vs Pivot Longer

Transforming a table into long format means condensing columns down into a single variable column and shifting the values under a separate column. This makes the dataframe “longer” by creating many more rows. This is useful for building graphs, particularly comparing variables. Below shows how to pivot a dataframe to long format. Note that when looking up how to do this or looking at other code, you might see the author use “gather.” Gather is an older version of this and is no longer maintained. It is recommended that you use pivot_longer and pivot_wider, both of which come from the tidyverse.

As a general rule, time series and plotting multiple variables is easier in long format, while analyzing data is typically easier in wide format.

First let’s take a look at our current dataset. We see that we have 27 indicators (columns) with various values. Let’s turn this into long form.

head(emissions)

  country_name iso3c            em_dm year gdp_usd_current_prices
1       Angola   AGO Emerging Markets 1990                 12.571
2       Angola   AGO Emerging Markets 1991                 12.186
3       Angola   AGO Emerging Markets 1992                  9.395
4       Angola   AGO Emerging Markets 1993                  6.819
5       Angola   AGO Emerging Markets 1994                  4.965
6       Angola   AGO Emerging Markets 1995                  6.197
  gdp_ppp_current_prices gdp_pc_usd_current_prices gdp_pc_ppp_current_prices
1                 19.681                   986.639                   1544.61
2                 22.805                   928.506                   1737.67
3                 25.984                   695.059                   1922.26
4                 29.518                   489.755                   2120.07
5                 33.317                   346.239                   2323.25
6                 37.541                   419.529                   2541.54
  population govt_expenditure_pct_gdp debt_pct_gdp territorial_co2 trade_co2
1     12.742                       NA           NA           5.090        NA
2     13.124                       NA           NA           5.064        NA
3     13.518                       NA           NA           5.164        NA
4     13.923                       NA           NA           5.748        NA
5     14.341                       NA           NA           3.865        NA
6     14.771                       NA           NA          10.949        NA
  consumption_co2 cumulative_co2 debt_usd govt_expenditure_usd
1              NA        115.663       NA                   NA
2              NA        120.727       NA                   NA
3              NA        125.891       NA                   NA
4              NA        131.640       NA                   NA
5              NA        135.504       NA                   NA
6              NA        146.453       NA                   NA
  territorial_co2_per_capita trade_co2_per_capita consumption_co2_per_capita
1                  0.3994663                   NA                         NA
2                  0.3858580                   NA                         NA
3                  0.3820092                   NA                         NA
4                  0.4128421                   NA                         NA
5                  0.2695070                   NA                         NA
6                  0.7412497                   NA                         NA
  territorial_co2_per_gdp trade_co2_per_gdp consumption_co2_per_gdp
1               0.2586251                NA                      NA
2               0.2220566                NA                      NA
3               0.1987377                NA                      NA
4               0.1947286                NA                      NA
5               0.1160068                NA                      NA
6               0.2916545                NA                      NA
  territorial_co2_per_debt trade_co2_per_debt consumption_co2_per_debt
1                       NA                 NA                       NA
2                       NA                 NA                       NA
3                       NA                 NA                       NA
4                       NA                 NA                       NA
5                       NA                 NA                       NA
6                       NA                 NA                       NA
  territorial_co2_per_govt_expenditure trade_co2_per_govt_expenditure
1                                   NA                             NA
2                                   NA                             NA
3                                   NA                             NA
4                                   NA                             NA
5                                   NA                             NA
6                                   NA                             NA
  consumption_co2_govt_expenditure trade_pct_of_consumption_co2
1                               NA                           NA
2                               NA                           NA
3                               NA                           NA
4                               NA                           NA
5                               NA                           NA
6                               NA                           NA
  cumulative_co2_per_capita
1                  9.077303
2                  9.198948
3                  9.312842
4                  9.454859
5                  9.448713
6                  9.914901

emissions_long <- emissions %>%
  
  # cols tells it to grab the columns from gdp_us_current_prices all the way to cumulative_co2_per_capita
  # names_to tells it to create a new column called "variable" and put all the column headers under it
  # values_to tells it to take all the data contained in the columns and put it under a new column called "value"
  
  pivot_longer(cols = gdp_usd_current_prices:cumulative_co2_per_capita, names_to = "variable", values_to = "value")

We can see that we’ve gone from 31 total columns to 6 columns. But what can we do with this format?

head(emissions_long)

# A tibble: 6 × 6
  country_name iso3c em_dm             year variable                   value
  <chr>        <chr> <chr>            <int> <chr>                      <dbl>
1 Angola       AGO   Emerging Markets  1990 gdp_usd_current_prices      12.6
2 Angola       AGO   Emerging Markets  1990 gdp_ppp_current_prices      19.7
3 Angola       AGO   Emerging Markets  1990 gdp_pc_usd_current_prices  987. 
4 Angola       AGO   Emerging Markets  1990 gdp_pc_ppp_current_prices 1545. 
5 Angola       AGO   Emerging Markets  1990 population                  12.7
6 Angola       AGO   Emerging Markets  1990 govt_expenditure_pct_gdp    NA

We can see in the graph below we’re able to easily compare multiple variables, which is the main point for pivoting long

emissions_long %>%
  filter(year == 2019) %>%
 filter(country_name %in% c("Angola","Argentina","Australia")) %>%
  filter(variable %in% c("govt_expenditure_pct_gdp", "debt_pct_gdp")) %>%
  drop_na() %>%
  #group_by(em_dm) %>%
  #summarize(avg = mean(value)) %>%
  
  ggplot(aes(x = country_name, y = value, fill = variable)) +
  geom_col(position = "dodge")

Let’s look at another example where having data in long format will make our lives easier. We are going to briefly look at the World Bank’s International Debt Statistics (IDS) dataset

IDS <- read.csv("IDS_All.csv")

head(IDS)

  Country.Name Country.Code Counterpart.Area.Name Counterpart.Area.Code
1  Afghanistan          AFG                 World                   WLD
2  Afghanistan          AFG                 World                   WLD
3  Afghanistan          AFG                 World                   WLD
4  Afghanistan          AFG                 World                   WLD
5  Afghanistan          AFG                 World                   WLD
6  Afghanistan          AFG                 World                   WLD
                                                              Series.Name
1           Average grace period on new external debt commitments (years)
2 Average grace period on new external debt commitments, official (years)
3  Average grace period on new external debt commitments, private (years)
4              Average grant element on new external debt commitments (%)
5    Average grant element on new external debt commitments, official (%)
6     Average grant element on new external debt commitments, private (%)
  Series.Code X1970 X1971 X1972 X1973 X1974 X1975 X1976 X1977 X1978 X1979 X1980
1 DT.GPA.DPPG    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2 DT.GPA.OFFT    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3 DT.GPA.PRVT    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4 DT.GRE.DPPG    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5 DT.GRE.OFFT    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6 DT.GRE.PRVT    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
  X1981 X1982 X1983 X1984 X1985 X1986 X1987 X1988 X1989 X1990 X1991 X1992 X1993
1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
  X1994 X1995 X1996 X1997 X1998 X1999 X2000 X2001 X2002 X2003 X2004 X2005 X2006
1    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA 10.25
2    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA 10.25
3    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA  0.00
4    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA 50.62
5    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA 50.62
6    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA  0.00
  X2007 X2008 X2009 X2010 X2011 X2012 X2013 X2014 X2015 X2016 X2017 X2018 X2019
1  8.08  9.96     0  7.83     0     0     0 29.83     0     0 19.84     0 17.90
2  8.08  9.96     0  7.83     0     0     0 29.83     0     0 19.84     0 17.90
3  0.00  0.00     0  0.00     0     0     0  0.00     0     0  0.00     0  0.00
4 43.02 58.01     0 21.21     0     0     0 81.93     0     0 64.26     0 73.08
5 43.02 58.01     0 21.21     0     0     0 81.93     0     0 64.26     0 73.08
6  0.00  0.00     0  0.00     0     0     0  0.00     0     0  0.00     0  0.00
  X2020 X2021 X2022 X2023 X2024 X2025 X2026 X2027 X2028
1     0    NA    NA    NA    NA    NA    NA    NA    NA
2     0    NA    NA    NA    NA    NA    NA    NA    NA
3     0    NA    NA    NA    NA    NA    NA    NA    NA
4     0    NA    NA    NA    NA    NA    NA    NA    NA
5     0    NA    NA    NA    NA    NA    NA    NA    NA
6     0    NA    NA    NA    NA    NA    NA    NA    NA

# For some reason R reads the years in with X's on the front so let's get rid of that
colnames(IDS)<-gsub("X","", colnames(IDS))

head(IDS)

  Country.Name Country.Code Counterpart.Area.Name Counterpart.Area.Code
1  Afghanistan          AFG                 World                   WLD
2  Afghanistan          AFG                 World                   WLD
3  Afghanistan          AFG                 World                   WLD
4  Afghanistan          AFG                 World                   WLD
5  Afghanistan          AFG                 World                   WLD
6  Afghanistan          AFG                 World                   WLD
                                                              Series.Name
1           Average grace period on new external debt commitments (years)
2 Average grace period on new external debt commitments, official (years)
3  Average grace period on new external debt commitments, private (years)
4              Average grant element on new external debt commitments (%)
5    Average grant element on new external debt commitments, official (%)
6     Average grant element on new external debt commitments, private (%)
  Series.Code 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982
1 DT.GPA.DPPG   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
2 DT.GPA.OFFT   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
3 DT.GPA.PRVT   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
4 DT.GRE.DPPG   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
5 DT.GRE.OFFT   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
6 DT.GRE.PRVT   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
  1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997
1   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
2   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
3   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
4   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
5   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
6   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA   NA
  1998 1999 2000 2001 2002 2003 2004 2005  2006  2007  2008 2009  2010 2011
1   NA   NA   NA   NA   NA   NA   NA   NA 10.25  8.08  9.96    0  7.83    0
2   NA   NA   NA   NA   NA   NA   NA   NA 10.25  8.08  9.96    0  7.83    0
3   NA   NA   NA   NA   NA   NA   NA   NA  0.00  0.00  0.00    0  0.00    0
4   NA   NA   NA   NA   NA   NA   NA   NA 50.62 43.02 58.01    0 21.21    0
5   NA   NA   NA   NA   NA   NA   NA   NA 50.62 43.02 58.01    0 21.21    0
6   NA   NA   NA   NA   NA   NA   NA   NA  0.00  0.00  0.00    0  0.00    0
  2012 2013  2014 2015 2016  2017 2018  2019 2020 2021 2022 2023 2024 2025 2026
1    0    0 29.83    0    0 19.84    0 17.90    0   NA   NA   NA   NA   NA   NA
2    0    0 29.83    0    0 19.84    0 17.90    0   NA   NA   NA   NA   NA   NA
3    0    0  0.00    0    0  0.00    0  0.00    0   NA   NA   NA   NA   NA   NA
4    0    0 81.93    0    0 64.26    0 73.08    0   NA   NA   NA   NA   NA   NA
5    0    0 81.93    0    0 64.26    0 73.08    0   NA   NA   NA   NA   NA   NA
6    0    0  0.00    0    0  0.00    0  0.00    0   NA   NA   NA   NA   NA   NA
  2027 2028
1   NA   NA
2   NA   NA
3   NA   NA
4   NA   NA
5   NA   NA
6   NA   NA

# figure out column numbers
colnames(IDS)

 [1] "Country.Name"          "Country.Code"          "Counterpart.Area.Name"
 [4] "Counterpart.Area.Code" "Series.Name"           "Series.Code"          
 [7] "1970"                  "1971"                  "1972"                 
[10] "1973"                  "1974"                  "1975"                 
[13] "1976"                  "1977"                  "1978"                 
[16] "1979"                  "1980"                  "1981"                 
[19] "1982"                  "1983"                  "1984"                 
[22] "1985"                  "1986"                  "1987"                 
[25] "1988"                  "1989"                  "1990"                 
[28] "1991"                  "1992"                  "1993"                 
[31] "1994"                  "1995"                  "1996"                 
[34] "1997"                  "1998"                  "1999"                 
[37] "2000"                  "2001"                  "2002"                 
[40] "2003"                  "2004"                  "2005"                 
[43] "2006"                  "2007"                  "2008"                 
[46] "2009"                  "2010"                  "2011"                 
[49] "2012"                  "2013"                  "2014"                 
[52] "2015"                  "2016"                  "2017"                 
[55] "2018"                  "2019"                  "2020"                 
[58] "2021"                  "2022"                  "2023"                 
[61] "2024"                  "2025"                  "2026"                 
[64] "2027"                  "2028"

# with data like this there's often not a lot of good record keeping the further back you go, so we'll hack off some of these year columns. I'm also not interested in things past 

IDS <- IDS %>%
  select(-c(7:36)) 

# I'm doing this in two parts to validate the column numbers 

IDS <- IDS %>% 
  select(-c(28:35))

head(IDS)

  Country.Name Country.Code Counterpart.Area.Name Counterpart.Area.Code
1  Afghanistan          AFG                 World                   WLD
2  Afghanistan          AFG                 World                   WLD
3  Afghanistan          AFG                 World                   WLD
4  Afghanistan          AFG                 World                   WLD
5  Afghanistan          AFG                 World                   WLD
6  Afghanistan          AFG                 World                   WLD
                                                              Series.Name
1           Average grace period on new external debt commitments (years)
2 Average grace period on new external debt commitments, official (years)
3  Average grace period on new external debt commitments, private (years)
4              Average grant element on new external debt commitments (%)
5    Average grant element on new external debt commitments, official (%)
6     Average grant element on new external debt commitments, private (%)
  Series.Code 2000 2001 2002 2003 2004 2005  2006  2007  2008 2009  2010 2011
1 DT.GPA.DPPG   NA   NA   NA   NA   NA   NA 10.25  8.08  9.96    0  7.83    0
2 DT.GPA.OFFT   NA   NA   NA   NA   NA   NA 10.25  8.08  9.96    0  7.83    0
3 DT.GPA.PRVT   NA   NA   NA   NA   NA   NA  0.00  0.00  0.00    0  0.00    0
4 DT.GRE.DPPG   NA   NA   NA   NA   NA   NA 50.62 43.02 58.01    0 21.21    0
5 DT.GRE.OFFT   NA   NA   NA   NA   NA   NA 50.62 43.02 58.01    0 21.21    0
6 DT.GRE.PRVT   NA   NA   NA   NA   NA   NA  0.00  0.00  0.00    0  0.00    0
  2012 2013  2014 2015 2016  2017 2018  2019 2020
1    0    0 29.83    0    0 19.84    0 17.90    0
2    0    0 29.83    0    0 19.84    0 17.90    0
3    0    0  0.00    0    0  0.00    0  0.00    0
4    0    0 81.93    0    0 64.26    0 73.08    0
5    0    0 81.93    0    0 64.26    0 73.08    0
6    0    0  0.00    0    0  0.00    0  0.00    0

Now let’s pivot this into long format so we can work some time series.

When we do this, year gets changed to the character data type. Something you’ll see often is when you try to use as.numeric to fix this, it will give you an error message that “list object cannot be coerced to double.” Try running the functon below as I do, which you can also find here.

IDS_long <- IDS %>%
  pivot_longer(cols = `2000`:`2020`, names_to = "year", values_to = "value")

# Teal's boutique function to deal with columns labled as characters that tell you they can't be coerced to numeric when trying change datatypes. Time series needs to be put in date or numeric for x-axis

character_num_to_numeric <- function(character_num) {
  character_num %>%
  # take out the commas
  str_remove_all(pattern = ",") %>%
  # take out any blank spaces before or after the number
  str_trim() %>%
  # coerce to numeric
  as.numeric()
}

# run the function with mutate
IDS_long %>%
  mutate(year = character_num_to_numeric(year))

# A tibble: 1,625,169 × 8
   Country.Name Country.Code Counterpart.A…¹ Count…² Serie…³ Serie…⁴  year value
   <chr>        <chr>        <chr>           <chr>   <chr>   <chr>   <dbl> <dbl>
 1 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2000 NA   
 2 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2001 NA   
 3 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2002 NA   
 4 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2003 NA   
 5 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2004 NA   
 6 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2005 NA   
 7 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2006 10.2 
 8 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2007  8.08
 9 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2008  9.96
10 Afghanistan  AFG          World           WLD     Averag… DT.GPA…  2009  0   
# … with 1,625,159 more rows, and abbreviated variable names
#   ¹Counterpart.Area.Name, ²Counterpart.Area.Code, ³Series.Name, ⁴Series.Code

Finally I can show you a time series graph. In later sessions we will go more in depth on how to make the graphs look much nicer.

IDS_long %>%
  filter(Country.Name %in% c("Ghana", "Angola","Mozambique")) %>%
  filter(year > 2010) %>%
  #see why we want to change variable names? I made the mistake of not shortening and it was a pain for the entire project.
  filter(Series.Name == "Average grace period on new external debt commitments (years)") %>%
  
  ggplot(aes(x = year, y = value, group = Country.Name, color = Country.Name)) +
  geom_line()

What if we want to pivot wider and break out all the series names? I maxxed out my computer attempting this (it tried to create over 500 columns. Not advised) so we’ll show it with the emissions dataset instead.

emissions_wide <- emissions_long %>%
  pivot_wider(names_from = variable, values_from = value)

head(emissions_wide)

# A tibble: 6 × 31
  country_name iso3c em_dm  year gdp_u…¹ gdp_p…² gdp_p…³ gdp_p…⁴ popul…⁵ govt_…⁶
  <chr>        <chr> <chr> <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
1 Angola       AGO   Emer…  1990   12.6     19.7    987.   1545.    12.7      NA
2 Angola       AGO   Emer…  1991   12.2     22.8    929.   1738.    13.1      NA
3 Angola       AGO   Emer…  1992    9.40    26.0    695.   1922.    13.5      NA
4 Angola       AGO   Emer…  1993    6.82    29.5    490.   2120.    13.9      NA
5 Angola       AGO   Emer…  1994    4.96    33.3    346.   2323.    14.3      NA
6 Angola       AGO   Emer…  1995    6.20    37.5    420.   2542.    14.8      NA
# … with 21 more variables: debt_pct_gdp <dbl>, territorial_co2 <dbl>,
#   trade_co2 <dbl>, consumption_co2 <dbl>, cumulative_co2 <dbl>,
#   debt_usd <dbl>, govt_expenditure_usd <dbl>,
#   territorial_co2_per_capita <dbl>, trade_co2_per_capita <dbl>,
#   consumption_co2_per_capita <dbl>, territorial_co2_per_gdp <dbl>,
#   trade_co2_per_gdp <dbl>, consumption_co2_per_gdp <dbl>,
#   territorial_co2_per_debt <dbl>, trade_co2_per_debt <dbl>, …