Loading R packages needed for data cleaning and data plots

library(tidyverse) 
library(tidyquant)
library(lubridate)
library(stringr)
library(ggplot2)

library(esquisse)
library(here)
library(janitor)
library(ggthemes)

library(viridisLite)
library(plotly)
library(plyr)
library(tidyr)
library(scales)

library(countrycode)
library(wbstats)

options(scipen=10) # forces regular notation vs scientific notation (ie5)

## Clean IAM_dataset

First, I remove all the years that I do not need for my analysis: the years from 1990 to 2019, and the year 2110, which mainly contains missing values because very few output variables were generated for that year.

#Remove all year columns preceding 2020
IAM_data <- IAM_data %>%
  select(-starts_with(c("19","200","201")))

#Remove year column 2110
IAM_data <- IAM_data %>%
  select(-'2110')

Variables in the IAM dataset have different categorization, but all categorization was aggregated into the ‘Variable’ column. To view the broad variable categories in the dataset, I split the ‘Variable’ column at the first delimiter "|" (sep = "\\|"), keeping the original ‘Variable’ column (remove = FALSE). The variable category is now in the column ‘var.category’, and the sub-variable is in ‘variable’. Running the function unique() shows that there are 38 variable categories.

IAM_data <- separate(IAM_data, Variable, into = c("var.category","variable"),sep = "\\|", remove = FALSE, extra = "merge")
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 8946 rows [69, 371, 396, 472, 670, 812, 821, 857, 888, 946, 1310, 1614, 1639, 1749, 2052, 2076, 2322, 2520, 2660, 2669, ...].
unique(IAM_data$var.category) #list of 38 variable categories in IAM_Data
 [1] "GDP"                                          
 [2] "Post-processed"                               
 [3] "net GDP"                                      
 [4] "Agricultural Demand"                          
 [5] "Capacity Additions"                           
 [6] "Capacity"                                     
 [7] "Capital Cost"                                 
 [8] "Carbon Sequestration"                         
 [9] "Emissions"                                    
[10] "Energy Service"                               
[11] "Final Energy"                                 
[12] "Investment"                                   
[13] "Land Cover"                                   
[14] "Primary Energy"                               
[15] "Production"                                   
[16] "Secondary Energy"                             
[17] "Trade"                                        
[18] "Population"                                   
[19] "Price"                                        
[20] "Revenue"                                      
[21] "Useful Energy"                                
[22] "AR6 climate diagnostics"                      
[23] "Concentration"                                
[24] "Forcing"                                      
[25] "Temperature"                                  
[26] "Agricultural Production"                      
[27] "Consumption"                                  
[28] "Fertilizer Use"                               
[29] "Food Demand"                                  
[30] "Forestry Demand"                              
[31] "Forestry Production"                          
[32] "Water Consumption"                            
[33] "Water Withdrawal"                             
[34] "Yield"                                        
[35] "Damage factor"                                
[36] "Policy Cost"                                  
[37] "Macro-Economic Climate Damage"                
[38] "Policy Cost and Macro-Economic Climate Damage"

The missing values in the newly created ‘variable’ column come from variables that do not have sub-labels. These are typically aggregate values of sub-categories and therefore do not contain the delimiter “|”. I create a new dataframe and keep only the rows that have NA in the ‘variable’ column to see these aggregate variables.

IAM_data2 <- IAM_data %>%
  filter(is.na(variable))

unique(IAM_data2$var.category) #variables that do not have delimiter "|"
 [1] "Agricultural Demand"     "Land Cover"              "Secondary Energy"       
 [4] "Final Energy"            "Population"              "Primary Energy"         
 [7] "Useful Energy"           "Forcing"                 "Agricultural Production"
[10] "Consumption"             "Food Demand"             "Water Consumption"      
[13] "Damage factor"           "Investment"             

Comparing region coverage between IAM models

Create a new dataframe for each IAM model.

unique(IAM_data$Model)

IAM_GCAM <- IAM_data %>%
  filter(Model == "GCAM 5.3+ NGFS")

IAM_MESSAGE <- IAM_data %>%
  filter(Model == "MESSAGEix-GLOBIOM 1.1-M-R12")

IAM_REMIND <- IAM_data %>%
  filter(Model == "REMIND-MAgPIE 3.0-4.4")

Create a vector containing the list of countries which have output from each model. These vectors will be used to compare which regions are not included in one model but are included in another model.

IAM_GCAM.country <- IAM_GCAM
 # filter(str_detect(Region,"(R5)"))
 
IAM_GCAM.country <- IAM_GCAM.country[!grepl('(R5)|GCAM|World', IAM_GCAM.country$Region),]

IAM_GCAM.country <- unique(IAM_GCAM.country$Region)
IAM_MESSAGE.country <- IAM_MESSAGE
IAM_MESSAGE.country <- IAM_MESSAGE.country[!grepl('(R5)|MESSAGE|World', IAM_MESSAGE.country$Region),]

IAM_MESSAGE.country <- unique(IAM_MESSAGE.country$Region)
IAM_REMIND.country <- IAM_REMIND
IAM_REMIND.country <- IAM_REMIND.country[!grepl('(R5)|REMIND|World', IAM_REMIND.country$Region),]

IAM_REMIND.country <- unique(IAM_REMIND.country$Region)

Show all overlapping individual countries between the 3 IAM models

Reduce(intersect, list(IAM_GCAM.country,IAM_MESSAGE.country,IAM_REMIND.country))
  [1] "AFG" "ALB" "DZA" "AGO" "ARG" "ARM" "ABW" "AUS" "AUT" "AZE" "BHS" "BHR" "BGD"
 [14] "BRB" "BLR" "BEL" "BLZ" "BEN" "BTN" "BOL" "BIH" "BWA" "BRN" "BGR" "BFA" "BDI"
 [27] "KHM" "CMR" "CAN" "CPV" "CAF" "TCD" "CHL" "COL" "COM" "COG" "COD" "CRI" "HRV"
 [40] "CUB" "CYP" "CZE" "CIV" "DNK" "DJI" "DOM" "EU"  "ECU" "EGY" "SLV" "GNQ" "ERI"
 [53] "EST" "ETH" "BRA" "FJI" "FIN" "FRA" "GAB" "GMB" "GEO" "DEU" "GHA" "GRC" "GTM"
 [66] "GIN" "GNB" "GUY" "HTI" "HND" "HKG" "HUN" "ISL" "IDN" "IRN" "IRQ" "IRL" "ISR"
 [79] "ITA" "JAM" "JOR" "KAZ" "KEN" "KOR" "KWT" "KGZ" "LAO" "LVA" "LBN" "LSO" "LBR"
 [92] "LBY" "LTU" "LUX" "MAC" "MKD" "MDG" "MWI" "MYS" "MLI" "MLT" "MRT" "MDA" "MNG"
[105] "MNE" "MAR" "MOZ" "MMR" "NAM" "NPL" "NLD" "NCL" "NZL" "NIC" "NER" "NGA" "NOR"
[118] "OMN" "PAK" "PSE" "PAN" "PNG" "PRY" "CHN" "PER" "PHL" "POL" "PRT" "PRI" "QAT"
[131] "IND" "ROU" "RUS" "RWA" "LCA" "VCT" "WSM" "STP" "SAU" "SEN" "SRB" "SLE" "SGP"
[144] "SVK" "SVN" "SLB" "SOM" "ZAF" "ESP" "LKA" "JPN" "SDN" "SUR" "SWZ" "SWE" "CHE"
[157] "SYR" "TWN" "TJK" "TZA" "THA" "TLS" "TGO" "TTO" "TUN" "TUR" "TKM" "UGA" "UKR"
[170] "ARE" "GBR" "MEX" "USA" "URY" "UZB" "VUT" "VEN" "VNM" "YEM" "ZMB" "ZWE"

Show countries that are not included in the models. Output shows that countries with ISO codes “PYF” “MDV” “MUS” “TON” are included in the MESSAGE model but not in GCAM and REMIND. GCAM and REMIND models also have the same country coverage.

setdiff(IAM_MESSAGE.country,IAM_GCAM.country) #countries included in MESSAGE but NOT in GCAM
[1] "PYF" "MDV" "MUS" "TON"
setdiff(IAM_MESSAGE.country,IAM_REMIND.country) #countries included in MESSAGE but NOT in REMIND
[1] "PYF" "MDV" "MUS" "TON"
c(setdiff(IAM_GCAM.country, IAM_REMIND.country), setdiff(IAM_REMIND.country, IAM_GCAM.country)) #Complete overlap of individual countries between GCAM and REMIND
character(0)

Data cleaning to evaluate emission pathways

Create a new dataframe for world-level data.

IAM_world <- IAM_data %>%
  filter(Region == "World")
  
#Show variables that have no time skip
na_rows = IAM_world %>% 
  is.na() %>% 
  rowSums() > 0

processed.IAM_world <- IAM_world %>% 
  filter(!na_rows)

Create a new dataframe for World emissions data. The variable “Emissions|CO2” shows annual emissions in Mt CO2, excluding carbon removal from CCS. Because the values in Mt are too large and are not suitable for facet charts, a new column, Gt.CO2.peryear, is created by converting the Mt.CO2.peryear values from Mt CO2 to Gt CO2.

IAM_world.emissions <- IAM_world %>%
  filter(Variable == "Emissions|CO2") %>%
  pivot_longer('2020':'2100', names_to = "Year", values_to = "Mt.CO2.peryear") %>%
  mutate(Gt.CO2.peryear = Mt.CO2.peryear*0.001) %>%
  mutate(Year = as.numeric(Year)) %>%
  filter(!is.na(Gt.CO2.peryear))
 #filter(Scenario == c("Current Policies","Delayed Transition")) %>%

glimpse(IAM_world.emissions)
Rows: 414
Columns: 10
$ Model          <chr> "GCAM 5.3+ NGFS", "GCAM 5.3+ NGFS", "GCAM 5.3+ NGFS", "GCAM 5.3…
$ Scenario       <chr> "Below 2°C", "Below 2°C", "Below 2°C", "Below 2°C", "Below 2°C"…
$ Region         <chr> "World", "World", "World", "World", "World", "World", "World", …
$ Variable       <chr> "Emissions|CO2", "Emissions|CO2", "Emissions|CO2", "Emissions|C…
$ var.category   <chr> "Emissions", "Emissions", "Emissions", "Emissions", "Emissions"…
$ variable       <chr> "CO2", "CO2", "CO2", "CO2", "CO2", "CO2", "CO2", "CO2", "CO2", …
$ Unit           <chr> "Mt CO2/yr", "Mt CO2/yr", "Mt CO2/yr", "Mt CO2/yr", "Mt CO2/yr"…
$ Year           <dbl> 2020, 2025, 2030, 2035, 2040, 2045, 2050, 2055, 2060, 2065, 207…
$ Mt.CO2.peryear <dbl> 38613.0326, 35588.8368, 31691.6707, 26823.8586, 21933.7247, 170…
$ Gt.CO2.peryear <dbl> 38.6130326, 35.5888368, 31.6916707, 26.8238586, 21.9337247, 17.…

Rename scenarios for consistency with NGFS’s published presentation and easier chart visualization

IAM_world.emissions$Scenario[IAM_world.emissions$Scenario == "Nationally Determined Contributions (NDCs)"] <- "NDCs"
IAM_world.emissions$Scenario[IAM_world.emissions$Scenario == "Delayed transition"] <- "Delayed 2°C"

Data cleaning for share of primary energy from fossil fuels, renewables & nuclear

I want to display how the percentage share of fossil fuels, renewables and nuclear energy in primary energy consumption change over time. The following steps are to calculate this percentage share and how it changes over time.

First, I create new dataframe containing the 5 variables needed for my calculation. Data is pivoted from wider to longer so that there’s a new column for Year and a single column for values.

IAM_world.energyshare <- IAM_world %>%
  pivot_longer('2020':'2100', names_to = "Year")%>%
  filter(Variable %in% c("Primary Energy","Primary Energy|Fossil","Primary Energy|Biomass","Primary Energy|Non-Biomass Renewables","Primary Energy|Nuclear")) %>%
  filter(!is.na(value)) %>%
  select(-var.category, -variable,-Region)

unique(IAM_world.energyshare$Variable)
[1] "Primary Energy"                        "Primary Energy|Biomass"               
[3] "Primary Energy|Fossil"                 "Primary Energy|Non-Biomass Renewables"
[5] "Primary Energy|Nuclear"               

Then I create another dataframe where I do my calculation. First I pivot this data from longer to wider so that new columns are created based on the Primary energy variable type.

Having these columns then allow me to create new columns containing the percentage share, calculated by dividing the primary energy type (in EJ/year) by total primary energy (in EJ/year). I then remove these columns to leave only the percentage share columns. The percentage share dataframe is pivoted back from wider to longer.

energy.share <- IAM_world.energyshare %>%
  pivot_wider(names_from = Variable, values_from = value) %>%
  rename(Primary.Energy = 'Primary Energy',
         Primary.Energy.Biomass = 'Primary Energy|Biomass',
         Primary.Energy.NonBiomass.Renewables = 'Primary Energy|Non-Biomass Renewables',
         Primary.Energy.Nuclear = 'Primary Energy|Nuclear',
         Primary.Energy.Fossil = 'Primary Energy|Fossil') %>%
  mutate(Primary.Energy.Renewables = (Primary.Energy.Biomass + 
                                        Primary.Energy.NonBiomass.Renewables)) %>%
  mutate(Fossil.share = Primary.Energy.Fossil/Primary.Energy,
         Nuclear.share = Primary.Energy.Nuclear/Primary.Energy,
         Renewables.share = Primary.Energy.Renewables/Primary.Energy) %>%
  select(-Primary.Energy.Biomass, -Primary.Energy.NonBiomass.Renewables, -Primary.Energy,
         -Primary.Energy.Nuclear, -Primary.Energy.Fossil, -Primary.Energy.Renewables)


energy.share$Unit[energy.share$Unit == "EJ/yr"] <- "percent"

energy.share <- energy.share %>%
  pivot_longer(c(Fossil.share, Nuclear.share, Renewables.share),
               names_to = "Energy.source", values_to = "Percent") %>%
  mutate(Year = as.numeric(Year))
glimpse(energy.share)
Rows: 1,242
Columns: 6
$ Model         <chr> "GCAM 5.3+ NGFS", "GCAM 5.3+ NGFS", "GCAM 5.3+ NGFS", "GCAM 5.3+…
$ Scenario      <chr> "Below 2°C", "Below 2°C", "Below 2°C", "Below 2°C", "Below 2°C",…
$ Unit          <chr> "percent", "percent", "percent", "percent", "percent", "percent"…
$ Year          <dbl> 2020, 2020, 2020, 2025, 2025, 2025, 2030, 2030, 2030, 2035, 2035…
$ Energy.source <chr> "Fossil.share", "Nuclear.share", "Renewables.share", "Fossil.sha…
$ Percent       <dbl> 0.84890693, 0.01580466, 0.13528841, 0.81794416, 0.01686257, 0.16…

Data cleaning for energy mix analysis

Create new dataframe for energy mix in 2100 under 2 scenarios: “Below 2°C” and “Current Policies”

IAM_world.2100energymix <- IAM_world %>%
  pivot_longer('2020':'2100', names_to = "Year")%>%
  filter(Variable %in% c("Primary Energy|Coal","Primary Energy|Gas","Primary Energy|Oil","Primary Energy|Biomass","Primary Energy|Non-Biomass Renewables","Primary Energy|Nuclear")) %>%
  filter(!is.na(value)) %>%
  select(-var.category, -variable,-Region) %>%
  filter(Year == "2100") %>%
  filter(Scenario %in% c("Below 2°C", "Current Policies"))

# Reordering Scenario factor levels
IAM_world.2100energymix$Scenario <- factor(IAM_world.2100energymix$Scenario, levels = c("Current Policies","Below 2°C"))

#Rename Variable into Energy source simplified names
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Coal"] <- "Coal"
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Gas"] <- "Gas"
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Oil"] <- "Oil"
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Biomass"] <- "Biomass"
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Non-Biomass Renewables"] <- "Renewables"
IAM_world.2100energymix$Variable[IAM_world.2100energymix$Variable == "Primary Energy|Nuclear"] <- "Nuclear"
IAM_world.2100energymix

Data cleaning for CCS deployment analysis

Create new dataframe for cumulative CCS deployment from 2020 to 2100. Data is grouped by ‘Variable’ column using the group_by function to facilitate showing the cumulative value in the stacked bar chart later on. Variables are also renamed.

IAM_world.CCS <- IAM_world %>%
  filter(Variable %in% c("Carbon Sequestration|Land Use","Carbon Sequestration|CCS|Fossil","Carbon Sequestration|CCS|Biomass","Carbon Sequestration|CCS|Industrial Processes")) %>%
  select(-var.category, -variable,-Region) %>%
  pivot_longer('2020':'2100', names_to = "Year") %>%
  filter(!is.na(value)) %>%
  mutate(Gt.CO2.peryear = value*0.001)

IAM_world.CCS$Scenario[IAM_world.CCS$Scenario == "Nationally Determined Contributions (NDCs)"] <- "NDCs"
IAM_world.CCS$Scenario[IAM_world.CCS$Scenario == "Delayed transition"] <- "Delayed 2°C"

IAM_world.CCS$Variable[IAM_world.CCS$Variable == "Carbon Sequestration|Land Use"] <- "Land-based sequestration"
IAM_world.CCS$Variable[IAM_world.CCS$Variable == "Carbon Sequestration|CCS|Biomass"] <- "Bioenergy with CCS"
IAM_world.CCS$Variable[IAM_world.CCS$Variable == "Carbon Sequestration|CCS|Fossil"] <- "Fossil fuel power plants fitted with CCS"
IAM_world.CCS$Variable[IAM_world.CCS$Variable == "Carbon Sequestration|CCS|Industrial Processes"] <- "Industrial processes fitted with CCS"

# Reordering group factor levels
IAM_world.CCS$Scenario <- factor(IAM_world.CCS$Scenario, levels = c("Current Policies","Divergent Net Zero","Net Zero 2050","NDCs","Delayed 2°C","Below 2°C"))

IAM_world.CCS <- IAM_world.CCS %>%
  group_by(Variable)
IAM_world.CCS
saveRDS(object = IAM_world.emissions, file = "IAM_world.emissions.rds")
saveRDS(object = IAM_world.CCS, file = "IAM_world.CCS.rds")
saveRDS(object = IAM_world.2100energymix, file = "IAM_world.2100energymix.rds")
saveRDS(object = energy.share, file = "world.energy.share.rds")
---
title: "Data preparation for analysis and creating charts"
output: html_notebook
---

Loading R packages needed for data cleaning and data plots
```{r}
# plyr is attached BEFORE the tidyverse on purpose: when plyr is attached after
# dplyr it masks dplyr::rename(), mutate(), summarise(), arrange() and count(),
# and the dplyr-style calls used later in this notebook stop working.
library(plyr)

library(tidyverse) 
library(tidyquant)
library(lubridate)
library(stringr)
library(ggplot2)

library(esquisse)
library(here)
library(janitor)
library(ggthemes)

library(viridisLite)
library(plotly)
library(tidyr)
library(scales)

library(countrycode)
library(wbstats)

options(scipen=10) # forces regular notation vs scientific notation (ie5)
```


## Clean IAM_dataset

First, I remove all the years that I do not need for my analysis: the years from 1990 to 2019, and the year 2110, which mainly contains missing values because very few output variables were generated for that year.

```{r}
# Drop the year columns that are not used in the analysis, in a single step:
#   * every column whose name starts with "19", "200" or "201"
#     (i.e. all years before 2020)
#   * the 2110 column
IAM_data <- IAM_data %>%
  select(
    -starts_with(c("19", "200", "201")),
    -"2110"
  )

```

Variables in the IAM dataset have different categorization, but all categorization was aggregated into the 'Variable' column.
To view the broad variable categories in the dataset, I split the Variable column based on first delimiter "|" (sep = "\\|"), keeping the original 'Variable' column (remove = FALSE). Variable category is now under the column 'var.category', and the sub-variable is under 'variable'.
Running the function unique shows that there are 38 variable categories.
```{r}
# Split `Variable` at its first "|" into a broad category (`var.category`) and
# the remaining sub-label (`variable`):
#   * remove = FALSE   keeps the original `Variable` column
#   * extra = "merge"  leaves any further "|" untouched inside `variable`
#   * fill = "right"   states explicitly that names without a "|" get NA in
#                      `variable`; the default pads with NA as well but emits
#                      a warning for every such row
IAM_data <- IAM_data %>%
  separate(
    Variable,
    into = c("var.category", "variable"),
    sep = "\\|",
    remove = FALSE,
    extra = "merge",
    fill = "right"
  )

# List the distinct variable categories in IAM_data
unique(IAM_data$var.category)
```
The missing values in the newly created 'variable' column come from variables that do not have sub-labels. These are typically aggregate values of sub-categories and therefore do not contain the delimiter "|". I create a new dataframe and keep only the rows that have NA in the 'variable' column to see these aggregate variables.
```{r}
# Rows whose `variable` sub-label is NA, i.e. whose `Variable` name contained
# no "|" delimiter
IAM_data2 <- filter(IAM_data, is.na(variable))

# Categories that occur without a sub-label
IAM_data2 %>%
  pull(var.category) %>%
  unique()
```

## Comparing region coverage between IAM models
Create a new dataframe for each IAM model.
```{r}
# Model names present in the dataset
IAM_data %>%
  pull(Model) %>%
  unique()

# One data frame per model, selected by exact match on the `Model` column
IAM_GCAM <- filter(IAM_data, Model == "GCAM 5.3+ NGFS")
IAM_MESSAGE <- filter(IAM_data, Model == "MESSAGEix-GLOBIOM 1.1-M-R12")
IAM_REMIND <- filter(IAM_data, Model == "REMIND-MAgPIE 3.0-4.4")
```

Create a vector containing the list of countries which have output from each model. These vectors will be used to compare which regions are not included in one model but are included in another model.
```{r}
# Distinct GCAM regions that remain after dropping every region whose name
# matches "(R5)", "GCAM" or "World".
# Note: in a regular expression "(R5)" is a capture group, so it matches any
# name containing "R5", with or without the parentheses.
IAM_GCAM.country <- IAM_GCAM %>%
  filter(!grepl("(R5)|GCAM|World", Region)) %>%
  pull(Region) %>%
  unique()

```

```{r}
# Distinct MESSAGE regions that remain after dropping every region whose name
# matches "(R5)", "MESSAGE" or "World".
# Note: in a regular expression "(R5)" is a capture group, so it matches any
# name containing "R5", with or without the parentheses.
IAM_MESSAGE.country <- IAM_MESSAGE %>%
  filter(!grepl("(R5)|MESSAGE|World", Region)) %>%
  pull(Region) %>%
  unique()
```

```{r}
# Distinct REMIND regions that remain after dropping every region whose name
# matches "(R5)", "REMIND" or "World".
# Note: in a regular expression "(R5)" is a capture group, so it matches any
# name containing "R5", with or without the parentheses.
IAM_REMIND.country <- IAM_REMIND %>%
  filter(!grepl("(R5)|REMIND|World", Region)) %>%
  pull(Region) %>%
  unique()
```

Show all overlapping individual countries between the 3 IAM models
```{r}
# Regions present in all three country vectors (GCAM, then MESSAGE, then REMIND)
intersect(
  intersect(IAM_GCAM.country, IAM_MESSAGE.country),
  IAM_REMIND.country
)
```

Show countries that are not included in the models. Output shows that countries with ISO codes "PYF" "MDV" "MUS" "TON" are included in the MESSAGE model but not in GCAM and REMIND. GCAM and REMIND models also have the same country coverage.
```{r}
# In MESSAGE but not in GCAM
setdiff(IAM_MESSAGE.country, IAM_GCAM.country)

# In MESSAGE but not in REMIND
setdiff(IAM_MESSAGE.country, IAM_REMIND.country)

# Symmetric difference between GCAM and REMIND; an empty result means the two
# country vectors contain exactly the same entries
union(
  setdiff(IAM_GCAM.country, IAM_REMIND.country),
  setdiff(IAM_REMIND.country, IAM_GCAM.country)
)

```

## Data cleaning to evaluate emission pathways
Create a new dataframe for world-level data.
```{r}
# Keep only the rows reported for the "World" region.
IAM_world <- IAM_data %>%
  filter(Region == "World")

# Flag rows with a gap in their time series, i.e. at least one missing value in
# a year column (columns whose name is exactly four digits).
# Only the year columns are tested: `variable` is NA by construction for every
# `Variable` name without a "|" sub-label, so testing all columns would flag
# those rows even when their time series is complete.
na_rows <- IAM_world %>%
  select(matches("^[0-9]{4}$")) %>%
  is.na() %>%
  rowSums() > 0

# World rows that have a value in every year column
processed.IAM_world <- IAM_world %>%
  filter(!na_rows)

```

Create a new dataframe for World emissions data. The variable "Emissions|CO2" shows annual emissions in Mt CO2, excluding carbon removal from CCS. Because the values in Mt are too large and are not suitable for facet charts, a new column, Gt.CO2.peryear, is created by converting the Mt.CO2.peryear values from Mt CO2 to Gt CO2.
```{r}
# World rows of "Emissions|CO2" in long form: one row per year column
# (2020 to 2100) that has a value.
#   Mt.CO2.peryear  value as stored in the year columns
#   Gt.CO2.peryear  the same value multiplied by 0.001
#   Year            column name converted from text to a number
# To restrict the table to selected scenarios, add a step such as
#   filter(Scenario %in% c("Current Policies", "Delayed transition"))
IAM_world.emissions <- IAM_world %>%
  filter(Variable == "Emissions|CO2") %>%
  pivot_longer(
    cols = `2020`:`2100`,
    names_to = "Year",
    values_to = "Mt.CO2.peryear"
  ) %>%
  filter(!is.na(Mt.CO2.peryear)) %>%
  mutate(
    Gt.CO2.peryear = Mt.CO2.peryear * 0.001,
    Year = as.numeric(Year)
  )

IAM_world.emissions %>%
  glimpse()
```

Rename scenarios for consistency with NGFS's published presentation and easier chart visualization
```{r}
# Shorten two scenario labels; every other label is left unchanged
IAM_world.emissions$Scenario <- dplyr::recode(
  IAM_world.emissions$Scenario,
  "Nationally Determined Contributions (NDCs)" = "NDCs",
  "Delayed transition" = "Delayed 2°C"
)
```

## Data cleaning for share of primary energy from fossil fuels, renewables & nuclear
I want to display how the percentage share of fossil fuels, renewables and nuclear energy in primary energy consumption change over time. The following steps are to calculate this percentage share and how it changes over time.

First, I create new dataframe containing the 5 variables needed for my calculation. Data is pivoted from wider to longer so that there's a new column for Year and a single column for values.
```{r}
# Long-form table of the five primary-energy variables needed for the share
# calculation (total, fossil, biomass, non-biomass renewables, nuclear).
# Rows and helper columns are dropped BEFORE reshaping so that only the five
# variables are pivoted instead of the whole world table; the result is the
# same as pivoting first and filtering afterwards.
IAM_world.energyshare <- IAM_world %>%
  filter(
    Variable %in% c(
      "Primary Energy",
      "Primary Energy|Fossil",
      "Primary Energy|Biomass",
      "Primary Energy|Non-Biomass Renewables",
      "Primary Energy|Nuclear"
    )
  ) %>%
  select(-var.category, -variable, -Region) %>%
  pivot_longer(`2020`:`2100`, names_to = "Year") %>%
  filter(!is.na(value))

# Check which of the requested variables are present
unique(IAM_world.energyshare$Variable)
```

Then I create another dataframe where I do my calculation. First I pivot this data from longer to wider so that new columns are created based on the Primary energy variable type.

Having these columns then allow me to create new columns containing the percentage share, calculated by dividing the primary energy type (in EJ/year) by total primary energy (in EJ/year). I then remove these columns to leave only the percentage share columns. The percentage share dataframe is pivoted back from wider to longer.
```{r}
# Share of fossil, nuclear and renewable energy in total primary energy, per
# Model / Scenario / Year.
#
# The dplyr verbs are namespace-qualified on purpose: this notebook attaches
# plyr after the tidyverse, so a bare rename() resolves to plyr::rename(),
# which does not accept `new = "old"` pairs and stops with an
# "unused arguments" error.
#
# Steps:
#   1. one column per primary-energy variable
#   2. syntactic column names
#   3. renewables = biomass + non-biomass renewables
#   4. each source divided by total primary energy
#   5. keep only the share columns and return to long form
#
# Note: the shares are stored as fractions of the total (source / total); they
# are not multiplied by 100, although the column is named `Percent` and the
# unit label is set to "percent".
energy.share <- IAM_world.energyshare %>%
  pivot_wider(names_from = Variable, values_from = value) %>%
  dplyr::rename(
    Primary.Energy = "Primary Energy",
    Primary.Energy.Biomass = "Primary Energy|Biomass",
    Primary.Energy.NonBiomass.Renewables = "Primary Energy|Non-Biomass Renewables",
    Primary.Energy.Nuclear = "Primary Energy|Nuclear",
    Primary.Energy.Fossil = "Primary Energy|Fossil"
  ) %>%
  dplyr::mutate(
    Primary.Energy.Renewables = Primary.Energy.Biomass +
      Primary.Energy.NonBiomass.Renewables,
    Fossil.share = Primary.Energy.Fossil / Primary.Energy,
    Nuclear.share = Primary.Energy.Nuclear / Primary.Energy,
    Renewables.share = Primary.Energy.Renewables / Primary.Energy,
    # relabel the unit of the derived columns
    Unit = replace(Unit, Unit == "EJ/yr", "percent")
  ) %>%
  dplyr::select(
    -Primary.Energy.Biomass, -Primary.Energy.NonBiomass.Renewables,
    -Primary.Energy, -Primary.Energy.Nuclear, -Primary.Energy.Fossil,
    -Primary.Energy.Renewables
  ) %>%
  pivot_longer(
    c(Fossil.share, Nuclear.share, Renewables.share),
    names_to = "Energy.source",
    values_to = "Percent"
  ) %>%
  dplyr::mutate(Year = as.numeric(Year))

```
```{r}
# Inspect the columns and first values of the share table
energy.share %>%
  glimpse()
```

## Data cleaning for energy mix analysis
Create new dataframe for energy mix in 2100 under 2 scenarios: "Below 2°C" and "Current Policies"
```{r}
# Primary-energy mix in 2100 for the "Below 2°C" and "Current Policies"
# scenarios.
# Rows are restricted to the six energy variables and the two scenarios BEFORE
# reshaping, so only the needed rows are pivoted; the result is the same as
# pivoting the whole world table first and filtering afterwards.
# dplyr::mutate() is namespace-qualified because this notebook attaches plyr
# after the tidyverse, which masks the bare mutate().
IAM_world.2100energymix <- IAM_world %>%
  filter(
    Variable %in% c(
      "Primary Energy|Coal",
      "Primary Energy|Gas",
      "Primary Energy|Oil",
      "Primary Energy|Biomass",
      "Primary Energy|Non-Biomass Renewables",
      "Primary Energy|Nuclear"
    ),
    Scenario %in% c("Below 2°C", "Current Policies")
  ) %>%
  select(-var.category, -variable, -Region) %>%
  pivot_longer(`2020`:`2100`, names_to = "Year") %>%
  # `Year` is still text at this point, hence the comparison with "2100"
  filter(Year == "2100", !is.na(value)) %>%
  dplyr::mutate(
    # fix the scenario order: "Current Policies" first, then "Below 2°C"
    Scenario = factor(Scenario, levels = c("Current Policies", "Below 2°C")),
    # simplified energy-source names
    Variable = dplyr::recode(
      Variable,
      "Primary Energy|Coal" = "Coal",
      "Primary Energy|Gas" = "Gas",
      "Primary Energy|Oil" = "Oil",
      "Primary Energy|Biomass" = "Biomass",
      "Primary Energy|Non-Biomass Renewables" = "Renewables",
      "Primary Energy|Nuclear" = "Nuclear"
    )
  )
```
```{r}
# Display the 2100 energy-mix table in the notebook output
IAM_world.2100energymix
```

## Data cleaning for CCS deployment analysis
Create new dataframe for cumulative CCS deployment from 2020 to 2100. Data is grouped by 'Variable' column using the group_by function to facilitate showing the cumulative value in the stacked bar chart later on. Variables are also renamed.
```{r}
# World-level carbon sequestration table, built in one pipeline:
#   1. keep the four sequestration variables and drop the helper columns
#   2. reshape the 2020-2100 year columns into long form and drop missing values
#   3. add the value multiplied by 0.001 as `Gt.CO2.peryear`
#      (assumes `value` is in Mt CO2/yr - TODO confirm against the Unit column)
#   4. shorten the scenario labels and fix their order as factor levels
#   5. replace the variable names with descriptive labels
#   6. group by `Variable`; the result stays grouped when it is saved
IAM_world.CCS <- IAM_world %>%
  filter(
    Variable %in% c(
      "Carbon Sequestration|Land Use",
      "Carbon Sequestration|CCS|Fossil",
      "Carbon Sequestration|CCS|Biomass",
      "Carbon Sequestration|CCS|Industrial Processes"
    )
  ) %>%
  select(-var.category, -variable, -Region) %>%
  pivot_longer(`2020`:`2100`, names_to = "Year") %>%
  filter(!is.na(value)) %>%
  dplyr::mutate(
    Gt.CO2.peryear = value * 0.001,
    Scenario = factor(
      dplyr::recode(
        Scenario,
        "Nationally Determined Contributions (NDCs)" = "NDCs",
        "Delayed transition" = "Delayed 2°C"
      ),
      levels = c(
        "Current Policies", "Divergent Net Zero", "Net Zero 2050",
        "NDCs", "Delayed 2°C", "Below 2°C"
      )
    ),
    Variable = dplyr::recode(
      Variable,
      "Carbon Sequestration|Land Use" = "Land-based sequestration",
      "Carbon Sequestration|CCS|Biomass" = "Bioenergy with CCS",
      "Carbon Sequestration|CCS|Fossil" = "Fossil fuel power plants fitted with CCS",
      "Carbon Sequestration|CCS|Industrial Processes" = "Industrial processes fitted with CCS"
    )
  ) %>%
  group_by(Variable)
```
```{r}
# Display the carbon sequestration table in the notebook output
IAM_world.CCS
```

```{r}
# Write each prepared data frame to its own .rds file. The file names are
# relative paths, so the files are created in the current working directory.
invisible(Map(
  function(object, file) saveRDS(object = object, file = file),
  list(
    IAM_world.emissions,
    IAM_world.CCS,
    IAM_world.2100energymix,
    energy.share
  ),
  c(
    "IAM_world.emissions.rds",
    "IAM_world.CCS.rds",
    "IAM_world.2100energymix.rds",
    "world.energy.share.rds"
  )
))

```




