#1. Overlook The goal of this R Markdown notebook is to record the data cleaning process for the project analyzing energy price, co2 emission, and policy efficiency.

##1.1 Framework ###1.1.1 Locate and load dataset Country group (for categorical analysis) End-use energy price in USD: This data focuses on advanced economy. Economic indicator (CPI, PPP) to control. Greenhouse Gas Emission ###1.1.2 Merge and clean See code. ###1.1.3 Ouput This dataset would exclude the seasonal elements, while the time series analysis would be based on the initial end-use price database.

##1.2 Characteristics ###1.2.1 Country & Year ###1.2.2 Energy Product ###1.2.3 Emissions

#2. Code ##2.1 load packages

#Load our packages (haven't got the chance to use all of them in this note)
library(tidyverse) 
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.0      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## 
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## Attaching package: 'xts'
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## 
## Attaching package: 'PerformanceAnalytics'
## 
## The following object is masked from 'package:graphics':
## 
##     legend
## 
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(lubridate)


library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(esquisse)
library(here)
## here() starts at /Users/zhayiran/Library/CloudStorage/OneDrive-JohnsHopkins/Sustainable Finance/final_data
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggthemes)
library(ggrepel)
library(gt)

library(countrycode)
library(wbstats)
library(readxl)

options(scipen=10) # forces regular notation vs scientific notation (ie5)

##2.2 load datebases

#load raw data
folder_path <- partial(here, "00_raw_data","iea_energy")

end_use_price_usd_2021 <- folder_path("end_use_price_usd_2021.csv") %>%
  read_csv()
## Rows: 2227680 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): COUNTRY, Country, PRODUCT, Product, SECTOR, Sector, FLOW, Flow, TI...
## dbl  (1): Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
indicator_economic_2021 <- folder_path("indicator_economic_2021.csv") %>%
  read_csv()
## Rows: 45305 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): LOCATION, Country, FLOW, Flow, TIME, Time, Flag Codes, Flags
## dbl (1): Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_0 <- partial(here, "00_raw_data")

imf_wb_country_groups <- folder_path_0("imf_wb_country_groups.csv") %>%
  read_csv()
## Rows: 2587 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): country_name, country_group, group_type
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_wb <- partial(here, "00_raw_data", "world_bank")

co2_emission_per_capita <- folder_path_wb("co2_emission_per_capita.xls") %>%
  read_excel()

##2.3 Merge and data cleaning

#dataset used to include both quartly and annual data
#to simplify the dataset, only include the annual data
#Is there a easy way to do this?
end_use_price_usd_annual <- end_use_price_usd_2021 %>%
  filter(TIME == c("1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021")) 
## Warning in TIME == c("1978", "1979", "1980", "1981", "1982", "1983", "1984", :
## longer object length is not a multiple of shorter object length
indicator_economic_annual <- indicator_economic_2021 %>%
  filter(TIME == c("1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021")) 
## Warning in TIME == c("1978", "1979", "1980", "1981", "1982", "1983", "1984", :
## longer object length is not a multiple of shorter object length
energy <- merge(end_use_price_usd_annual,indicator_economic_annual, by=c("Country","Time","TIME"))

#delect null or duplicate columns
energy <- subset(energy, select = -c(Flags.x,Flags.y,LOCATION,TIME))

#rename the columns
energy_renamed <- energy %>%
  rename(
    country = Country,
    iso3_country = COUNTRY,
    year = Time,
    product = PRODUCT,
    product_detail = Product,
    sector = SECTOR,
    sector_detail = Sector,
    energy_price_flow = FLOW.x,
    energy_price_flow_detail = Flow.x,
    energy_price_value = Value.x,
    economic_indicator = FLOW.y,
    economic_indicator_detail = Flow.y,
    economic_indicator_value = Value.y
  )

imf_wb_country_groups_renamed <- imf_wb_country_groups %>%
  rename(country = country_name )

energy_wprice_windicator_wgroup <- merge(energy_renamed, imf_wb_country_groups_renamed, by = c("country"))

#view(energy_wprice_windicator_wgroup)
#unique(energy_wprice_windicator_wgroup$sector_detail)
#unique(energy_wprice_windicator_wgroup$country_group)

#view(co2_emission_per_capita)
co2_emission_per_capita_edited <- co2_emission_per_capita %>%
  #pivot longer to match previous data
  pivot_longer(cols = 5:66, names_to = "year", values_to = "co2_per_capital") %>%
  #rename columns
  rename(
    country = `Country Name`,
    iso3_country = `Country Code`,
    co2_emission_indicator = `Indicator Code`,
    co2_emission_indicator_detail = `Indicator Name`
  ) %>%
  #delete years not included in other datasets
  filter(year >= "1978")

#view(co2_emission_per_capita_edited)
  
energy_co2 <- merge(energy_wprice_windicator_wgroup, co2_emission_per_capita_edited, by = c("country", "year","iso3_country"))

#view(energy_co2)

# delete null explanations 
energy_co2 <- subset(energy_co2, select = -c(`Flag Codes.x`,`Flag Codes.y`))

#view(energy_co2)

#unique(energy_co2$product_detail)
#unique(energy_co2$product_detail)

##2.3 Final Dataset Output

write_csv(energy_co2, here("02_data_final", "energy_co2.csv"))

I export the dataset so that I can work direcly on it afterwards.

#3 What I have right now? ##3.1 country group

unique(energy_co2$country_group)
##  [1] "High income"                                      
##  [2] "East Asia & Pacific"                              
##  [3] "Post-demographic dividend"                        
##  [4] "G20"                                              
##  [5] "Advanced G20"                                     
##  [6] "OECD members"                                     
##  [7] "World"                                            
##  [8] "Advanced Economies"                               
##  [9] "Europe & Central Asia"                            
## [10] "European Union"                                   
## [11] "Euro area"                                        
## [12] "Euro Area"                                        
## [13] "Oil producers"                                    
## [14] "North America"                                    
## [15] "G7"                                               
## [16] "IDA & IBRD total"                                 
## [17] "IBRD only"                                        
## [18] "Latin America & Caribbean"                        
## [19] "Late-demographic dividend"                        
## [20] "Latin America & Caribbean (IDA & IBRD)"           
## [21] "Emerging Market Economies"                        
## [22] "Emerging Market and Middle-Income Latin America"  
## [23] "Middle income"                                    
## [24] "Latin America & Caribbean (excluding high income)"
## [25] "Low & middle income"                              
## [26] "Upper middle income"                              
## [27] "Small states"                                     
## [28] "Other small states"                               
## [29] "Central Europe and the Baltics"                   
## [30] "Emerging Market and Middle-Income Europe"         
## [31] "Early-demographic dividend"                       
## [32] "Middle East & North Africa"                       
## [33] "Emerging G20"                                     
## [34] "Europe & Central Asia (IDA & IBRD)"

##3.2 time period

unique(energy_co2$year)
##  [1] "1978" "1988" "1989" "1999" "2000" "2010" "2011" "2021" "1987" "1998"
## [11] "2009" "2020" "1986" "1997" "2008" "2019" "1985" "1996" "2007" "2018"
## [21] "1983" "1984" "1994" "1995" "2005" "2006" "2016" "2017" "1982" "1993"
## [31] "2004" "2015" "1981" "1992" "2003" "2014" "1980" "1991" "2002" "2013"
## [41] "1979" "1990" "2001" "2012"

It looks like I have plenty of data, but the problem is there exists many NULL values, which makes me worried.

##3.3 energy product

unique(energy_co2$product_detail)
##  [1] "High sulphur fuel oil (tonne)"     "Light fuel oil (1000 litres)"     
##  [3] "Premium unleaded 98 RON (litre)"   "Regular unleaded gasoline (litre)"
##  [5] "Coking coal (tonne)"               "Premium leaded gasoline (litre)"  
##  [7] "Natural gas (MWh)"                 "Steam coal (tonne)"               
##  [9] "Regular leaded gasoline (litre)"   "Automotive diesel (litre)"        
## [11] "Electricity (MWh)"                 "Liquefied petroleum gas (litre)"  
## [13] "Low sulphur fuel oil (tonne)"      "Premium unleaded 95 RON (litre)"

##3.3 Energy product cost by sector

unique(energy_co2$sector_detail)
## [1] "Industry"               "Households"             "Electricity generation"

##3.4 Elements Influencing Energy Price

unique(energy_co2$economic_indicator_detail)
## [1] "Consumer Price Index"      "U.S. dollar exchange rate"
## [3] "Purchasing Power Parities" "Producer Price Index"     
## [5] "CPI Energy"
unique(energy_co2$co2_emission_indicator_detail)
## [1] "CO2 emissions (metric tons per capita)"