#1. Overlook The goal of this R Markdown notebook is to record the data cleaning process for the project analyzing energy price, co2 emission, and policy efficiency.
##1.1 Framework ###1.1.1 Locate and load dataset Country group (for categorical analysis) End-use energy price in USD: This data focuses on advanced economy. Economic indicator (CPI, PPP) to control. Greenhouse Gas Emission ###1.1.2 Merge and clean See code. ###1.1.3 Ouput This dataset would exclude the seasonal elements, while the time series analysis would be based on the initial end-use price database.
##1.2 Characteristics ###1.2.1 Country & Year ###1.2.2 Energy Product ###1.2.3 Emissions
#2. Code ##2.1 load packages
#Load our packages (haven't got the chance to use all of them in this note)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidyquant)
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## Attaching package: 'xts'
##
## The following objects are masked from 'package:dplyr':
##
## first, last
##
##
## Attaching package: 'PerformanceAnalytics'
##
## The following object is masked from 'package:graphics':
##
## legend
##
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(lubridate)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(esquisse)
library(here)
## here() starts at /Users/zhayiran/Library/CloudStorage/OneDrive-JohnsHopkins/Sustainable Finance/final_data
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(ggthemes)
library(ggrepel)
library(gt)
library(countrycode)
library(wbstats)
library(readxl)
options(scipen=10) # forces regular notation vs scientific notation (ie5)
##2.2 load datebases
#load raw data
folder_path <- partial(here, "00_raw_data","iea_energy")
end_use_price_usd_2021 <- folder_path("end_use_price_usd_2021.csv") %>%
read_csv()
## Rows: 2227680 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): COUNTRY, Country, PRODUCT, Product, SECTOR, Sector, FLOW, Flow, TI...
## dbl (1): Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
indicator_economic_2021 <- folder_path("indicator_economic_2021.csv") %>%
read_csv()
## Rows: 45305 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): LOCATION, Country, FLOW, Flow, TIME, Time, Flag Codes, Flags
## dbl (1): Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_0 <- partial(here, "00_raw_data")
imf_wb_country_groups <- folder_path_0("imf_wb_country_groups.csv") %>%
read_csv()
## Rows: 2587 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): country_name, country_group, group_type
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_wb <- partial(here, "00_raw_data", "world_bank")
co2_emission_per_capita <- folder_path_wb("co2_emission_per_capita.xls") %>%
read_excel()
##2.3 Merge and data cleaning
#dataset used to include both quartly and annual data
#to simplify the dataset, only include the annual data
#Is there a easy way to do this?
end_use_price_usd_annual <- end_use_price_usd_2021 %>%
filter(TIME == c("1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021"))
## Warning in TIME == c("1978", "1979", "1980", "1981", "1982", "1983", "1984", :
## longer object length is not a multiple of shorter object length
indicator_economic_annual <- indicator_economic_2021 %>%
filter(TIME == c("1978","1979","1980","1981","1982","1983","1984","1985","1986","1987","1988","1989","1990","1991","1992","1993","1994","1995","1996","1997","1998","1999","2000","2001","2002","2003","2004","2005","2006","2007","2008","2009","2010","2011","2012","2013","2014","2015","2016","2017","2018","2019","2020","2021"))
## Warning in TIME == c("1978", "1979", "1980", "1981", "1982", "1983", "1984", :
## longer object length is not a multiple of shorter object length
energy <- merge(end_use_price_usd_annual,indicator_economic_annual, by=c("Country","Time","TIME"))
#delect null or duplicate columns
energy <- subset(energy, select = -c(Flags.x,Flags.y,LOCATION,TIME))
#rename the columns
energy_renamed <- energy %>%
rename(
country = Country,
iso3_country = COUNTRY,
year = Time,
product = PRODUCT,
product_detail = Product,
sector = SECTOR,
sector_detail = Sector,
energy_price_flow = FLOW.x,
energy_price_flow_detail = Flow.x,
energy_price_value = Value.x,
economic_indicator = FLOW.y,
economic_indicator_detail = Flow.y,
economic_indicator_value = Value.y
)
imf_wb_country_groups_renamed <- imf_wb_country_groups %>%
rename(country = country_name )
energy_wprice_windicator_wgroup <- merge(energy_renamed, imf_wb_country_groups_renamed, by = c("country"))
#view(energy_wprice_windicator_wgroup)
#unique(energy_wprice_windicator_wgroup$sector_detail)
#unique(energy_wprice_windicator_wgroup$country_group)
#view(co2_emission_per_capita)
co2_emission_per_capita_edited <- co2_emission_per_capita %>%
#pivot longer to match previous data
pivot_longer(cols = 5:66, names_to = "year", values_to = "co2_per_capital") %>%
#rename columns
rename(
country = `Country Name`,
iso3_country = `Country Code`,
co2_emission_indicator = `Indicator Code`,
co2_emission_indicator_detail = `Indicator Name`
) %>%
#delete years not included in other datasets
filter(year >= "1978")
#view(co2_emission_per_capita_edited)
energy_co2 <- merge(energy_wprice_windicator_wgroup, co2_emission_per_capita_edited, by = c("country", "year","iso3_country"))
#view(energy_co2)
# delete null explanations
energy_co2 <- subset(energy_co2, select = -c(`Flag Codes.x`,`Flag Codes.y`))
#view(energy_co2)
#unique(energy_co2$product_detail)
#unique(energy_co2$product_detail)
##2.3 Final Dataset Output
write_csv(energy_co2, here("02_data_final", "energy_co2.csv"))
I export the dataset so that I can work direcly on it afterwards.
#3 What I have right now? ##3.1 country group
unique(energy_co2$country_group)
## [1] "High income"
## [2] "East Asia & Pacific"
## [3] "Post-demographic dividend"
## [4] "G20"
## [5] "Advanced G20"
## [6] "OECD members"
## [7] "World"
## [8] "Advanced Economies"
## [9] "Europe & Central Asia"
## [10] "European Union"
## [11] "Euro area"
## [12] "Euro Area"
## [13] "Oil producers"
## [14] "North America"
## [15] "G7"
## [16] "IDA & IBRD total"
## [17] "IBRD only"
## [18] "Latin America & Caribbean"
## [19] "Late-demographic dividend"
## [20] "Latin America & Caribbean (IDA & IBRD)"
## [21] "Emerging Market Economies"
## [22] "Emerging Market and Middle-Income Latin America"
## [23] "Middle income"
## [24] "Latin America & Caribbean (excluding high income)"
## [25] "Low & middle income"
## [26] "Upper middle income"
## [27] "Small states"
## [28] "Other small states"
## [29] "Central Europe and the Baltics"
## [30] "Emerging Market and Middle-Income Europe"
## [31] "Early-demographic dividend"
## [32] "Middle East & North Africa"
## [33] "Emerging G20"
## [34] "Europe & Central Asia (IDA & IBRD)"
##3.2 time period
unique(energy_co2$year)
## [1] "1978" "1988" "1989" "1999" "2000" "2010" "2011" "2021" "1987" "1998"
## [11] "2009" "2020" "1986" "1997" "2008" "2019" "1985" "1996" "2007" "2018"
## [21] "1983" "1984" "1994" "1995" "2005" "2006" "2016" "2017" "1982" "1993"
## [31] "2004" "2015" "1981" "1992" "2003" "2014" "1980" "1991" "2002" "2013"
## [41] "1979" "1990" "2001" "2012"
It looks like I have plenty of data, but the problem is there exists many NULL values, which makes me worried.
##3.3 energy product
unique(energy_co2$product_detail)
## [1] "High sulphur fuel oil (tonne)" "Light fuel oil (1000 litres)"
## [3] "Premium unleaded 98 RON (litre)" "Regular unleaded gasoline (litre)"
## [5] "Coking coal (tonne)" "Premium leaded gasoline (litre)"
## [7] "Natural gas (MWh)" "Steam coal (tonne)"
## [9] "Regular leaded gasoline (litre)" "Automotive diesel (litre)"
## [11] "Electricity (MWh)" "Liquefied petroleum gas (litre)"
## [13] "Low sulphur fuel oil (tonne)" "Premium unleaded 95 RON (litre)"
##3.3 Energy product cost by sector
unique(energy_co2$sector_detail)
## [1] "Industry" "Households" "Electricity generation"
##3.4 Elements Influencing Energy Price
unique(energy_co2$economic_indicator_detail)
## [1] "Consumer Price Index" "U.S. dollar exchange rate"
## [3] "Purchasing Power Parities" "Producer Price Index"
## [5] "CPI Energy"
unique(energy_co2$co2_emission_indicator_detail)
## [1] "CO2 emissions (metric tons per capita)"