#1. Overlook The goal of this R Markdown notebook is to record the data cleaning process for the project analyzing energy price, co2 emission, and policy efficiency.
##1.1 Framework ###1.1.1 Locate and load dataset Country group (for categorical analysis) End-use energy price in USD: This data focuses on advanced economy. Economic indicator (CPI, PPP) to control. Greenhouse Gas Emission ###1.1.2 Merge and clean See code. ###1.1.3 Ouput This dataset would exclude the seasonal elements, while the time series analysis would be based on the initial end-use price database.
#2. Code ##2.1 load packages
##2.2 load datebases
#load raw data
folder_path <- partial(here, "00_raw_data","iea_energy")
end_use_price_usd_2021 <- folder_path("end_use_price_usd_2021.csv") %>%
read_csv()
## Rows: 2227680 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): COUNTRY, Country, PRODUCT, Product, SECTOR, Sector, FLOW, Flow, TI...
## dbl (1): Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
indicator_economic_2021 <- folder_path("indicator_economic_2021.csv") %>%
read_csv()
## Rows: 45305 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): LOCATION, Country, FLOW, Flow, TIME, Time, Flag Codes, Flags
## dbl (1): Value
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_0 <- partial(here, "00_raw_data")
imf_wb_country_groups <- folder_path_0("imf_wb_country_groups.csv") %>%
read_csv()
## Rows: 2587 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): country_name, country_group, group_type
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_wb <- partial(here, "00_raw_data", "world_bank")
co2_emission_per_capita <- folder_path_wb("co2_emission_per_capita.xls") %>%
read_excel()
end_use_price_usd_2021 %>% glimpse()
## Rows: 2,227,680
## Columns: 13
## $ COUNTRY <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "…
## $ Country <chr> "Australia", "Australia", "Australia", "Australia", "Aust…
## $ PRODUCT <chr> "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "…
## $ Product <chr> "High sulphur fuel oil (tonne)", "High sulphur fuel oil (…
## $ SECTOR <chr> "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY…
## $ Sector <chr> "Industry", "Industry", "Industry", "Industry", "Industry…
## $ FLOW <chr> "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT…
## $ Flow <chr> "Total tax (USD/unit)", "Total tax (USD/unit)", "Total ta…
## $ TIME <chr> "1978", "1978-Q1", "1978-Q2", "1978-Q3", "1978-Q4", "1979…
## $ Time <chr> "1978", "Q1-1978", "Q2-1978", "Q3-1978", "Q4-1978", "1979…
## $ Value <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.00000…
## $ `Flag Codes` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Flags <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
energy_na_count <- end_use_price_usd_2021 %>%
subset(select = -c(COUNTRY,PRODUCT,SECTOR, FLOW, TIME, `Flag Codes`, Flags)) %>%
dplyr::filter(!grepl("Q", Time)) %>%
pivot_wider(names_from = Time, values_from = Value)
colSums(is.na(energy_na_count))
## Country Product Sector Flow 1978 1979 1980 1981 1982 1983
## 0 0 0 0 8281 8165 8069 8050 8050 8044
## 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
## 8059 8008 7959 7946 7934 7932 7810 7747 7691 7665
## 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 7611 7605 7632 7578 7629 7669 7570 7604 7521 7569
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
## 7599 7419 7467 7434 7275 7308 7335 7392 7334 7333
## 2014 2015 2016 2017 2018 2019 2020 2021
## 7377 7415 7380 7404 7350 7419 7506 7970
It appears that data performs better after 2000.
unique(end_use_price_usd_2021$Product)
## [1] "High sulphur fuel oil (tonne)" "Low sulphur fuel oil (tonne)"
## [3] "Light fuel oil (1000 litres)" "Automotive diesel (litre)"
## [5] "Premium unleaded 98 RON (litre)" "Premium unleaded 95 RON (litre)"
## [7] "Regular unleaded gasoline (litre)" "Premium leaded gasoline (litre)"
## [9] "Regular leaded gasoline (litre)" "Liquefied petroleum gas (litre)"
## [11] "Natural gas (MWh)" "Steam coal (tonne)"
## [13] "Coking coal (tonne)" "Electricity (MWh)"
glimpse(end_use_price_usd_2021)
## Rows: 2,227,680
## Columns: 13
## $ COUNTRY <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "…
## $ Country <chr> "Australia", "Australia", "Australia", "Australia", "Aust…
## $ PRODUCT <chr> "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "…
## $ Product <chr> "High sulphur fuel oil (tonne)", "High sulphur fuel oil (…
## $ SECTOR <chr> "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY…
## $ Sector <chr> "Industry", "Industry", "Industry", "Industry", "Industry…
## $ FLOW <chr> "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT…
## $ Flow <chr> "Total tax (USD/unit)", "Total tax (USD/unit)", "Total ta…
## $ TIME <chr> "1978", "1978-Q1", "1978-Q2", "1978-Q3", "1978-Q4", "1979…
## $ Time <chr> "1978", "Q1-1978", "Q2-1978", "Q3-1978", "Q4-1978", "1979…
## $ Value <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.00000…
## $ `Flag Codes` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Flags <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
energy_na_count_2 <- energy_na_count %>%
filter(Product == "Natural gas (MWh)")
colSums(is.na(energy_na_count_2))
## Country Product Sector Flow 1978 1979 1980 1981 1982 1983
## 0 0 0 0 452 432 423 426 426 432
## 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
## 426 402 402 402 384 384 375 374 366 363
## 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 363 369 372 363 387 399 402 422 396 390
## 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
## 378 378 396 366 294 288 288 288 282 285
## 2014 2015 2016 2017 2018 2019 2020 2021
## 288 297 288 288 276 282 306 467
Data between 2008 to 2019 appears to be better, while missing data is common every year.
imf_wb_country_groups %>%
filter(group_type == "IMF") %>%
count(country_group, sort = TRUE)
## # A tibble: 19 × 2
## country_group n
## <chr> <int>
## 1 Emerging Market Economies 96
## 2 Low-Income Developing Countries 59
## 3 Advanced Economies 39
## 4 Low-Income Developing Sub-Saharan Africa 35
## 5 Emerging Market and Middle-Income Latin America 30
## 6 Oil producers 27
## 7 Emerging Market and Middle-Income Asia 19
## 8 Euro Area 19
## 9 G20 19
## 10 Emerging Market and Middle-Income Europe 17
## 11 Emerging Market and Middle-Income Middle East North Africa and Pakistan 17
## 12 Low-Income Developing Asia 11
## 13 Emerging G20 10
## 14 Low-Income Developing Others 10
## 15 Advanced G20 9
## 16 G7 7
## 17 Low-Income Oil Producers 5
## 18 Low-Income Developing Latin America 3
## 19 Emerging Market and Middle-Income Africa 2
unique(co2_emission_per_capita$`Indicator Name`)
## [1] "CO2 emissions (metric tons per capita)"
unique(co2_emission_per_capita$`Indicator Code`)
## [1] "EN.ATM.CO2E.PC"
Since Indicator Name and Indicator Code both contain only one variable, I will delete them later.
#count NAs
colSums(is.na(co2_emission_per_capita))
## Country Name Country Code Indicator Name Indicator Code 1960
## 0 0 0 0 266
## 1961 1962 1963 1964 1965
## 266 266 266 266 266
## 1966 1967 1968 1969 1970
## 266 266 266 266 266
## 1971 1972 1973 1974 1975
## 266 266 266 266 266
## 1976 1977 1978 1979 1980
## 266 266 266 266 266
## 1981 1982 1983 1984 1985
## 266 266 266 266 266
## 1986 1987 1988 1989 1990
## 266 266 266 266 33
## 1991 1992 1993 1994 1995
## 32 29 29 29 28
## 1996 1997 1998 1999 2000
## 28 28 29 29 28
## 2001 2002 2003 2004 2005
## 28 27 27 27 27
## 2006 2007 2008 2009 2010
## 27 27 27 27 27
## 2011 2012 2013 2014 2015
## 27 27 27 27 27
## 2016 2017 2018 2019 2020
## 27 27 27 27 266
## 2021
## 266
It indicates that data before 1990 is imcomplete, so I won’t analysis this party of the dataset.
##2.3 Merge and data cleaning
#1. energy price
end_use_price_usd_annual <- end_use_price_usd_2021 %>%
#delete quartly data
dplyr::filter(!grepl("Q", TIME)) %>%
#delete duplicate column
subset(select = -c(Flags,TIME,`Flag Codes`, PRODUCT,FLOW,SECTOR)) %>%
#rename unclear columns
rename(
country = Country,
iso3_country = COUNTRY,
product_detail = Product,
sector_detail = Sector,
energy_price_flow_detail = Flow,
energy_price_value = Value,
year = Time
) %>%
janitor::clean_names()
#2. economic indicator
indicator_economic_annual <- indicator_economic_2021 %>%
#delete quartly data
dplyr::filter(!grepl("Q", TIME)) %>%
#delete duplicate column
subset(select = -c(Flags,TIME,`Flag Codes`,LOCATION,FLOW)) %>%
#rename unclear columns
rename(
country = Country,
economic_indicator_flow_detail = Flow,
economic_indicator_value = Value,
year = Time
) %>%
janitor::clean_names() %>%
#frame the dataset
pivot_wider(
names_from = economic_indicator_flow_detail,
values_from = economic_indicator_value)
#merge the energy price and economic indicator database
energy <- merge(end_use_price_usd_annual,indicator_economic_annual,
by=c("country","year"))
#3. country group info
imf_wb_country_groups$equal_one <- 1
imf_wb_country_groups_renamed <- imf_wb_country_groups %>%
filter(group_type == "IMF") %>%
dplyr::distinct() %>%
#rename
rename(country = country_name ) %>%
#frame
pivot_wider(
names_from = country_group,
values_from = equal_one,
values_fill = 0)
energy_wprice_windicator_wgroup <- merge(energy, imf_wb_country_groups_renamed, by = c("country"))
#glimpse(energy_wprice_windicator_wgroup)
#4. CO2 emission data (lack post-pandemic data)
co2_emission_per_capita_edited <- co2_emission_per_capita %>%
#pivot longer to match previous data
pivot_longer(cols = 5:66, names_to = "year", values_to = "co2_per_capital") %>%
subset(select = -c(`Indicator Name`,`Indicator Code`)) %>%
#rename columns
rename(
country = `Country Name`,
iso3_country = `Country Code`
) %>%
#delete years not included in other datasets
filter(year >= "1978")
#view(co2_emission_per_capita_edited)
energy_co2_2 <- merge(energy_wprice_windicator_wgroup, co2_emission_per_capita_edited, by = c("country", "year","iso3_country"))
glimpse(energy_co2_2)
## Rows: 365,904
## Columns: 33
## $ country <chr> …
## $ year <chr> …
## $ iso3_country <chr> …
## $ product_detail <chr> …
## $ sector_detail <chr> …
## $ energy_price_flow_detail <chr> …
## $ energy_price_value <dbl> …
## $ `Consumer Price Index` <dbl> …
## $ `CPI Energy` <dbl> …
## $ `Producer Price Index` <dbl> …
## $ `Purchasing Power Parities` <dbl> …
## $ `U.S. dollar exchange rate` <dbl> …
## $ group_type <chr> …
## $ `Advanced Economies` <dbl> …
## $ `Advanced G20` <dbl> …
## $ `Emerging G20` <dbl> …
## $ `Emerging Market and Middle-Income Africa` <dbl> …
## $ `Emerging Market and Middle-Income Asia` <dbl> …
## $ `Emerging Market and Middle-Income Europe` <dbl> …
## $ `Emerging Market and Middle-Income Latin America` <dbl> …
## $ `Emerging Market and Middle-Income Middle East North Africa and Pakistan` <dbl> …
## $ `Emerging Market Economies` <dbl> …
## $ `Euro Area` <dbl> …
## $ G20 <dbl> …
## $ G7 <dbl> …
## $ `Low-Income Developing Asia` <dbl> …
## $ `Low-Income Developing Countries` <dbl> …
## $ `Low-Income Developing Latin America` <dbl> …
## $ `Low-Income Developing Others` <dbl> …
## $ `Low-Income Developing Sub-Saharan Africa` <dbl> …
## $ `Low-Income Oil Producers` <dbl> …
## $ `Oil producers` <dbl> …
## $ co2_per_capital <dbl> …
##2.3 Final Dataset Output
write_csv(energy_co2_2, here("02_data_final", "energy_co2_2.csv"))