#1. Overlook The goal of this R Markdown notebook is to record the data cleaning process for the project analyzing energy price, co2 emission, and policy efficiency.

##1.1 Framework ###1.1.1 Locate and load dataset Country group (for categorical analysis) End-use energy price in USD: This data focuses on advanced economy. Economic indicator (CPI, PPP) to control. Greenhouse Gas Emission ###1.1.2 Merge and clean See code. ###1.1.3 Ouput This dataset would exclude the seasonal elements, while the time series analysis would be based on the initial end-use price database.

#2. Code ##2.1 load packages

##2.2 load datebases

#load raw data
folder_path <- partial(here, "00_raw_data","iea_energy")

end_use_price_usd_2021 <- folder_path("end_use_price_usd_2021.csv") %>%
  read_csv()
## Rows: 2227680 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (12): COUNTRY, Country, PRODUCT, Product, SECTOR, Sector, FLOW, Flow, TI...
## dbl  (1): Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
indicator_economic_2021 <- folder_path("indicator_economic_2021.csv") %>%
  read_csv()
## Rows: 45305 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): LOCATION, Country, FLOW, Flow, TIME, Time, Flag Codes, Flags
## dbl (1): Value
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_0 <- partial(here, "00_raw_data")

imf_wb_country_groups <- folder_path_0("imf_wb_country_groups.csv") %>%
  read_csv()
## Rows: 2587 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): country_name, country_group, group_type
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
folder_path_wb <- partial(here, "00_raw_data", "world_bank")

co2_emission_per_capita <- folder_path_wb("co2_emission_per_capita.xls") %>%
  read_excel()
end_use_price_usd_2021 %>% glimpse()
## Rows: 2,227,680
## Columns: 13
## $ COUNTRY      <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "…
## $ Country      <chr> "Australia", "Australia", "Australia", "Australia", "Aust…
## $ PRODUCT      <chr> "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "…
## $ Product      <chr> "High sulphur fuel oil (tonne)", "High sulphur fuel oil (…
## $ SECTOR       <chr> "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY…
## $ Sector       <chr> "Industry", "Industry", "Industry", "Industry", "Industry…
## $ FLOW         <chr> "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT…
## $ Flow         <chr> "Total tax (USD/unit)", "Total tax (USD/unit)", "Total ta…
## $ TIME         <chr> "1978", "1978-Q1", "1978-Q2", "1978-Q3", "1978-Q4", "1979…
## $ Time         <chr> "1978", "Q1-1978", "Q2-1978", "Q3-1978", "Q4-1978", "1979…
## $ Value        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.00000…
## $ `Flag Codes` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Flags        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
energy_na_count <- end_use_price_usd_2021 %>%
  subset(select = -c(COUNTRY,PRODUCT,SECTOR, FLOW, TIME,  `Flag Codes`, Flags)) %>%
  dplyr::filter(!grepl("Q", Time)) %>%
  pivot_wider(names_from = Time, values_from = Value) 
  
colSums(is.na(energy_na_count))
## Country Product  Sector    Flow    1978    1979    1980    1981    1982    1983 
##       0       0       0       0    8281    8165    8069    8050    8050    8044 
##    1984    1985    1986    1987    1988    1989    1990    1991    1992    1993 
##    8059    8008    7959    7946    7934    7932    7810    7747    7691    7665 
##    1994    1995    1996    1997    1998    1999    2000    2001    2002    2003 
##    7611    7605    7632    7578    7629    7669    7570    7604    7521    7569 
##    2004    2005    2006    2007    2008    2009    2010    2011    2012    2013 
##    7599    7419    7467    7434    7275    7308    7335    7392    7334    7333 
##    2014    2015    2016    2017    2018    2019    2020    2021 
##    7377    7415    7380    7404    7350    7419    7506    7970

It appears that data performs better after 2000.

unique(end_use_price_usd_2021$Product)
##  [1] "High sulphur fuel oil (tonne)"     "Low sulphur fuel oil (tonne)"     
##  [3] "Light fuel oil (1000 litres)"      "Automotive diesel (litre)"        
##  [5] "Premium unleaded 98 RON (litre)"   "Premium unleaded 95 RON (litre)"  
##  [7] "Regular unleaded gasoline (litre)" "Premium leaded gasoline (litre)"  
##  [9] "Regular leaded gasoline (litre)"   "Liquefied petroleum gas (litre)"  
## [11] "Natural gas (MWh)"                 "Steam coal (tonne)"               
## [13] "Coking coal (tonne)"               "Electricity (MWh)"
glimpse(end_use_price_usd_2021)
## Rows: 2,227,680
## Columns: 13
## $ COUNTRY      <chr> "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "AUS", "…
## $ Country      <chr> "Australia", "Australia", "Australia", "Australia", "Aust…
## $ PRODUCT      <chr> "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "HSFO", "…
## $ Product      <chr> "High sulphur fuel oil (tonne)", "High sulphur fuel oil (…
## $ SECTOR       <chr> "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY", "INDUSTRY…
## $ Sector       <chr> "Industry", "Industry", "Industry", "Industry", "Industry…
## $ FLOW         <chr> "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT", "USDTAX_UNIT…
## $ Flow         <chr> "Total tax (USD/unit)", "Total tax (USD/unit)", "Total ta…
## $ TIME         <chr> "1978", "1978-Q1", "1978-Q2", "1978-Q3", "1978-Q4", "1979…
## $ Time         <chr> "1978", "Q1-1978", "Q2-1978", "Q3-1978", "Q4-1978", "1979…
## $ Value        <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.00000…
## $ `Flag Codes` <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ Flags        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
energy_na_count_2 <- energy_na_count %>%
  filter(Product == "Natural gas (MWh)")
  
colSums(is.na(energy_na_count_2))
## Country Product  Sector    Flow    1978    1979    1980    1981    1982    1983 
##       0       0       0       0     452     432     423     426     426     432 
##    1984    1985    1986    1987    1988    1989    1990    1991    1992    1993 
##     426     402     402     402     384     384     375     374     366     363 
##    1994    1995    1996    1997    1998    1999    2000    2001    2002    2003 
##     363     369     372     363     387     399     402     422     396     390 
##    2004    2005    2006    2007    2008    2009    2010    2011    2012    2013 
##     378     378     396     366     294     288     288     288     282     285 
##    2014    2015    2016    2017    2018    2019    2020    2021 
##     288     297     288     288     276     282     306     467

Data between 2008 to 2019 appears to be better, while missing data is common every year.

imf_wb_country_groups %>%
  filter(group_type == "IMF") %>%
  count(country_group, sort = TRUE) 
## # A tibble: 19 × 2
##    country_group                                                               n
##    <chr>                                                                   <int>
##  1 Emerging Market Economies                                                  96
##  2 Low-Income Developing Countries                                            59
##  3 Advanced Economies                                                         39
##  4 Low-Income Developing Sub-Saharan Africa                                   35
##  5 Emerging Market and Middle-Income Latin America                            30
##  6 Oil producers                                                              27
##  7 Emerging Market and Middle-Income Asia                                     19
##  8 Euro Area                                                                  19
##  9 G20                                                                        19
## 10 Emerging Market and Middle-Income Europe                                   17
## 11 Emerging Market and Middle-Income Middle East North Africa and Pakistan    17
## 12 Low-Income Developing Asia                                                 11
## 13 Emerging G20                                                               10
## 14 Low-Income Developing Others                                               10
## 15 Advanced G20                                                                9
## 16 G7                                                                          7
## 17 Low-Income Oil Producers                                                    5
## 18 Low-Income Developing Latin America                                         3
## 19 Emerging Market and Middle-Income Africa                                    2
unique(co2_emission_per_capita$`Indicator Name`)
## [1] "CO2 emissions (metric tons per capita)"
unique(co2_emission_per_capita$`Indicator Code`)
## [1] "EN.ATM.CO2E.PC"

Since Indicator Name and Indicator Code both contain only one variable, I will delete them later.

#count NAs
colSums(is.na(co2_emission_per_capita))
##   Country Name   Country Code Indicator Name Indicator Code           1960 
##              0              0              0              0            266 
##           1961           1962           1963           1964           1965 
##            266            266            266            266            266 
##           1966           1967           1968           1969           1970 
##            266            266            266            266            266 
##           1971           1972           1973           1974           1975 
##            266            266            266            266            266 
##           1976           1977           1978           1979           1980 
##            266            266            266            266            266 
##           1981           1982           1983           1984           1985 
##            266            266            266            266            266 
##           1986           1987           1988           1989           1990 
##            266            266            266            266             33 
##           1991           1992           1993           1994           1995 
##             32             29             29             29             28 
##           1996           1997           1998           1999           2000 
##             28             28             29             29             28 
##           2001           2002           2003           2004           2005 
##             28             27             27             27             27 
##           2006           2007           2008           2009           2010 
##             27             27             27             27             27 
##           2011           2012           2013           2014           2015 
##             27             27             27             27             27 
##           2016           2017           2018           2019           2020 
##             27             27             27             27            266 
##           2021 
##            266

It indicates that data before 1990 is imcomplete, so I won’t analysis this party of the dataset.

##2.3 Merge and data cleaning

#1. energy price
end_use_price_usd_annual <- end_use_price_usd_2021 %>%
  #delete quartly data
  dplyr::filter(!grepl("Q", TIME)) %>%
  #delete duplicate column
  subset(select = -c(Flags,TIME,`Flag Codes`, PRODUCT,FLOW,SECTOR)) %>%
  #rename unclear columns
  rename(
    country = Country,
    iso3_country = COUNTRY,
    product_detail = Product,
    sector_detail = Sector,
    energy_price_flow_detail = Flow,
    energy_price_value = Value,
    year = Time
  ) %>%
  janitor::clean_names()  

#2. economic indicator
indicator_economic_annual <- indicator_economic_2021 %>%
  #delete quartly data
  dplyr::filter(!grepl("Q", TIME)) %>%
  #delete duplicate column
  subset(select = -c(Flags,TIME,`Flag Codes`,LOCATION,FLOW)) %>%
  #rename unclear columns
  rename(
    country = Country,
    economic_indicator_flow_detail = Flow,
    economic_indicator_value = Value,
    year = Time
  ) %>%
  janitor::clean_names() %>%
  #frame the dataset
  pivot_wider(
    names_from = economic_indicator_flow_detail, 
    values_from = economic_indicator_value) 

#merge the energy price and economic indicator database
energy <- merge(end_use_price_usd_annual,indicator_economic_annual, 
                by=c("country","year"))

#3. country group info
imf_wb_country_groups$equal_one <- 1
imf_wb_country_groups_renamed <- imf_wb_country_groups %>%
  filter(group_type == "IMF") %>%
  dplyr::distinct() %>%
  #rename
  rename(country = country_name ) %>%
  #frame
  pivot_wider(
    names_from = country_group, 
    values_from = equal_one, 
    values_fill = 0) 
  
energy_wprice_windicator_wgroup <- merge(energy, imf_wb_country_groups_renamed, by = c("country"))

#glimpse(energy_wprice_windicator_wgroup)

#4. CO2 emission data (lack post-pandemic data)
co2_emission_per_capita_edited <- co2_emission_per_capita %>%
  #pivot longer to match previous data
  pivot_longer(cols = 5:66, names_to = "year", values_to = "co2_per_capital") %>%
  subset(select = -c(`Indicator Name`,`Indicator Code`)) %>%
  #rename columns
  rename(
    country = `Country Name`,
    iso3_country = `Country Code`
  ) %>%
  #delete years not included in other datasets
  filter(year >= "1978")

#view(co2_emission_per_capita_edited)
  
energy_co2_2 <- merge(energy_wprice_windicator_wgroup, co2_emission_per_capita_edited, by = c("country", "year","iso3_country"))

glimpse(energy_co2_2)
## Rows: 365,904
## Columns: 33
## $ country                                                                   <chr> …
## $ year                                                                      <chr> …
## $ iso3_country                                                              <chr> …
## $ product_detail                                                            <chr> …
## $ sector_detail                                                             <chr> …
## $ energy_price_flow_detail                                                  <chr> …
## $ energy_price_value                                                        <dbl> …
## $ `Consumer Price Index`                                                    <dbl> …
## $ `CPI Energy`                                                              <dbl> …
## $ `Producer Price Index`                                                    <dbl> …
## $ `Purchasing Power Parities`                                               <dbl> …
## $ `U.S. dollar exchange rate`                                               <dbl> …
## $ group_type                                                                <chr> …
## $ `Advanced Economies`                                                      <dbl> …
## $ `Advanced G20`                                                            <dbl> …
## $ `Emerging G20`                                                            <dbl> …
## $ `Emerging Market and Middle-Income Africa`                                <dbl> …
## $ `Emerging Market and Middle-Income Asia`                                  <dbl> …
## $ `Emerging Market and Middle-Income Europe`                                <dbl> …
## $ `Emerging Market and Middle-Income Latin America`                         <dbl> …
## $ `Emerging Market and Middle-Income Middle East North Africa and Pakistan` <dbl> …
## $ `Emerging Market Economies`                                               <dbl> …
## $ `Euro Area`                                                               <dbl> …
## $ G20                                                                       <dbl> …
## $ G7                                                                        <dbl> …
## $ `Low-Income Developing Asia`                                              <dbl> …
## $ `Low-Income Developing Countries`                                         <dbl> …
## $ `Low-Income Developing Latin America`                                     <dbl> …
## $ `Low-Income Developing Others`                                            <dbl> …
## $ `Low-Income Developing Sub-Saharan Africa`                                <dbl> …
## $ `Low-Income Oil Producers`                                                <dbl> …
## $ `Oil producers`                                                           <dbl> …
## $ co2_per_capital                                                           <dbl> …

##2.3 Final Dataset Output

write_csv(energy_co2_2, here("02_data_final", "energy_co2_2.csv"))