SOA Research
SOA Research
Initial Settings
# Package Loading========================
library(tidyverse)
library(ggthemr)
library(ggthemr)
library(ggcorrplot)
ggthemr("fresh", type = "outer", layout = "scientific", spacing = 2)
library(scales)
library(ggridges)
library(imputeTS)
# Data Loading===========================
# Read the four raw input tables.
biocpacity <- read_csv("Biocapacity by Source.csv")
co2emission <- read_csv("CO2e Emissions by SectorSource.csv")
company <- read_csv("CompanyData.csv")
ecofootprint <- read_csv("Ecological Footprint by Source.csv")

# The emissions file also carries country-level indicator columns. Copy them,
# together with Year, into their own table ...
country_data <- select(
  co2emission,
  Year,
  population,
  gdp,
  land_area,
  forested_land,
  percent_population,
  energy_use,
  renewable_energy
)

# ... and drop those same indicator columns from the emissions table.
co2emission <- select(
  co2emission,
  -c(
    population,
    gdp,
    land_area,
    forested_land,
    percent_population,
    energy_use,
    renewable_energy
  )
)
General Assumptions
No commission charges or extra fees during the carbon credits transaction
Data Manipulation
# Merge in columns =======================================
# Reshape both wide tables to long form: every listed measure column is
# stacked, with the former column name stored in `Type` and the number in the
# default `value` column.
biocpacity <- pivot_longer(
  biocpacity,
  cols = c(
    "Built_up_Land",
    "Cropland",
    "Fishing_Grounds",
    "Forest_Products",
    "Grazing_Land",
    "Biocapacity_Total"
  ),
  names_to = "Type"
)
co2emission <- pivot_longer(
  co2emission,
  cols = c("B", "E", "I", "O", "T", "W", "Emission_Total"),
  names_to = "Type"
)
## Hold on to the rest: ecofootprint, company
Insights through Visualization
General emission trend
# Have a look ============================================= DATASET: co2emission
# General emission trend
# Line-and-point chart of the yearly total CO2e emission. Dashed orange
# vertical lines mark 2001 and 2011.
co2emission %>%
  filter(Type == "Emission_Total") %>%
  ggplot(aes(x = Year, y = value)) +
  geom_point(aes(size = log(value) * 100), alpha = 0.5) +
  geom_line(size = 1.5) +
  geom_vline(
    xintercept = c(2001, 2011),
    linetype = 2,
    color = "orange",
    size = 1.2
  ) +
  scale_x_continuous(breaks = c(1999, 2005, 2011, 2015, 2019)) +
  labs(
    title = "CO2e Emission Trend",
    subtitle = "emission of co2e started dropping since 2011",
    x = "Year",
    y = "Co2e Emission"
  ) +
  theme(panel.grid.major.x = element_blank(), legend.position = "none")
Trend in different type
## Trend in different type
# One bar-chart panel per emission type (the total is excluded), with bars
# coloured by their own value.
co2emission %>%
  filter(Type != "Emission_Total") %>%
  ggplot(aes(x = Year, y = value, fill = value)) +
  geom_col() +
  facet_wrap(~Type) +
  labs(
    title = "Emission of Sectors",
    subtitle = "Max Emission Sector: Energy, Manufacturing and Construction "
  ) +
  theme(legend.title = element_blank())
Biocpacity
# DATASET: biocpacity
# Stacked bars of each biocapacity type per year (the total row is excluded);
# the total biocapacity did not change much.
biocpacity %>%
  filter(Type != "Biocapacity_Total") %>%
  ggplot(aes(x = Year, y = value, fill = Type)) +
  geom_col() +
  labs(
    title = "Biocapacity Contribution Trend",
    subtitle = "The total emission capacity and the proportion had little alteration",
    x = "Year",
    y = "Biocapacity Contribution"
  )
Tricky Companies
About the data
See the proportion of missing values.
# One logical vector per year column: TRUE where the recorded emission is
# exactly 0.
zero_flags <- lapply(
  company[c("2019", "2018", "2017", "2016", "2015")],
  function(emission) emission == 0
)
# all_zeros: every year is 0; have_zero: at least one year is 0. NA entries
# follow R's three-valued logic and are left out of the shares below through
# na.rm = TRUE.
all_zeros <- Reduce(`&`, zero_flags)
have_zero <- Reduce(`|`, zero_flags)
print(paste0(
  "There are ",
  round(mean(all_zeros, na.rm = TRUE), 4) * 100,
  "% of company that have no data"
))
# Report the second share as well; have_zero was computed but never printed,
# although its output is shown below.
print(paste0(
  round(mean(have_zero, na.rm = TRUE), 4) * 100,
  "% of company have at least one 0"
))
[1] "There are 15.95% of company that have no data"
[1] "50.7% of company have at least one 0"
# Drop companies whose five yearly emissions are all recorded as 0 (no data).
# The row mask needs the elementwise `&`: the scalar `&&` looks at a single
# value only (and is an error for longer vectors since R 4.3), so it cannot
# express a per-row condition.
company <- company %>%
  filter(
    !(`2019` == 0 & `2018` == 0 & `2017` == 0 & `2016` == 0 & `2015` == 0)
  ) %>%
  # Remaining zeros in the year columns are treated as missing values.
  # na_if() works on vectors, so it is applied to each year column in turn.
  mutate(across(`2019`:`2015`, ~ na_if(.x, 0)))
# Share of missing values in each column.
round(colMeans(is.na(company)), 2)
Company ID Sector Location 2019 2018 2017 2016
0.00 0.00 0.00 0.24 0.30 0.28 0.27
2015
0.28
Besides, some companies are listed under multiple sectors. We simply removed them, since there are only a few such observations.
Var1 Freq
1 5.48531e+12 6
2 2.45743e+13 4
3 1.1289e+13 2
4 2.02059e+13 2
5 2.73896e+13 2
6 3.66985e+13 2
7 8.7252e+13 2
8 8.82264e+13 2
9 1.26735e+11 1
10 1.50619e+11 1
There are 1049 locations in total.
Impute the missing values
- linear interpolation
# Impute missing yearly emissions per company by linear interpolation and
# build `new_company`, a long table with Company ID, Sector, Location,
# Company_Total, Year and value.
#
# Step 1: keep only the year columns (2019..2015) of companies whose ID is not
# in the excluded list. The excluded IDs are the ones listed with Freq > 1 in
# the frequency table above.
temp_df <- company %>% filter(!(`Company ID` %in% c(5.48531e+12, 2.45743e+13, 1.1289e+13,
2.02059e+13, 2.73896e+13, 3.66985e+13, 8.7252e+13, 8.82264e+13))) %>% select(`2019`:`2015`)
# Step 2: label the rows with the Company ID of the same filtered set.
# NOTE(review): `company` comes from read_csv(), so temp_df is presumably a
# tibble; assigning row names to a tibble is deprecated -- confirm.
rownames(temp_df) <- company %>% filter(!(`Company ID` %in% c(5.48531e+12, 2.45743e+13,
1.1289e+13, 2.02059e+13, 2.73896e+13, 3.66985e+13, 8.7252e+13, 8.82264e+13))) %>%
pull(`Company ID`)
# Step 3: transpose via gather/spread, giving one row per year (column `var`)
# and one column per company.
d <- temp_df %>% rownames_to_column %>% gather(var, value, -rowname) %>% spread(rowname,
value)
# Step 4: run na_interpolation(option = "linear") on every column except the
# first (`var`), then put the `var` column back in front.
d_noNA <- d %>% select(var) %>% cbind(na_interpolation(d[-1], option = "linear"))
# Use the year labels as row names so the second transpose can recover them.
rownames(d_noNA) <- d_noNA$var
# Step 5: transpose back to one row per company, convert the ID to numeric,
# re-attach the first three columns of `company` (Company ID, Sector,
# Location), convert the year columns to numeric, drop rows containing any NA,
# add Company_Total as the sum of the five years (the year columns are renamed
# to year_* for the sum and renamed back afterwards), and finally reshape the
# year columns into long form (`Year`, `value`).
# NOTE(review): d_noNA still has its own `var` column at this point, so it
# appears to be gathered together with the company columns; that would explain
# why the as.numeric() conversions and na.omit() are needed -- confirm.
new_company <- d_noNA %>% rownames_to_column %>% gather(var, value, -rowname) %>%
spread(rowname, value) %>% mutate(var = as.numeric(var)) %>% rename(`Company ID` = var) %>%
left_join(company[, c(1:3)], by = "Company ID") %>% select(`Company ID`, Sector,
Location, everything()) %>% mutate_at(vars(`2015`, `2016`, `2017`, `2018`, `2019`),
as.numeric) %>% rename(year_2015 = `2015`, year_2016 = `2016`, year_2017 = `2017`,
year_2018 = `2018`, year_2019 = `2019`) %>% na.omit() %>% mutate(Company_Total = year_2015 +
year_2016 + year_2017 + year_2018 + year_2019) %>% rename(`2015` = year_2015,
`2016` = year_2016, `2017` = year_2017, `2018` = year_2018, `2019` = year_2019) %>%
pivot_longer(`2015`:`2019`, names_to = "Year")Insights on company data
Sector
# Density of the log total emission per sector, one observation per company.
# `new_company` is in long form (one row per company and year), so without
# distinct() every company's total would enter the density five times. The
# former group_by() had no effect on the plot and is dropped.
new_company %>%
  distinct(`Company ID`, Sector, Company_Total) %>%
  ggplot() +
  geom_density(
    aes(log(Company_Total), fill = Sector, color = Sector),
    alpha = 0.8
  ) +
  labs(x = "Log(Company Total Emissions)") +
  ggtitle(
    "Distribution of Company Total Emissions",
    "Waste and Transport are the dominant emission sources"
  )
Information about Pullanta
# Correlation between the yearly total emission and the country indicators.
Total_emission <- co2emission %>% filter(Type == "Emission_Total")
# Join the total onto the country table by Year and put Emission right after
# Year.
corr_data <- country_data %>%
  left_join(Total_emission, by = "Year") %>%
  select(-Type) %>%
  rename(Emission = value) %>%
  select(Year, Emission, everything())
# Centre and scale every column except Year once, then reuse the result for
# both the correlation matrix and its p-values.
scaled_vars <- scale(corr_data[-1])
corr <- round(cor(scaled_vars), 1)
p.mat <- cor_pmat(scaled_vars)
ggcorrplot(corr, hc.order = TRUE, type = "lower", p.mat = p.mat)
From the correlation graph, the CO2E emission is correlated with
Land Area
Energy Use
Population
GDP
In particular, the emission of greenhouse gases is highly correlated with the land area. On the other hand, forested land, percent population, and renewable energy consumption don’t have much impact on the emission of CO2e, which is quite unexpected. (We standardized the data by centering and scaling, but since we are only generating the correlation matrix, it doesn’t really matter.) Looking over the dataset, there are huge gaps between renewable energy use and other energy consumption. In other words, renewable energy in Pullanta has developed slowly over these years.
Let’s go down to the factors that most affect greenhouse gas emissions.
GDP & Population
# Year-over-year percentage growth of population and GDP, reshaped to long
# form: `Growth_Rates` names the series and `value` holds the rate. Rows for
# 1995 and 2019 are removed.
GrowthRate <- country_data %>%
  mutate(
    # Population Rate
    Previous_Year = lag(population),
    Change = population - Previous_Year,
    Population_Growth_Rate = Change / Previous_Year * 100,
    # GDP Rate
    Previous_GDP = lag(gdp),
    Change_GDP = gdp - Previous_GDP,
    GDP_Growth_Rate = Change_GDP / Previous_GDP * 100
  ) %>%
  filter(Year != 2019 & Year != 1995) %>%
  pivot_longer(
    c(Population_Growth_Rate, GDP_Growth_Rate),
    names_to = "Growth_Rates"
  )
# Build the trend plot (line plus dashed smoother) for one growth-rate series.
#   rates      long growth-rate table with Year, Growth_Rates and value
#   rate_name  value of `Growth_Rates` selecting the series to draw
#   plot_title title shown above the plot
plot_growth_rate <- function(rates, rate_name, plot_title) {
  rates %>%
    filter(Growth_Rates == rate_name) %>%
    ggplot(aes(x = Year, y = value)) +
    geom_line(size = 1.5) +
    geom_smooth(linetype = 2) +
    scale_x_continuous(breaks = c(1995, 2000, 2005, 2010, 2015, 2018)) +
    labs(y = "Rate(%)") +
    ggtitle(plot_title)
}

GrowthRate_GDP <- plot_growth_rate(
  GrowthRate, "GDP_Growth_Rate", "GDP growth rate"
)
GrowthRate_Pop <- plot_growth_rate(
  GrowthRate, "Population_Growth_Rate", "Population growth rate"
)
# Population on the left, GDP on the right.
gridExtra::grid.arrange(GrowthRate_Pop, GrowthRate_GDP, nrow = 1)
Energy Use
# Year-over-year percentage growth of energy use as bars coloured by the rate,
# with a dashed smoother on top. Rows for 1995 and 2019 are removed.
country_data %>%
  mutate(
    Previous_Year = lag(energy_use),
    Change = energy_use - Previous_Year,
    Energy_Growth_Rate = Change / Previous_Year * 100
  ) %>%
  filter(Year != 2019, Year != 1995) %>%
  ggplot(aes(x = Year, y = Energy_Growth_Rate)) +
  geom_col(aes(fill = Energy_Growth_Rate)) +
  geom_smooth(linetype = 2, alpha = 0.3) +
  labs(title = "Growth Rate of Energy", y = "Rate(%)") +
  theme(legend.position = "top")
Land Area
# Year-over-year percentage growth of land area as bars coloured by the rate,
# with a dashed smoother on top. Rows for 1995 and 2019 are removed.
country_data %>%
  mutate(
    Previous_Year = lag(land_area),
    Change = land_area - Previous_Year,
    Land_Growth_Rate = Change / Previous_Year * 100
  ) %>%
  filter(Year != 2019, Year != 1995) %>%
  ggplot(aes(x = Year, y = Land_Growth_Rate)) +
  geom_col(aes(fill = Land_Growth_Rate)) +
  geom_smooth(linetype = 2, alpha = 0.3) +
  labs(title = "Growth Rate of Land Use", y = "Rate(%)") +
  theme(legend.position = "top")