Question Formation + Exploratory Data Analysis

Author

Haotian Duan

Data Preparation

library(readxl)
library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)
library(RColorBrewer)
# data processing
df <- read_excel("~/Final Project Data/00_data_raw/ESG_score.xlsx")
df$stock <- gsub('.*(\\d{6}).*', '\\1', df$stock)
write.csv(df, file = "~/Final Project Data/01_data_cleaning/ESG_score.csv", row.names = FALSE)

stock <- read_excel("~/Final Project Data/00_data_raw/stock.xlsx")
stock$stock <- sprintf("%06d", as.numeric(stock$stock))
write.csv(stock, file = "~/Final Project Data/01_data_cleaning/stock.csv", row.names = FALSE)

df2 <- read.csv('~/Final Project Data/01_data_cleaning/ESG_score.csv')
df1 <- read.csv('~/Final Project Data/01_data_cleaning/stock.csv')
merge <- merge(df1, df2, by = c('stock', 'year'), all.x = TRUE)
merge <- na.omit(merge)

write.csv(merge, file = '~/Final Project Data/01_data_cleaning/stock_esg.csv', row.names = FALSE)

merged_data <- read.csv("~/Final Project Data/01_data_cleaning/stock_esg.csv")

# calculate year return
calculate_annual_return <- function(data) {
  data <- data %>% arrange(year)  # Arrange data by year
  data$year_return <- c(NA, diff(data$close)/data$close[-nrow(data)])  # Calculate annual returns
  return(data)
}
merged_data <- merged_data %>%
  group_by(stock) %>%
  do(calculate_annual_return(.))
merged_data$year_return <- ifelse(is.na(merged_data$year_return), 0, merged_data$year_return)
write.csv(merged_data, file = '~/Final Project Data/01_data_cleaning/stock_esg.csv')

First, I will calculate the quantitative relationship among ESG composite index, E index, S index and G index ratings to observe the ESG scores of China’s A-share listed companies. From the results shown in the figure below, we can see that the ESG index and S (Social) index are mostly rated as B,BB and CCC, while the E (Environmental) index is mainly rated as C. It is worth noting that the Goverance index is mostly ahead of the BBB index. This means that many companies have poor performance in environmental protection and need to strengthen environmental awareness and practices; The deficiencies in social responsibility and social impact require more social input and improvement. While most companies are doing relatively well in terms of governance structure and management, there is still room for improvement.

# hist
library(ggplot2)
library(gridExtra)


Attaching package: 'gridExtra'

The following object is masked from 'package:dplyr':

    combine

plot_rating_histogram <- function(data, rating_col, title) {
  ggplot(data, aes(x = !!as.symbol(rating_col), fill = !!as.symbol(rating_col))) +
    geom_bar() +
    labs(title = title, x = rating_col, y = "Frequency") +
    theme_minimal() +
    theme(legend.position = "none")
}
esg_histogram <- plot_rating_histogram(merged_data, "grade", "ESG Composite Rating")
e_histogram <- plot_rating_histogram(merged_data, "E_grade", "Environmental (E) Rating")
s_histogram <- plot_rating_histogram(merged_data, "S_grade", "Social (S) Rating")
g_histogram <- plot_rating_histogram(merged_data, "G_grade", "Governance (G) Rating")
combined_plot <- grid.arrange(esg_histogram, e_histogram, s_histogram, g_histogram, 
                              nrow = 2, ncol = 2, top = "Distribution of ESG Ratings")

print(combined_plot)

TableGrob (3 x 2) "arrange": 5 grobs
  z     cells    name                grob
1 1 (2-2,1-1) arrange      gtable[layout]
2 2 (2-2,2-2) arrange      gtable[layout]
3 3 (3-3,1-1) arrange      gtable[layout]
4 4 (3-3,2-2) arrange      gtable[layout]
5 5 (1-1,1-2) arrange text[GRID.text.136]

In the second step, I drew the correlation coefficient heat map of each numerical indicator to detect whether there is correlation between various variables, so as to avoid the occurrence of regression analysis multicollinearity in the following steps. In the following steps, I will use G_score, S_score,E_score and score as independent variables, and year_return,volume and circulation_value as dependent variables. Looking at the impact of ESG rating index on stock return rate, trading volume and current market value, it can be noticed from the heat map that among the four independent variables mentioned above, score is highly correlated with the other three independent variables. The reason is that score is weighted by the scores of the other three indexes, while the correlation among the other three indexes is low. So it can be used for regression analysis.

# heat map
numeric_vars <- select(merged_data, c(volume,value,circulation_value,total_value,score,E_score, S_score, G_score,year_return))

Adding missing grouping variables: `stock`

correlation_matrix <- cor(numeric_vars)
correlation_df <- as.data.frame(as.table(correlation_matrix))
colnames(correlation_df) <- c("Variable1", "Variable2", "Correlation")
heatmap_plot <- ggplot(correlation_df, aes(Variable1, Variable2, fill = Correlation)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", 
                       midpoint = 0, limits = c(-1, 1), name = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        plot.title = element_text(hjust = 0.5)) +  
  labs(title = "Heatmap of Correlation between Numeric Variables")
print(heatmap_plot)

industry_colors <- c("1" = "blue", "2" = "red", "3" = "green", "4" = "purple", "5" = "orange", "6" = "black")
industry_labels <- c("1" = "Finance", "2" = "Utilities", "3" = "Properties", "4" = "Conglomerates", "5" = "Industrials", "6" = "Commerce")

Then, I aggregate the data according to industry classification, calculate the average rate of return of each industry and the average of each index in each year, and then draw the annual change curve. The results can be seen in the following set of graphs. It can be seen that the difference between the five charts is obvious, not only the trend, but also the performance of different industries in different indexes have different rankings.

Before analyzing the results, I would like to explain the classification of industries. In the table, I used numerical serial numbers to represent each industry for easy classification. 1-6 represent: including Finance, Utilities, Properties, Conglomerates, Industrials, Commerce six major industries.

It can be seen that the yields of the six industries have remained almost consistent in the intermediate process, and by the end of 2021, each industry has different results. In the composite index, Finance and Properties have higher scores, while Conglomerates and Utilities have the lowest overall scores, which has a lot to do with the nature of their work and the environment. From the perspective of each grading index, the E index increases year by year, which means that listed companies pay more and more attention to environmental Governance, while the governance score decreases year by year until 2019, which may be because the score is very high at the beginning, and the stability and improvement of this part is ignored later.

Through the above EDA analysis, we can see some relationships between ESG index and industry, time, and market. In the next part we will do regression analysis using indices and quotations to try to find more meaningful results.

# industry
library(ggplot2)
library(dplyr)
scale_color_manual(values = c("1" = "blue", "2" = "red", "3" = "green", "4" = "purple", "5" = "orange", "6" = "black"),
                   labels = c("1" = "Finance", "2" = "Utilities", "3" = "Properties", "4" = "Conglomerates", "5" = "Industrials", "6" = "Commerce"))

<ggproto object: Class ScaleDiscrete, Scale, gg>
    aesthetics: colour
    axis_order: function
    break_info: function
    break_positions: function
    breaks: waiver
    call: call
    clone: function
    dimension: function
    drop: TRUE
    expand: waiver
    get_breaks: function
    get_breaks_minor: function
    get_labels: function
    get_limits: function
    guide: legend
    is_discrete: function
    is_empty: function
    labels: Finance Utilities Properties Conglomerates Industrials C ...
    limits: function
    make_sec_title: function
    make_title: function
    map: function
    map_df: function
    n.breaks.cache: NULL
    na.translate: TRUE
    na.value: grey50
    name: waiver
    palette: function
    palette.cache: NULL
    position: left
    range: environment
    rescale: function
    reset: function
    scale_name: manual
    train: function
    train_df: function
    transform: function
    transform_df: function
    super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

industry_returns <- merged_data %>%
  group_by(industry, year) %>%
  summarize(mean_return = mean(year_return, na.rm = TRUE))

`summarise()` has grouped output by 'industry'. You can override using the
`.groups` argument.

line_chart <- ggplot(industry_returns, aes(x = year, y = mean_return, color = factor(industry))) +
  geom_line(size = 1.5, alpha = 0.8) + 
  labs(title = "Mean Annual Return by Industry",
       x = "Year", y = "Mean Annual Return",
       color = "Industry") +
  theme_minimal() +
  theme(legend.position = "right", 
        plot.title = element_text(size = 16, hjust = 0.5, face = "bold"), 
        axis.title.x = element_text(size = 14, face = "bold"),  
        axis.title.y = element_text(size = 14, face = "bold"), 
        axis.text = element_text(size = 12), 
        panel.grid.major = element_line(color = "lightgray"),
        panel.grid.minor = element_blank())

Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

line_chart <- line_chart +
  scale_color_manual(values = c("1" = "blue", "2" = "red", "3" = "green", "4" = "purple", "5" = "orange", "6" = "black"),
                     labels = c("1" = "Finance", "2" = "Utilities", "3" = "Properties", "4" = "Conglomerates", "5" = "Industrials", "6" = "Commerce"))
print(line_chart)

industry_scores <- merged_data %>%
  group_by(industry, year) %>%
  summarize(mean_score = mean(score, na.rm = TRUE))

`summarise()` has grouped output by 'industry'. You can override using the
`.groups` argument.

line_chart <- ggplot(industry_scores, aes(x = year, y = mean_score, color = factor(industry))) +
  geom_line(size = 1.5, alpha = 0.8) + 
  labs(title = "Mean ESG Score by Industry", 
       x = "Year", y = "Mean ESG Score", 
       color = "Industry") +
  scale_color_manual(values = industry_colors, labels = industry_labels) + 
  theme_minimal() +
  theme(legend.position = "right", 
        plot.title = element_text(size = 16, hjust = 0.5, face = "bold"),
        axis.title.x = element_text(size = 14, face = "bold"), 
        axis.title.y = element_text(size = 14, face = "bold"),  
        axis.text = element_text(size = 12),  
        panel.grid.major = element_line(color = "lightgray"), 
        panel.grid.minor = element_blank())
print(line_chart)

industry_scores <- merged_data %>%
  group_by(industry, year) %>%
  summarize(mean_score = mean(S_score, na.rm = TRUE))

`summarise()` has grouped output by 'industry'. You can override using the
`.groups` argument.

line_chart <- ggplot(industry_scores, aes(x = year, y = mean_score, color = factor(industry))) +
  geom_line(size = 1.5, alpha = 0.8) +  
  labs(title = "Mean Social Score by Industry", 
       x = "Year", y = "Mean Social Score", 
       color = "Industry") +
  scale_color_manual(values = industry_colors, labels = industry_labels) + 
  theme_minimal() +
  theme(legend.position = "right", 
        plot.title = element_text(size = 16, hjust = 0.5, face = "bold"), 
        axis.title.x = element_text(size = 14, face = "bold"), 
        axis.title.y = element_text(size = 14, face = "bold"),
        axis.text = element_text(size = 12),
        panel.grid.major = element_line(color = "lightgray"), 
        panel.grid.minor = element_blank()) 
print(line_chart)

industry_scores <- merged_data %>%
  group_by(industry, year) %>%
  summarize(mean_score = mean(G_score, na.rm = TRUE))

`summarise()` has grouped output by 'industry'. You can override using the
`.groups` argument.

line_chart <- ggplot(industry_scores, aes(x = year, y = mean_score, color = factor(industry))) +
  geom_line(size = 1.5, alpha = 0.8) +  
  labs(title = "Mean Governance Score by Industry", 
       x = "Year", y = "Mean Governance Score", 
       color = "Industry") +
  scale_color_manual(values = industry_colors, labels = industry_labels) + 
  theme_minimal() +
  theme(legend.position = "right", 
        plot.title = element_text(size = 16, hjust = 0.5, face = "bold"), 
        axis.title.x = element_text(size = 14, face = "bold"), 
        axis.title.y = element_text(size = 14, face = "bold"),
        axis.text = element_text(size = 12),
        panel.grid.major = element_line(color = "lightgray"), 
        panel.grid.minor = element_blank()) 
print(line_chart)