# Load packages
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(fixest)
library(sampleSelection)
## Loading required package: maxLik
## Loading required package: miscTools
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
library(modelsummary)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
financial <- read_excel("GISC Financial.xlsx")|>
  filter(`(costat) Active/Inactive Status Marker`== "A", `(fic) Current ISO Country Code - Incorporation` == "USA") |>
  select(subindustry_code = `(gsubind) GIC Sub-Industries`, 
         date = `(datadate) Data Date`, 
         gvkey = `(gvkey) Global Company Key`, 
         name = `(conm) Company Name`, 
         total_asset = `(at) Assets - Total`, 
         current_debt = `(dlc) Debt in Current Liabilities - Total`,
         long_term_debt = `(dltt) Long-Term Debt - Total`, 
         net_income = `(ni) Net Income (Loss)`, 
         capex = `(capx) Capital Expenditures`) |>
  mutate(total_debt = current_debt + long_term_debt, 
         ROA = 100*net_income/(total_asset - total_debt), 
         year = year(date),
         gvkey = as.character(gvkey)) |>
  select(-current_debt, -long_term_debt, -net_income, -total_debt, -date)
ghg_intensity = read_csv("GHG intensity.csv") |>
  mutate(year = year(periodenddate), gvkey = as.character(gvkey)) |>
  select(gvkey, year, ghg_intensity = di_319405)
## Rows: 1926 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): gvkey, ticker, companyname, country
## dbl  (4): companyid, institutionid, fiscalyear, di_319405
## date (1): periodenddate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
esg_score <- read_csv("ESG Global Score.csv") |>
  filter(
    csascoretypename == "Raw",
    aspectname %in% c(
      "S&P Global ESG Score",
      "Environmental Dimension",
      "Social Dimension",
      "Economic Governance Dimension"
    )
  ) |>
  transmute(
    scoredate,
    year = lubridate::year(scoredate),
    gvkey = as.character(gvkey),
    score_type = aspectname,
    scorevalue
  ) |>
  arrange(gvkey, year, score_type, scoredate) |>
  group_by(gvkey, year, score_type) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(-scoredate) |>
  pivot_wider(
    names_from = score_type,
    values_from = scorevalue
  ) |>
  rename(
    ESG_composite_score = `S&P Global ESG Score`,
    E_score = `Environmental Dimension`,
    S_score = `Social Dimension`,
    G_score = `Economic Governance Dimension`
  )
## Rows: 5624 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): csascoretypename, scoretype, aspectname, ticker, companyname, country
## dbl  (3): institutionid, scorevalue, gvkey
## date (1): scoredate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gas_production <- read_csv("Oil and gas production.csv") |>
  select(year = datadate, gvkey, natural_gas_production = ogpngq, oil_production = ogpoilq, country = fic) |>
  filter(country == "USA", !is.na(natural_gas_production), !is.na(oil_production)) |>
  mutate(year = year(year))|>
  group_by(gvkey, year) |>
  mutate(annual_total_gas_production = sum(natural_gas_production,na.rm = TRUE),
           annual_total_oil_production = sum(oil_production, na.rm = TRUE),
         gvkey = as.character(gvkey)) |>
  slice(1) |>
  mutate(
    gas_boe = annual_total_gas_production / 6000,
    total_boe = annual_total_oil_production + gas_boe,
    gas_percent = 100*gas_boe / total_boe
  ) |>
  select(year, gvkey, gas_percent) 
## Rows: 18531 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): datafmt, indfmt, consol, gvkey, conm, tic, fic, conml
## dbl  (3): gsubind, ogpngq, ogpoilq
## date (1): datadate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
brent_oil <- read_csv("DCOILBRENTEU.csv") |>
  mutate(year = year(observation_date)) |>
  rename("oil_price" = DCOILBRENTEU) |>
  filter(!is.na(oil_price)) |>
  group_by (year) |>
  summarize(oil_price = mean(oil_price)) 
## Rows: 4221 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): DCOILBRENTEU
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joint_data <- financial |>
  left_join(esg_score, by = join_by(gvkey, year)) |> 
  left_join(ghg_intensity, by = join_by(gvkey, year)) |>
  left_join(gas_production, by = join_by(gvkey, year)) |>
  left_join(brent_oil, by = join_by(year)) 
#Regression with Heckman Selection Model using lagged ESG scores

# Create lagged ESG variables and selection indicators
df <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_lag = lag(ESG_composite_score),
    E_lag   = lag(E_score),
    S_lag   = lag(S_score),
    G_lag   = lag(G_score),

    ESG_obs = if_else(!is.na(ESG_lag), 1, 0),
    E_obs   = if_else(!is.na(E_lag),   1, 0),
    S_obs   = if_else(!is.na(S_lag),   1, 0),
    G_obs   = if_else(!is.na(G_lag),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df <- add_imr(df, "ESG_obs", "imr_ESG")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "E_obs",   "imr_E")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "S_obs",   "imr_S")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "G_obs",   "imr_G")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5) Function to run 4 Heckman-style regressions for any lagged ESG variable
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods_ESG <- run_heckman_set(df, "ESG_lag", "ESG_obs", "imr_ESG", "Composite ESG")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_E   <- run_heckman_set(df, "E_lag",   "E_obs",   "imr_E",   "E score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_S   <- run_heckman_set(df, "S_lag",   "S_obs",   "imr_S",   "S score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_G   <- run_heckman_set(df, "G_lag",   "G_obs",   "imr_G",   "G score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
output_table <- modelsummary(
  c(mods_ESG, mods_E, mods_S, mods_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 26.404 -53.129 26.408 -54.323 37.234 -49.976 14.800 -66.713
(27.120) (38.045) (24.504) (37.689) (23.378) (36.652) (25.238) (48.224)
ESG_lag 0.328 -0.009 0.355 -0.849
(0.388) (0.343) (0.704) (1.672)
ghg_intensity 0.002 0.006 0.004 0.024 0.001 0.005 0.003 0.018 0.002 0.007 0.004 0.022 0.000 0.004 0.004 0.030
(0.004) (0.006) (0.007) (0.018) (0.005) (0.006) (0.007) (0.014) (0.004) (0.005) (0.008) (0.013) (0.005) (0.007) (0.007) (0.022)
gas_percent 0.177 0.075 1.005 2.555 0.182 0.079 1.238 1.380 0.163 0.049 0.929 2.111 0.193 0.103 1.054 3.983
(0.200) (0.228) (1.767) (2.538) (0.199) (0.233) (1.705) (1.442) (0.191) (0.191) (1.668) (1.909) (0.196) (0.249) (1.994) (3.619)
imr_ESG -26.163 -17.004 16.106 -28.918
(15.904) (13.363) (28.475) (114.589)
oil_price 0.963+ 1.011 0.957+ 0.999 1.018+ 1.016 0.930+ 1.019
(0.486) (0.603) (0.487) (0.562) (0.512) (0.598) (0.470) (0.603)
E_lag 0.308 0.032 0.575 0.404
(0.298) (0.325) (0.577) (0.902)
imr_E -24.523 -16.362 26.176 1.476
(15.835) (13.089) (28.389) (102.198)
S_lag 0.094 -0.277 0.216 -0.370
(0.308) (0.324) (0.411) (0.651)
imr_S -29.497+ -19.051 13.423 -18.165
(15.921) (13.098) (26.146) (95.413)
G_lag 0.531 0.342 0.286 -1.880
(0.349) (0.305) (0.725) (2.530)
imr_G -25.070 -13.855 14.202 -64.479
(14.778) (13.389) (29.431) (139.339)
Num.Obs. 44 44 43 37 44 44 43 37 44 44 43 37 44 44 43 37
R2 0.103 0.291 0.490 0.650 0.106 0.291 0.500 0.646 0.096 0.299 0.488 0.646 0.122 0.302 0.489 0.669
Std.Errors by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey
FE: gvkey X X X X X X X X
FE: year X X X X
#Regression with Heckman Selection Model using ESG scores

# Create selection indicators
df2 <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_obs = if_else(!is.na(ESG_composite_score), 1, 0),
    E_obs   = if_else(!is.na(E_score),   1, 0),
    S_obs   = if_else(!is.na(S_score),   1, 0),
    G_obs   = if_else(!is.na(G_score),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df2 <- add_imr(df2, "ESG_obs", "imr_ESG")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df2, "E_obs",   "imr_E")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df2, "S_obs",   "imr_S")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df2, "G_obs",   "imr_G")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5) Function to run 4 Heckman-style regressions for ESG variables
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods2_ESG <- run_heckman_set(df2, "ESG_composite_score", "ESG_obs", "imr_ESG", "Composite ESG")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_E   <- run_heckman_set(df2, "E_score",   "E_obs",   "imr_E",   "E score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_S   <- run_heckman_set(df2, "S_score",   "S_obs",   "imr_S",   "S score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_G   <- run_heckman_set(df2, "G_score",   "G_obs",   "imr_G",   "G score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
output_table2 <- modelsummary(
  c(mods2_ESG, mods2_E, mods2_S, mods2_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table2
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 20.154 -50.242 30.804 -43.438 33.484 -45.826 2.420 -62.296
(39.083) (37.854) (35.889) (32.150) (36.377) (33.549) (36.412) (47.363)
ESG_composite_score 0.551 0.120 -0.261 -2.432+
(0.591) (0.608) (0.331) (1.128)
ghg_intensity 0.005 0.005 0.002 0.009 0.006 0.006 0.002 0.009 0.006 0.006 0.002 0.010 0.003 0.003 0.000 0.008
(0.006) (0.007) (0.005) (0.009) (0.006) (0.007) (0.005) (0.008) (0.006) (0.007) (0.005) (0.008) (0.005) (0.007) (0.005) (0.009)
gas_percent 0.169 0.098 1.425 2.497 0.161 0.086 1.312 3.317 0.168 0.089 1.355 2.703 0.167 0.108 1.793 2.301
(0.273) (0.244) (1.917) (1.826) (0.246) (0.228) (1.985) (2.403) (0.266) (0.226) (1.747) (1.937) (0.284) (0.268) (2.166) (1.819)
imr_ESG -31.266 -20.678 25.879 60.834
(27.117) (25.277) (25.134) (48.518)
oil_price 0.885* 1.231* 0.933* 1.264* 0.920* 1.272* 0.827* 1.126*
(0.361) (0.485) (0.373) (0.525) (0.366) (0.514) (0.344) (0.438)
E_score 0.318 -0.108 -0.416 -2.809+
(0.429) (0.430) (0.540) (1.362)
imr_E -33.575 -24.662 19.595 29.942
(27.420) (25.171) (27.713) (51.762)
S_score 0.340 -0.049 -0.370 -1.889
(0.434) (0.428) (0.378) (1.059)
imr_S -35.687 -23.040 24.745 55.450
(28.403) (26.015) (21.612) (49.208)
G_score 0.818 0.434 0.298 -0.891
(0.608) (0.658) (0.537) (1.253)
imr_G -28.507 -17.188 34.172 98.196
(25.426) (25.391) (29.736) (73.538)
Num.Obs. 55 55 54 48 55 55 54 48 55 55 54 48 55 55 54 48
R2 0.094 0.185 0.504 0.598 0.086 0.186 0.507 0.622 0.088 0.185 0.506 0.599 0.113 0.194 0.505 0.569
Std.Errors by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey
FE: gvkey X X X X X X X X
FE: year X X X X
# Run regression without Heckman selection
reg_df3 <- df2 |>
  filter(!is.na(ESG_composite_score),
         !is.na(ghg_intensity),
         !is.na(gas_percent),
         !is.na(oil_price))

run_esg_set_fe <- function(data, x_var, label) {

  reg_df3 <- data |>
    filter(
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent)
    )

  mods3 <- list(
    feols(
      as.formula(paste0("ROA ~ ", x_var, " | gvkey + year")),
      data = reg_df3,
      vcov = ~gvkey
    ),

    feols(
      as.formula(paste0("ROA ~ ", x_var, " + ghg_intensity | gvkey + year")),
      data = reg_df3,
      vcov = ~gvkey
    ),

    feols(
      as.formula(paste0("ROA ~ ", x_var, " + gas_percent | gvkey + year")),
      data = reg_df3,
      vcov = ~gvkey
    ),

    feols(
      as.formula(paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent | gvkey + year")),
      data = reg_df3,
      vcov = ~gvkey
    )
  )

  names(mods3) <- c(
    paste0(" (1)"),
    paste0(" (2)"),
    paste0(" (3)"),
    paste0( " (4)")
  )

  mods3
}

# Run model sets
mods3_ESG <- run_esg_set_fe(reg_df3, "ESG_composite_score", "Composite ESG")
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
mods3_E   <- run_esg_set_fe(reg_df3, "E_score", "E Score")
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
mods3_S   <- run_esg_set_fe(reg_df3, "S_score", "S Score")
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
mods3_G   <- run_esg_set_fe(reg_df3, "G_score", "G Score")
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
output_table3 <- modelsummary(
  c(mods3_ESG, mods3_E, mods3_S, mods3_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table3
(1) (2) (3) (4) (1) (2) (3) (4) (1) (2) (3) (4) (1) (2) (3) (4)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
ESG_composite_score -2.739 -2.747 -2.745+ -2.754+
(1.667) (1.592) (1.517) (1.428)
ghg_intensity 0.011 0.012 0.009 0.010 0.011 0.012 0.011 0.012+
(0.007) (0.007) (0.006) (0.006) (0.007) (0.007) (0.006) (0.006)
gas_percent 2.413 2.619 3.266 3.425 2.627 2.840 2.282 2.481
(2.221) (2.138) (2.604) (2.519) (2.264) (2.187) (2.424) (2.341)
E_score -2.826+ -2.751+ -3.054* -2.983+
(1.333) (1.367) (1.360) (1.407)
S_score -2.052 -2.069 -2.110 -2.133
(1.413) (1.363) (1.341) (1.268)
G_score -1.022 -1.110 -0.914 -0.999
(1.697) (1.618) (1.559) (1.477)
Num.Obs. 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48 48
R2 0.562 0.577 0.574 0.591 0.586 0.596 0.608 0.620 0.562 0.577 0.576 0.594 0.523 0.538 0.533 0.551
Std.Errors by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey
FE: gvkey X X X X X X X X X X X X X X X X
FE: year X X X X X X X X X X X X X X X X
#trend aligns with that of the bigger sample using Heckman selection model where there is a significant negative relationship between ROA and E score only
#Check missing data - number of companies
n_distinct(esg_score$gvkey)
## [1] 84
n_distinct(financial$gvkey)
## [1] 210
n_distinct(gas_production$gvkey)
## [1] 223
n_distinct(ghg_intensity$gvkey)
## [1] 306
n_distinct(joint_data$gvkey)
## [1] 210
n_distinct(df2$gvkey)
## [1] 199
n_distinct(reg_df3$gvkey)
## [1] 14
#Check whether Heckman selection is significant - ie. whether selection bias matters
summary(mods2_E[[4]])
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: Clustered (gvkey) 
##                Estimate Std. Error   t value Pr(>|t|)    
## E_score       -2.808875   1.362296 -2.061869  0.06366 .  
## ghg_intensity  0.009045   0.007940  1.139230  0.27881    
## gas_percent    3.317273   2.402967  1.380490  0.19484    
## imr_E         29.942302  51.762426  0.578456  0.57461    
## ... 1 variable was removed because of collinearity (oil_price)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 32.4     Adj. R2: 0.341566
##              Within R2: 0.217103
summary(mods2_ESG[[4]])
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: Clustered (gvkey) 
##                      Estimate Std. Error  t value Pr(>|t|)    
## ESG_composite_score -2.431556   1.128482 -2.15471 0.054201 .  
## ghg_intensity        0.009351   0.008562  1.09217 0.298100    
## gas_percent          2.496526   1.825810  1.36735 0.198804    
## imr_ESG             60.834131  48.517689  1.25385 0.235887    
## ... 1 variable was removed because of collinearity (oil_price)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 33.4     Adj. R2: 0.299658
##              Within R2: 0.167274
#not significant  
# Multicollinearity checks
cor(df2[, c("ghg_intensity", "gas_percent", "oil_price")], use = "complete.obs")
##               ghg_intensity gas_percent   oil_price
## ghg_intensity     1.0000000 -0.15165434 -0.14686080
## gas_percent      -0.1516543  1.00000000  0.06177314
## oil_price        -0.1468608  0.06177314  1.00000000
# Check VIF of models without Heckman selection
vif(mods3_ESG[[4]])
## Warning in vif.default(mods3_ESG[[4]]): No intercept: vifs may not be sensible.
##                        GVIF Df GVIF^(1/(2*Df))
## ESG_composite_score 1.90096  0             Inf
## ghg_intensity       1.90096  0             Inf
## gas_percent         1.90096  0             Inf
vif(mods3_E[[4]])
## Warning in vif.default(mods3_E[[4]]): No intercept: vifs may not be sensible.
##                   GVIF Df GVIF^(1/(2*Df))
## E_score       3.720113  0             Inf
## ghg_intensity 3.720113  0             Inf
## gas_percent   3.720113  0             Inf
vif(mods3_S[[4]])
## Warning in vif.default(mods3_S[[4]]): No intercept: vifs may not be sensible.
##                   GVIF Df GVIF^(1/(2*Df))
## S_score       2.421516  0             Inf
## ghg_intensity 2.421516  0             Inf
## gas_percent   2.421516  0             Inf
vif(mods3_G[[4]])
## Warning in vif.default(mods3_G[[4]]): No intercept: vifs may not be sensible.
##                   GVIF Df GVIF^(1/(2*Df))
## G_score       2.050702  0             Inf
## ghg_intensity 2.050702  0             Inf
## gas_percent   2.050702  0             Inf
#GVIG values are all within 1-5 range -> acceptable range -> no indication of multicollinearity