# Load packages
library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(lubridate)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(fixest)
library(sampleSelection)
## Loading required package: maxLik
## Loading required package: miscTools
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/
library(modelsummary)
financial <- read_excel("GISC Financial.xlsx")|>
  filter(`(costat) Active/Inactive Status Marker`== "A", `(fic) Current ISO Country Code - Incorporation` == "USA") |>
  select(subindustry_code = `(gsubind) GIC Sub-Industries`, 
         date = `(datadate) Data Date`, 
         gvkey = `(gvkey) Global Company Key`, 
         name = `(conm) Company Name`, 
         total_asset = `(at) Assets - Total`, 
         current_debt = `(dlc) Debt in Current Liabilities - Total`,
         long_term_debt = `(dltt) Long-Term Debt - Total`, 
         net_income = `(ni) Net Income (Loss)`, 
         capex = `(capx) Capital Expenditures`) |>
  mutate(total_debt = current_debt + long_term_debt, 
         ROA = 100*net_income/(total_asset - total_debt), 
         year = year(date),
         gvkey = as.character(gvkey)) |>
  select(-total_asset, -current_debt, -long_term_debt, -net_income, -total_debt, -date)
ghg_intensity = read_csv("GHG intensity.csv") |>
  mutate(year = year(periodenddate), gvkey = as.character(gvkey)) |>
  select(gvkey, year, ghg_intensity = di_319405)
## Rows: 1926 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): gvkey, ticker, companyname, country
## dbl  (4): companyid, institutionid, fiscalyear, di_319405
## date (1): periodenddate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
esg_score <- read_csv("ESG Global Score.csv") |>
  filter(
    csascoretypename == "Raw",
    aspectname %in% c(
      "S&P Global ESG Score",
      "Environmental Dimension",
      "Social Dimension",
      "Economic Governance Dimension"
    )
  ) |>
  transmute(
    scoredate,
    year = lubridate::year(scoredate),
    gvkey = as.character(gvkey),
    score_type = aspectname,
    scorevalue
  ) |>
  arrange(gvkey, year, score_type, scoredate) |>
  group_by(gvkey, year, score_type) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(-scoredate) |>
  pivot_wider(
    names_from = score_type,
    values_from = scorevalue
  ) |>
  rename(
    ESG_composite_score = `S&P Global ESG Score`,
    E_score = `Environmental Dimension`,
    S_score = `Social Dimension`,
    G_score = `Economic Governance Dimension`
  )
## Rows: 5624 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): csascoretypename, scoretype, aspectname, ticker, companyname, country
## dbl  (3): institutionid, scorevalue, gvkey
## date (1): scoredate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gas_production <- read_csv("Oil and gas production.csv") |>
  select(year = datadate, gvkey, natural_gas_production = ogpngq, oil_production = ogpoilq, country = fic) |>
  filter(country == "USA", !is.na(natural_gas_production), !is.na(oil_production)) |>
  mutate(year = year(year))|>
  group_by(gvkey, year) |>
  mutate(annual_total_gas_production = sum(natural_gas_production,na.rm = TRUE),
           annual_total_oil_production = sum(oil_production, na.rm = TRUE),
         gvkey = as.character(gvkey)) |>
  slice(1) |>
  mutate(
    gas_boe = annual_total_gas_production / 6000,
    total_boe = annual_total_oil_production + gas_boe,
    gas_percent = 100*gas_boe / total_boe
  ) |>
  select(year, gvkey, gas_percent) 
## Rows: 18531 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): datafmt, indfmt, consol, gvkey, conm, tic, fic, conml
## dbl  (3): gsubind, ogpngq, ogpoilq
## date (1): datadate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
brent_oil <- read_csv("DCOILBRENTEU.csv") |>
  mutate(year = year(observation_date)) |>
  rename("oil_price" = DCOILBRENTEU) |>
  filter(!is.na(oil_price)) |>
  group_by (year) |>
  summarize(oil_price = mean(oil_price)) 
## Rows: 4221 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): DCOILBRENTEU
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
joint_data <- financial |>
  left_join(esg_score, by = join_by(gvkey, year)) |> 
  left_join(ghg_intensity, by = join_by(gvkey, year)) |>
  left_join(gas_production, by = join_by(gvkey, year)) |>
  left_join(brent_oil, by = join_by(year)) 
n_distinct(esg_score$gvkey)
## [1] 84
n_distinct(financial$gvkey)
## [1] 210
n_distinct(gas_production$gvkey)
## [1] 223
n_distinct(ghg_intensity$gvkey)
## [1] 306
#Regression with Heckman Selection Model using lagged ESG scores

# Create lagged ESG variables and selection indicators
df <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_lag = lag(ESG_composite_score),
    E_lag   = lag(E_score),
    S_lag   = lag(S_score),
    G_lag   = lag(G_score),

    ESG_obs = if_else(!is.na(ESG_lag), 1, 0),
    E_obs   = if_else(!is.na(E_lag),   1, 0),
    S_obs   = if_else(!is.na(S_lag),   1, 0),
    G_obs   = if_else(!is.na(G_lag),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df <- add_imr(df, "ESG_obs", "imr_ESG")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "E_obs",   "imr_E")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "S_obs",   "imr_S")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df <- add_imr(df, "G_obs",   "imr_G")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5) Function to run 4 Heckman-style regressions for any lagged ESG variable
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods_ESG <- run_heckman_set(df, "ESG_lag", "ESG_obs", "imr_ESG", "Composite ESG")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_E   <- run_heckman_set(df, "E_lag",   "E_obs",   "imr_E",   "E score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_S   <- run_heckman_set(df, "S_lag",   "S_obs",   "imr_S",   "S score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods_G   <- run_heckman_set(df, "G_lag",   "G_obs",   "imr_G",   "G score")
## NOTE: 1 fixed-effect singleton was removed (1 observation).
## NOTE: 1/6 fixed-effect singletons were removed (7 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
output_table <- modelsummary(
  c(mods_ESG, mods_E, mods_S, mods_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 26.404 -53.129 26.408 -54.323 37.234 -49.976 14.800 -66.713
(27.120) (38.045) (24.504) (37.689) (23.378) (36.652) (25.238) (48.224)
ESG_lag 0.328 -0.009 0.355 -0.849
(0.388) (0.343) (0.704) (1.672)
ghg_intensity 0.002 0.006 0.004 0.024 0.001 0.005 0.003 0.018 0.002 0.007 0.004 0.022 0.000 0.004 0.004 0.030
(0.004) (0.006) (0.007) (0.018) (0.005) (0.006) (0.007) (0.014) (0.004) (0.005) (0.008) (0.013) (0.005) (0.007) (0.007) (0.022)
gas_percent 0.177 0.075 1.005 2.555 0.182 0.079 1.238 1.380 0.163 0.049 0.929 2.111 0.193 0.103 1.054 3.983
(0.200) (0.228) (1.767) (2.538) (0.199) (0.233) (1.705) (1.442) (0.191) (0.191) (1.668) (1.909) (0.196) (0.249) (1.994) (3.619)
imr_ESG -26.163 -17.004 16.106 -28.918
(15.904) (13.363) (28.475) (114.589)
oil_price 0.963+ 1.011 0.957+ 0.999 1.018+ 1.016 0.930+ 1.019
(0.486) (0.603) (0.487) (0.562) (0.512) (0.598) (0.470) (0.603)
E_lag 0.308 0.032 0.575 0.404
(0.298) (0.325) (0.577) (0.902)
imr_E -24.523 -16.362 26.176 1.476
(15.835) (13.089) (28.389) (102.198)
S_lag 0.094 -0.277 0.216 -0.370
(0.308) (0.324) (0.411) (0.651)
imr_S -29.497+ -19.051 13.423 -18.165
(15.921) (13.098) (26.146) (95.413)
G_lag 0.531 0.342 0.286 -1.880
(0.349) (0.305) (0.725) (2.530)
imr_G -25.070 -13.855 14.202 -64.479
(14.778) (13.389) (29.431) (139.339)
Num.Obs. 44 44 43 37 44 44 43 37 44 44 43 37 44 44 43 37
R2 0.103 0.291 0.490 0.650 0.106 0.291 0.500 0.646 0.096 0.299 0.488 0.646 0.122 0.302 0.489 0.669
Std.Errors by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey
FE: gvkey X X X X X X X X
FE: year X X X X
#Regression with Heckman Selection Model using ESG scores

# Create lagged ESG variables and selection indicators
df2 <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_obs = if_else(!is.na(ESG_composite_score), 1, 0),
    E_obs   = if_else(!is.na(E_score),   1, 0),
    S_obs   = if_else(!is.na(S_score),   1, 0),
    G_obs   = if_else(!is.na(G_score),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df2 <- add_imr(df, "ESG_obs", "imr_ESG")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df, "E_obs",   "imr_E")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df, "S_obs",   "imr_S")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
df2 <- add_imr(df, "G_obs",   "imr_G")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# 5) Function to run 4 Heckman-style regressions for any lagged ESG variable
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods2_ESG <- run_heckman_set(df2, "ESG_composite_score", "ESG_obs", "imr_ESG", "Composite ESG")
## NOTE: 3 fixed-effect singletons were removed (3 observations).
## NOTE: 4/5 fixed-effect singletons were removed (9 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_E   <- run_heckman_set(df2, "E_score",   "E_obs",   "imr_E",   "E score")
## NOTE: 3 fixed-effect singletons were removed (3 observations).
## NOTE: 4/5 fixed-effect singletons were removed (9 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_S   <- run_heckman_set(df2, "S_score",   "S_obs",   "imr_S",   "S score")
## NOTE: 3 fixed-effect singletons were removed (3 observations).
## NOTE: 4/5 fixed-effect singletons were removed (9 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
mods2_G   <- run_heckman_set(df2, "G_score",   "G_obs",   "imr_G",   "G score")
## NOTE: 3 fixed-effect singletons were removed (3 observations).
## NOTE: 4/5 fixed-effect singletons were removed (9 observations).
## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).
output_table2 <- modelsummary(
  c(mods2_ESG, mods2_E, mods2_S, mods2_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table2
(1) (2) (3) (4) (5) (6) (7) (8) (9) (10) (11) (12) (13) (14) (15) (16)
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
(Intercept) 19.003 -54.141 25.355 -54.403 32.002 -62.065 1.346 -60.168
(35.409) (36.053) (31.354) (34.035) (29.785) (38.249) (40.002) (44.222)
ESG_composite_score 0.459 -0.496 0.199 -1.186
(0.470) (0.374) (0.469) (1.445)
ghg_intensity 0.003 0.008 0.002 0.031 0.003 0.009 0.005 0.031* 0.004 0.008 0.001 0.031 0.002 0.007 0.000 0.032
(0.006) (0.007) (0.008) (0.017) (0.006) (0.006) (0.008) (0.010) (0.005) (0.006) (0.008) (0.020) (0.007) (0.008) (0.008) (0.023)
gas_percent 0.119 0.067 -0.974 3.436 0.126 0.057 -1.336 2.891+ 0.127 0.056 -0.837 3.339 0.103 0.069 -0.925 3.125
(0.216) (0.220) (2.577) (2.897) (0.201) (0.223) (2.878) (1.412) (0.212) (0.194) (2.441) (3.456) (0.214) (0.249) (2.540) (3.529)
imr_ESG -26.552 -25.374 -0.494 121.709
(23.541) (22.618) (41.000) (142.764)
oil_price 1.257+ 1.031 1.320* 1.234 1.292+ 0.985 1.133+ 0.987
(0.582) (0.728) (0.590) (0.753) (0.589) (0.666) (0.553) (0.725)
E_score 0.311 -0.572 -0.769 -3.314***
(0.352) (0.395) (0.881) (0.610)
imr_E -26.989 -29.252 -23.508 185.420+
(23.005) (22.760) (49.966) (80.483)
S_score 0.247 -0.500 0.397 -0.231
(0.356) (0.306) (0.422) (0.508)
imr_S -30.490 -23.796 3.139 87.185
(23.367) (20.930) (36.483) (138.631)
G_score 0.718 -0.120 0.391 0.656
(0.487) (0.422) (0.574) (1.541)
imr_G -23.663 -20.865 1.598 60.913
(24.165) (23.476) (41.623) (125.917)
Num.Obs. 35 35 32 26 35 35 32 26 35 35 32 26 35 35 32 26
R2 0.100 0.329 0.393 0.625 0.093 0.341 0.406 0.723 0.090 0.339 0.397 0.617 0.125 0.315 0.396 0.619
Std.Errors by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey by: gvkey
FE: gvkey X X X X X X X X
FE: year X X X X
# Regression only on companies with ESG scores available
# model 1: with lag composite ESG
reg_df2 <- df |>
  filter(!is.na(ESG_lag),
         !is.na(ghg_intensity),
         !is.na(gas_percent),
         !is.na(oil_price))

model1 = feols(ROA ~ ESG_lag + ghg_intensity + gas_percent | gvkey + year, data = reg_df2)
## NOTES: 2 observations removed because of NA values (LHS: 2).
##        1/6 fixed-effect singletons were removed (7 observations).
summary(model1)
## OLS estimation, Dep. Var.: ROA
## Observations: 37
## Fixed-effects: gvkey: 11,  year: 5
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## ESG_lag       -0.734308   1.371445 -0.535426 0.598567    
## ghg_intensity  0.020573   0.010856  1.895172 0.073389 .  
## gas_percent    1.988374   2.918956  0.681194 0.503968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 24.3     Adj. R2: 0.335084
##              Within R2: 0.15955
#model 2: with ESG composite score
reg_df3 <- df |>
  filter(!is.na(ESG_composite_score),
         !is.na(ghg_intensity),
         !is.na(gas_percent),
         !is.na(oil_price))

model2 = feols(ROA ~ ESG_composite_score + ghg_intensity + gas_percent | gvkey + year, data = reg_df3)
## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).
summary(model2)
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                      Estimate Std. Error   t value Pr(>|t|)    
## ESG_composite_score -2.754122   1.556699 -1.769206 0.087752 .  
## ghg_intensity        0.011544   0.010762  1.072722 0.292555    
## gas_percent          2.618743   2.640316  0.991829 0.329775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 33.6     Adj. R2: 0.313677
##              Within R2: 0.153719
#model 3: with E score
model3 =feols(ROA ~ E_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)
## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).
summary(model3)
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## E_score       -3.020232   1.286668 -2.347329 0.026492 *  
## ghg_intensity  0.010000   0.010513  0.951231 0.349925    
## gas_percent    3.506526   2.603352  1.346928 0.189201    
## capex         -0.003431   0.005882 -0.583307 0.564528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 32.2     Adj. R2: 0.347211
##              Within R2: 0.223816
#model 4: with S score
model4 = feols(ROA ~ S_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)
## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).
summary(model4)
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## S_score       -2.098446   1.197477 -1.752390 0.091058 .  
## ghg_intensity  0.011776   0.010915  1.078940 0.290159    
## gas_percent    2.865594   2.683127  1.068005 0.294974    
## capex         -0.001426   0.006155 -0.231696 0.818521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 33.5     Adj. R2: 0.294263
##              Within R2: 0.160859
#model 5: with G score
model5 = feols(ROA ~ G_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)
## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).
summary(model5)
## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|) 
## G_score       -0.957424   1.758802 -0.544361  0.59066 
## ghg_intensity  0.011721   0.011467  1.022134  0.31579 
## gas_percent    2.539226   2.821255  0.900034  0.37606 
## capex         -0.002545   0.006422 -0.396227  0.69505 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 35.2     Adj. R2: 0.222528
##              Within R2: 0.075564
#trend aligns with that of the bigger sample using Heckman selection model where there is a significant negative relationship between ROA and E score only