ESG Research Project

# Load packages
library(readxl)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.2     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(lubridate)
library(janitor)

## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

library(fixest)
library(sampleSelection)

## Loading required package: maxLik
## Loading required package: miscTools
## 
## Please cite the 'maxLik' package as:
## Henningsen, Arne and Toomet, Ott (2011). maxLik: A package for maximum likelihood estimation in R. Computational Statistics 26(3), 443-458. DOI 10.1007/s00180-010-0217-1.
## 
## If you have questions, suggestions, or comments regarding the 'maxLik' package, please use a forum or 'tracker' at maxLik's R-Forge site:
## https://r-forge.r-project.org/projects/maxlik/

library(modelsummary)

financial <- read_excel("GISC Financial.xlsx")|>
  filter(`(costat) Active/Inactive Status Marker`== "A", `(fic) Current ISO Country Code - Incorporation` == "USA") |>
  select(subindustry_code = `(gsubind) GIC Sub-Industries`, 
         date = `(datadate) Data Date`, 
         gvkey = `(gvkey) Global Company Key`, 
         name = `(conm) Company Name`, 
         total_asset = `(at) Assets - Total`, 
         current_debt = `(dlc) Debt in Current Liabilities - Total`,
         long_term_debt = `(dltt) Long-Term Debt - Total`, 
         net_income = `(ni) Net Income (Loss)`, 
         capex = `(capx) Capital Expenditures`) |>
  mutate(total_debt = current_debt + long_term_debt, 
         ROA = 100*net_income/(total_asset - total_debt), 
         year = year(date),
         gvkey = as.character(gvkey)) |>
  select(-total_asset, -current_debt, -long_term_debt, -net_income, -total_debt, -date)

ghg_intensity = read_csv("GHG intensity.csv") |>
  mutate(year = year(periodenddate), gvkey = as.character(gvkey)) |>
  select(gvkey, year, ghg_intensity = di_319405)

## Rows: 1926 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): gvkey, ticker, companyname, country
## dbl  (4): companyid, institutionid, fiscalyear, di_319405
## date (1): periodenddate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

esg_score <- read_csv("ESG Global Score.csv") |>
  filter(
    csascoretypename == "Raw",
    aspectname %in% c(
      "S&P Global ESG Score",
      "Environmental Dimension",
      "Social Dimension",
      "Economic Governance Dimension"
    )
  ) |>
  transmute(
    scoredate,
    year = lubridate::year(scoredate),
    gvkey = as.character(gvkey),
    score_type = aspectname,
    scorevalue
  ) |>
  arrange(gvkey, year, score_type, scoredate) |>
  group_by(gvkey, year, score_type) |>
  slice_tail(n = 1) |>
  ungroup() |>
  select(-scoredate) |>
  pivot_wider(
    names_from = score_type,
    values_from = scorevalue
  ) |>
  rename(
    ESG_composite_score = `S&P Global ESG Score`,
    E_score = `Environmental Dimension`,
    S_score = `Social Dimension`,
    G_score = `Economic Governance Dimension`
  )

## Rows: 5624 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): csascoretypename, scoretype, aspectname, ticker, companyname, country
## dbl  (3): institutionid, scorevalue, gvkey
## date (1): scoredate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

gas_production <- read_csv("Oil and gas production.csv") |>
  select(year = datadate, gvkey, natural_gas_production = ogpngq, oil_production = ogpoilq, country = fic) |>
  filter(country == "USA", !is.na(natural_gas_production), !is.na(oil_production)) |>
  mutate(year = year(year))|>
  group_by(gvkey, year) |>
  mutate(annual_total_gas_production = sum(natural_gas_production,na.rm = TRUE),
           annual_total_oil_production = sum(oil_production, na.rm = TRUE),
         gvkey = as.character(gvkey)) |>
  slice(1) |>
  mutate(
    gas_boe = annual_total_gas_production / 6000,
    total_boe = annual_total_oil_production + gas_boe,
    gas_percent = 100*gas_boe / total_boe
  ) |>
  select(year, gvkey, gas_percent)

## Rows: 18531 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): datafmt, indfmt, consol, gvkey, conm, tic, fic, conml
## dbl  (3): gsubind, ogpngq, ogpoilq
## date (1): datadate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

brent_oil <- read_csv("DCOILBRENTEU.csv") |>
  mutate(year = year(observation_date)) |>
  rename("oil_price" = DCOILBRENTEU) |>
  filter(!is.na(oil_price)) |>
  group_by (year) |>
  summarize(oil_price = mean(oil_price))

## Rows: 4221 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): DCOILBRENTEU
## date (1): observation_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

joint_data <- financial |>
  left_join(esg_score, by = join_by(gvkey, year)) |> 
  left_join(ghg_intensity, by = join_by(gvkey, year)) |>
  left_join(gas_production, by = join_by(gvkey, year)) |>
  left_join(brent_oil, by = join_by(year))

n_distinct(esg_score$gvkey)

## [1] 84

n_distinct(financial$gvkey)

## [1] 210

n_distinct(gas_production$gvkey)

## [1] 223

n_distinct(ghg_intensity$gvkey)

## [1] 306

#Regression with Heckman Selection Model using lagged ESG scores

# Create lagged ESG variables and selection indicators
df <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_lag = lag(ESG_composite_score),
    E_lag   = lag(E_score),
    S_lag   = lag(S_score),
    G_lag   = lag(G_score),

    ESG_obs = if_else(!is.na(ESG_lag), 1, 0),
    E_obs   = if_else(!is.na(E_lag),   1, 0),
    S_obs   = if_else(!is.na(S_lag),   1, 0),
    G_obs   = if_else(!is.na(G_lag),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df <- add_imr(df, "ESG_obs", "imr_ESG")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df <- add_imr(df, "E_obs",   "imr_E")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df <- add_imr(df, "S_obs",   "imr_S")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df <- add_imr(df, "G_obs",   "imr_G")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# 5) Function to run 4 Heckman-style regressions for any lagged ESG variable
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods_ESG <- run_heckman_set(df, "ESG_lag", "ESG_obs", "imr_ESG", "Composite ESG")

## NOTE: 1 fixed-effect singleton was removed (1 observation).

## NOTE: 1/6 fixed-effect singletons were removed (7 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods_E   <- run_heckman_set(df, "E_lag",   "E_obs",   "imr_E",   "E score")

## NOTE: 1 fixed-effect singleton was removed (1 observation).

## NOTE: 1/6 fixed-effect singletons were removed (7 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods_S   <- run_heckman_set(df, "S_lag",   "S_obs",   "imr_S",   "S score")

## NOTE: 1 fixed-effect singleton was removed (1 observation).

## NOTE: 1/6 fixed-effect singletons were removed (7 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods_G   <- run_heckman_set(df, "G_lag",   "G_obs",   "imr_G",   "G score")

## NOTE: 1 fixed-effect singleton was removed (1 observation).

## NOTE: 1/6 fixed-effect singletons were removed (7 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

output_table <- modelsummary(
  c(mods_ESG, mods_E, mods_S, mods_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table

	(1)	(2)	(3)	(4)	(5)	(6)	(7)	(8)	(9)	(10)	(11)	(12)	(13)	(14)	(15)	(16)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	26.404	-53.129			26.408	-54.323			37.234	-49.976			14.800	-66.713
	(27.120)	(38.045)			(24.504)	(37.689)			(23.378)	(36.652)			(25.238)	(48.224)
ESG_lag	0.328	-0.009	0.355	-0.849
	(0.388)	(0.343)	(0.704)	(1.672)
ghg_intensity	0.002	0.006	0.004	0.024	0.001	0.005	0.003	0.018	0.002	0.007	0.004	0.022	0.000	0.004	0.004	0.030
	(0.004)	(0.006)	(0.007)	(0.018)	(0.005)	(0.006)	(0.007)	(0.014)	(0.004)	(0.005)	(0.008)	(0.013)	(0.005)	(0.007)	(0.007)	(0.022)
gas_percent	0.177	0.075	1.005	2.555	0.182	0.079	1.238	1.380	0.163	0.049	0.929	2.111	0.193	0.103	1.054	3.983
	(0.200)	(0.228)	(1.767)	(2.538)	(0.199)	(0.233)	(1.705)	(1.442)	(0.191)	(0.191)	(1.668)	(1.909)	(0.196)	(0.249)	(1.994)	(3.619)
imr_ESG	-26.163	-17.004	16.106	-28.918
	(15.904)	(13.363)	(28.475)	(114.589)
oil_price		0.963+	1.011			0.957+	0.999			1.018+	1.016			0.930+	1.019
		(0.486)	(0.603)			(0.487)	(0.562)			(0.512)	(0.598)			(0.470)	(0.603)
E_lag					0.308	0.032	0.575	0.404
					(0.298)	(0.325)	(0.577)	(0.902)
imr_E					-24.523	-16.362	26.176	1.476
					(15.835)	(13.089)	(28.389)	(102.198)
S_lag									0.094	-0.277	0.216	-0.370
									(0.308)	(0.324)	(0.411)	(0.651)
imr_S									-29.497+	-19.051	13.423	-18.165
									(15.921)	(13.098)	(26.146)	(95.413)
G_lag													0.531	0.342	0.286	-1.880
													(0.349)	(0.305)	(0.725)	(2.530)
imr_G													-25.070	-13.855	14.202	-64.479
													(14.778)	(13.389)	(29.431)	(139.339)
Num.Obs.	44	44	43	37	44	44	43	37	44	44	43	37	44	44	43	37
R2	0.103	0.291	0.490	0.650	0.106	0.291	0.500	0.646	0.096	0.299	0.488	0.646	0.122	0.302	0.489	0.669
Std.Errors	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey
FE: gvkey			X	X			X	X			X	X			X	X
FE: year				X				X				X				X

#Regression with Heckman Selection Model using ESG scores

# Create lagged ESG variables and selection indicators
df2 <- joint_data |>
  arrange(gvkey, year) |>
  group_by(gvkey) |>
  mutate(
    ESG_obs = if_else(!is.na(ESG_composite_score), 1, 0),
    E_obs   = if_else(!is.na(E_score),   1, 0),
    S_obs   = if_else(!is.na(S_score),   1, 0),
    G_obs   = if_else(!is.na(G_score),   1, 0)
  ) |>
  ungroup() |>
  mutate(
    log_capex = log(capex + 1),
    subindustry_code = as.factor(subindustry_code),
    year = as.integer(year)
  ) |>
  filter(!is.na(log_capex), !is.na(year), !is.na(subindustry_code))

# 3) Function to create IMR from first-stage probit
add_imr <- function(data, sel_var, imr_name) {
  f <- as.formula(
    paste0(sel_var, " ~ log_capex + factor(year) + subindustry_code")
  )

  sel_mod <- glm(f, data = data, family = binomial(link = "probit"))

  xb  <- predict(sel_mod, type = "link")
  phi <- dnorm(xb)
  Phi <- pnorm(xb)

  data[[imr_name]] <- ifelse(data[[sel_var]] == 1, phi / pmax(Phi, 1e-8), NA_real_)
  data
}

# 4) Add IMRs for each score
df2 <- add_imr(df, "ESG_obs", "imr_ESG")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df2 <- add_imr(df, "E_obs",   "imr_E")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df2 <- add_imr(df, "S_obs",   "imr_S")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

df2 <- add_imr(df, "G_obs",   "imr_G")

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# 5) Function to run 4 Heckman-style regressions for any lagged ESG variable
run_heckman_set <- function(data, x_var, sel_var, imr_var, label) {

  reg_df <- data |>
    filter(
      .data[[sel_var]] == 1,
      !is.na(ROA),
      !is.na(.data[[x_var]]),
      !is.na(ghg_intensity),
      !is.na(gas_percent),
      !is.na(oil_price),
      !is.na(.data[[imr_var]])
    )

  f1 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + ", imr_var)
  )

  f2 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var)
  )
  
  f3 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey")
  )

  f4 <- as.formula(
    paste0("ROA ~ ", x_var, " + ghg_intensity + gas_percent + oil_price + ", imr_var, " | gvkey + year")
  )

  list(
    setNames(list(feols(f1, data = reg_df, vcov = ~ gvkey)), paste0(label, " (1)"))[[1]],
    setNames(list(feols(f2, data = reg_df, vcov = ~ gvkey)), paste0(label, " (2)"))[[1]],
    setNames(list(feols(f3, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm FE (3)"))[[1]],
    setNames(list(feols(f4, data = reg_df, vcov = ~ gvkey)), paste0(label, " Firm+Year FE (4)"))[[1]]
  )
}

# 6) Run model sets
mods2_ESG <- run_heckman_set(df2, "ESG_composite_score", "ESG_obs", "imr_ESG", "Composite ESG")

## NOTE: 3 fixed-effect singletons were removed (3 observations).

## NOTE: 4/5 fixed-effect singletons were removed (9 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods2_E   <- run_heckman_set(df2, "E_score",   "E_obs",   "imr_E",   "E score")

## NOTE: 3 fixed-effect singletons were removed (3 observations).

## NOTE: 4/5 fixed-effect singletons were removed (9 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods2_S   <- run_heckman_set(df2, "S_score",   "S_obs",   "imr_S",   "S score")

## NOTE: 3 fixed-effect singletons were removed (3 observations).

## NOTE: 4/5 fixed-effect singletons were removed (9 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

mods2_G   <- run_heckman_set(df2, "G_score",   "G_obs",   "imr_G",   "G score")

## NOTE: 3 fixed-effect singletons were removed (3 observations).

## NOTE: 4/5 fixed-effect singletons were removed (9 observations).

## The variable 'oil_price' has been removed because of collinearity (see
## $collin.var).

output_table2 <- modelsummary(
  c(mods2_ESG, mods2_E, mods2_S, mods2_G),
  stars = TRUE,
  statistic = "({std.error})",
  gof_omit = "AIC|BIC|Log.Lik|RMSE|Adj|Within|Pseudo"
)
output_table2

	(1)	(2)	(3)	(4)	(5)	(6)	(7)	(8)	(9)	(10)	(11)	(12)	(13)	(14)	(15)	(16)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	19.003	-54.141			25.355	-54.403			32.002	-62.065			1.346	-60.168
	(35.409)	(36.053)			(31.354)	(34.035)			(29.785)	(38.249)			(40.002)	(44.222)
ESG_composite_score	0.459	-0.496	0.199	-1.186
	(0.470)	(0.374)	(0.469)	(1.445)
ghg_intensity	0.003	0.008	0.002	0.031	0.003	0.009	0.005	0.031*	0.004	0.008	0.001	0.031	0.002	0.007	0.000	0.032
	(0.006)	(0.007)	(0.008)	(0.017)	(0.006)	(0.006)	(0.008)	(0.010)	(0.005)	(0.006)	(0.008)	(0.020)	(0.007)	(0.008)	(0.008)	(0.023)
gas_percent	0.119	0.067	-0.974	3.436	0.126	0.057	-1.336	2.891+	0.127	0.056	-0.837	3.339	0.103	0.069	-0.925	3.125
	(0.216)	(0.220)	(2.577)	(2.897)	(0.201)	(0.223)	(2.878)	(1.412)	(0.212)	(0.194)	(2.441)	(3.456)	(0.214)	(0.249)	(2.540)	(3.529)
imr_ESG	-26.552	-25.374	-0.494	121.709
	(23.541)	(22.618)	(41.000)	(142.764)
oil_price		1.257+	1.031			1.320*	1.234			1.292+	0.985			1.133+	0.987
		(0.582)	(0.728)			(0.590)	(0.753)			(0.589)	(0.666)			(0.553)	(0.725)
E_score					0.311	-0.572	-0.769	-3.314***
					(0.352)	(0.395)	(0.881)	(0.610)
imr_E					-26.989	-29.252	-23.508	185.420+
					(23.005)	(22.760)	(49.966)	(80.483)
S_score									0.247	-0.500	0.397	-0.231
									(0.356)	(0.306)	(0.422)	(0.508)
imr_S									-30.490	-23.796	3.139	87.185
									(23.367)	(20.930)	(36.483)	(138.631)
G_score													0.718	-0.120	0.391	0.656
													(0.487)	(0.422)	(0.574)	(1.541)
imr_G													-23.663	-20.865	1.598	60.913
													(24.165)	(23.476)	(41.623)	(125.917)
Num.Obs.	35	35	32	26	35	35	32	26	35	35	32	26	35	35	32	26
R2	0.100	0.329	0.393	0.625	0.093	0.341	0.406	0.723	0.090	0.339	0.397	0.617	0.125	0.315	0.396	0.619
Std.Errors	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey	by: gvkey
FE: gvkey			X	X			X	X			X	X			X	X
FE: year				X				X				X				X

# Regression only on companies with ESG scores available
# model 1: with lag composite ESG
reg_df2 <- df |>
  filter(!is.na(ESG_lag),
         !is.na(ghg_intensity),
         !is.na(gas_percent),
         !is.na(oil_price))

model1 = feols(ROA ~ ESG_lag + ghg_intensity + gas_percent | gvkey + year, data = reg_df2)

## NOTES: 2 observations removed because of NA values (LHS: 2).
##        1/6 fixed-effect singletons were removed (7 observations).

summary(model1)

## OLS estimation, Dep. Var.: ROA
## Observations: 37
## Fixed-effects: gvkey: 11,  year: 5
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## ESG_lag       -0.734308   1.371445 -0.535426 0.598567    
## ghg_intensity  0.020573   0.010856  1.895172 0.073389 .  
## gas_percent    1.988374   2.918956  0.681194 0.503968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 24.3     Adj. R2: 0.335084
##              Within R2: 0.15955

#model 2: with ESG composite score
reg_df3 <- df |>
  filter(!is.na(ESG_composite_score),
         !is.na(ghg_intensity),
         !is.na(gas_percent),
         !is.na(oil_price))

model2 = feols(ROA ~ ESG_composite_score + ghg_intensity + gas_percent | gvkey + year, data = reg_df3)

## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).

summary(model2)

## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                      Estimate Std. Error   t value Pr(>|t|)    
## ESG_composite_score -2.754122   1.556699 -1.769206 0.087752 .  
## ghg_intensity        0.011544   0.010762  1.072722 0.292555    
## gas_percent          2.618743   2.640316  0.991829 0.329775    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 33.6     Adj. R2: 0.313677
##              Within R2: 0.153719

#model 3: with E score
model3 =feols(ROA ~ E_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)

## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).

summary(model3)

## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## E_score       -3.020232   1.286668 -2.347329 0.026492 *  
## ghg_intensity  0.010000   0.010513  0.951231 0.349925    
## gas_percent    3.506526   2.603352  1.346928 0.189201    
## capex         -0.003431   0.005882 -0.583307 0.564528    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 32.2     Adj. R2: 0.347211
##              Within R2: 0.223816

#model 4: with S score
model4 = feols(ROA ~ S_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)

## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).

summary(model4)

## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|)    
## S_score       -2.098446   1.197477 -1.752390 0.091058 .  
## ghg_intensity  0.011776   0.010915  1.078940 0.290159    
## gas_percent    2.865594   2.683127  1.068005 0.294974    
## capex         -0.001426   0.006155 -0.231696 0.818521    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 33.5     Adj. R2: 0.294263
##              Within R2: 0.160859

#model 5: with G score
model5 = feols(ROA ~ G_score + ghg_intensity + gas_percent + capex | gvkey + year, data = reg_df3)

## NOTES: 3 observations removed because of NA values (LHS: 3).
##        1/6 fixed-effect singletons were removed (7 observations).

summary(model5)

## OLS estimation, Dep. Var.: ROA
## Observations: 48
## Fixed-effects: gvkey: 12,  year: 6
## Standard-errors: IID 
##                Estimate Std. Error   t value Pr(>|t|) 
## G_score       -0.957424   1.758802 -0.544361  0.59066 
## ghg_intensity  0.011721   0.011467  1.022134  0.31579 
## gas_percent    2.539226   2.821255  0.900034  0.37606 
## capex         -0.002545   0.006422 -0.396227  0.69505 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 35.2     Adj. R2: 0.222528
##              Within R2: 0.075564

#trend aligns with that of the bigger sample using Heckman selection model where there is a significant negative relationship between ROA and E score only