# Start from an empty workspace so objects from earlier runs cannot leak in.
# NOTE(review): this also wipes the caller's global environment when the
# script is sourced interactively; running in a fresh R session is safer.
rm(list = ls())
options(tinytex.engine = 'pdflatex')

# TODO: remove packages we are not using.
# Packages needed by this script; any that are missing are installed below.
PackageNames <- c(
  "glmnet",
  "stargazer",
  "car",
  "lmtest",
  "plm",
  "ggplot2",
  "dplyr",
  "bslib",
  "sandwich",
  "corrplot",
  "reshape2",
  "vcd",
  "DescTools"
  # "robustHD"
)

for (pkg in PackageNames) {
  # require() returns FALSE (rather than erroring) when a package is not
  # available, which is what lets us fall through to the installation step.
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
    # library() stops with an error if the installation did not succeed, so a
    # missing dependency is reported here instead of surfacing later as an
    # obscure "could not find function" failure.
    library(pkg, character.only = TRUE)
  }
}
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
## Loading required package: stargazer
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## Loading required package: lmtest
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: plm
## Warning: package 'plm' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plm':
##
## between, lag, lead
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: bslib
##
## Attaching package: 'bslib'
## The following object is masked from 'package:utils':
##
## page
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
## Loading required package: corrplot
## corrplot 0.92 loaded
## Loading required package: reshape2
## Loading required package: vcd
## Warning: package 'vcd' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: DescTools
## Warning: package 'DescTools' was built under R version 4.3.3
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
# Load the processed data set. The path is machine-specific; the commented
# line below is the location used on another machine.
# df <- read.csv('F:/Usuário/Documents/GitHub/IPO/data/processed_2014.csv')
df <- read.csv(file = 'C:/workspace/IPO/data/processed_2014.csv')
Model: TobinsQ = firm size + leverage + industry dummies. Open question: are the regressors highly correlated?
NOTE: This model is reported with HC3 heteroskedasticity-robust standard errors.
# cor(df$W_Leverage, df$W_Revenue)

# Standardise the two regressors: (x - mean(x)) / sd(x).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() drops them so the data frame keeps
# plain numeric columns instead of embedded matrix columns.
df$W_Leverage <- as.numeric(scale(df$W_Leverage))
df$W_Revenue <- as.numeric(scale(df$W_Revenue))

# Within estimator with two-way (ID and Year) effects for W_TobinsQ.
# NOTE(review): the recorded results report coefficients only for W_Leverage
# and W_Revenue, so the ICB_* dummies appear to be absorbed by the ID effects
# -- confirm that they are time-invariant within ID.
management <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  effect = "twoways",
  index = c("ID", "Year"),
  model = "within" # options: "within", "random", "ht", "between", "pooling"
)
# summary(management)

# Coefficient table with HC3 robust standard errors. `type` is forwarded by
# coeftest() to vcovHC(); the argument is spelled `vcov.` in full rather than
# relying on partial matching of `vcov`.
coeftest(management, vcov. = vcovHC, type = "HC3")
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## W_Leverage 0.41202 0.13642 3.0203 0.002593 **
## W_Revenue -0.18856 0.22589 -0.8347 0.404091
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
I am running a few basic specifications so we can generate some
insights. Note that the first model is
TobinsQ = firm size + leverage + industry dummies with two-way fixed effects; the
second model uses log revenue. The third and fourth models use the winsorized data
with the same structure.
NOTE: In the next iteration, all models will be reported with HC3 robust
standard errors (a more conservative correction for
heteroskedasticity).
# Baseline specifications for Tobin's Q. rev_1 to rev_3 use the within
# estimator with two-way (ID and Year) effects; rev_4 is a pooled regression.
# Valid plm `model` values: "within", "random", "ht", "between", "pooling".

# (1) Raw Revenue and Leverage plus the ICB industry dummies.
rev_1 <- plm(
  formula = TobinsQ ~
    Revenue +
    Leverage +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (2) Log revenue only (leverage is currently left out of this specification).
rev_2 <- plm(
  formula = TobinsQ ~
    log_Revenue +
    # leverage +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (3) W_* versions of the dependent variable and regressors, two-way within.
rev_3 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (4) Same formula as (3), estimated as a pooled regression.
rev_4 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "pooling"
)
# stargazer(c_fe, c_tfe, c_lag_fe, c_lag_tfe, type = "text", summary = TRUE)

# Side-by-side text table of the four specifications.
# The column labels and the note mirror the plm() calls that built the models:
# rev_1 to rev_3 are model = "within" with effect = "twoways", while rev_4 is
# model = "pooling". The previous note described a different set of models.
stargazer(
  rev_1, rev_2, rev_3, rev_4,
  type = "text",
  summary = TRUE,
  title = "Comparisons: Fixed Effects Specification",
  column.labels = c("Two-Way FE", "Two-Way FE", "Two-Way FE", "Pooled OLS"),
  notes = "Models 1-3 use two-way (ID and Year) fixed effects; Model 4 is a pooled regression."
)
##
## Comparisons: Fixed Effects Specification
## =========================================================================================================================
## Dependent variable:
## -----------------------------------------------------------------------------------------------
## TobinsQ W_TobinsQ
## (1) (2) (3) (4)
## -------------------------------------------------------------------------------------------------------------------------
## Revenue 0.00000
## (0.00000)
##
## Leverage 10.667**
## (4.775)
##
## log_Revenue 35.687
## (173.259)
##
## W_Leverage 0.412*** 0.661***
## (0.086) (0.095)
##
## W_Revenue -0.189 -0.374***
## (0.356) (0.098)
##
## ICB_BasicMaterials -4.661***
## (0.713)
##
## ICB_ConsumerDiscretionary -3.238***
## (0.453)
##
## ICB_ConsumerStaples -0.603
## (0.682)
##
## ICB_Energy -3.647***
## (0.564)
##
## ICB_Financials -4.049***
## (0.451)
##
## ICB_HealthCare -1.641***
## (0.405)
##
## ICB_Industrials -3.298***
## (0.465)
##
## ICB_RealEstate -4.324***
## (0.633)
##
## ICB_Technology -1.280***
## (0.492)
##
## ICB_Telecommunications 0.731
## (0.961)
##
## ICB_Utilities -3.175***
## (0.812)
##
## Constant 5.365***
## (0.370)
##
## -------------------------------------------------------------------------------------------------------------------------
## Observations 1,145 1,145 1,145 1,145
## R2 0.005 0.00004 0.024 0.227
## Adjusted R2 -0.207 -0.212 -0.184 0.219
## F Statistic 2.530* (df = 2; 943) 0.042 (df = 1; 944) 11.614*** (df = 2; 943) 25.616*** (df = 13; 1131)
## =========================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Models 2 and 4 use two-way fixed effects, while Models 1 and 3 use individual fixed effects.
##Including other variables
#variables to check
# R.D_Expense
# PPE_Total
# GDP
# Specifications with additional regressors. rev_1 to rev_3 are nested
# two-way (ID and Year) within models that add regressors step by step; rev_4
# is the pooled industry-dummy regression, kept for comparison.
# Valid plm `model` values: "within", "random", "ht", "between", "pooling".

# (1) Adds board, ESG and employment variables to leverage and revenue.
rev_1 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (2) Model (1) plus Audit_Expertise and CEO_Board_Member.
rev_2 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment +
    Audit_Expertise +
    CEO_Board_Member,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (3) Model (2) plus Employees_FTE_Period_End, PCE, Shareholders_Score and
# Voting_Right_Share.
rev_3 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment +
    Audit_Expertise +
    CEO_Board_Member +
    Employees_FTE_Period_End +
    PCE +
    Shareholders_Score +
    Voting_Right_Share,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "within"
)

# (4) Leverage, revenue and the ICB industry dummies as a pooled regression.
rev_4 <- plm(
  formula = W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  model = "pooling"
)
# stargazer(c_fe, c_tfe, c_lag_fe, c_lag_tfe, type = "text", summary = TRUE)

# Side-by-side text table of the four specifications.
# The column labels and the note mirror the plm() calls that built the models:
# rev_1 to rev_3 are model = "within" with effect = "twoways", while rev_4 is
# model = "pooling". The previous note described a different set of models.
stargazer(
  rev_1, rev_2, rev_3, rev_4,
  type = "text",
  summary = TRUE,
  title = "Comparisons: Fixed Effects Specification",
  column.labels = c("Two-Way FE", "Two-Way FE", "Two-Way FE", "Pooled OLS"),
  notes = "Models 1-3 use two-way (ID and Year) fixed effects; Model 4 is a pooled regression."
)
##
## Comparisons: Fixed Effects Specification
## ============================================================================================================================
## Dependent variable:
## -----------------------------------------------------------------------------------------------
## W_TobinsQ
## (1) (2) (3) (4)
## ----------------------------------------------------------------------------------------------------------------------------
## W_Leverage 14.077** 14.430** 13.989* 0.661***
## (6.142) (7.101) (7.359) (0.095)
##
## W_Revenue 0.062 0.059 0.072 -0.374***
## (0.363) (0.364) (0.376) (0.098)
##
## Board_Committee -0.738 -0.450 -0.376
## (1.104) (1.166) (1.195)
##
## ESG_Score -0.010 -0.006 -0.011
## (0.016) (0.016) (0.017)
##
## CEO_Chairman_Duality_Current -0.869** -0.673 -0.678
## (0.401) (0.433) (0.442)
##
## Independent_Board 0.009 0.014 0.013
## (0.013) (0.014) (0.014)
##
## Total_Employment 0.0001 -0.0001 -0.00001
## (248.780) (242.866) (42.162)
##
## Audit_Expertise -0.310 -0.334
## (0.525) (0.578)
##
## CEO_Board_Member -1.637* -1.564*
## (0.867) (0.899)
##
## Employees_FTE_Period_End -0.00000
## (0.00003)
##
## PCE 0.00004
## (240.166)
##
## Shareholders_Score 0.004
## (0.007)
##
## Voting_Right_Share 0.565
## (1.769)
##
## ICB_BasicMaterials -4.661***
## (0.713)
##
## ICB_ConsumerDiscretionary -3.238***
## (0.453)
##
## ICB_ConsumerStaples -0.603
## (0.682)
##
## ICB_Energy -3.647***
## (0.564)
##
## ICB_Financials -4.049***
## (0.451)
##
## ICB_HealthCare -1.641***
## (0.405)
##
## ICB_Industrials -3.298***
## (0.465)
##
## ICB_RealEstate -4.324***
## (0.633)
##
## ICB_Technology -1.280***
## (0.492)
##
## ICB_Telecommunications 0.731
## (0.961)
##
## ICB_Utilities -3.175***
## (0.812)
##
## Constant 5.365***
## (0.370)
##
## ----------------------------------------------------------------------------------------------------------------------------
## Observations 383 381 375 1,145
## R2 0.041 0.054 0.056 0.227
## Adjusted R2 -0.454 -0.449 -0.483 0.219
## F Statistic 1.524 (df = 7; 252) 1.575 (df = 9; 248) 1.094 (df = 13; 238) 25.616*** (df = 13; 1131)
## ============================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
## Models 2 and 4 use two-way fixed effects, while Models 1 and 3 use individual fixed effects.
Checking Correlations
Main Variables
# Main variables whose pairwise correlations are inspected.
selected_data <- df[, c(
  "TobinsQ", "log_Net_Income", "Leverage", "Revenue",
  "log_Market_Value", "log_Revenue", "Market_Cap", "log_Market_Cap",
  "W_Revenue", "W_Leverage", "W_TobinsQ"
)]

# Coerce every column to numeric (entries that cannot be converted become NA)
# and replace +/-Inf with NA, all in a single pass over the columns.
selected_data[] <- lapply(selected_data, function(column) {
  values <- as.numeric(as.character(column))
  values[is.infinite(values)] <- NA
  values
})

# Pairwise-complete correlations: each pair uses the rows where both are known.
correlation_matrix <- cor(selected_data, use = "pairwise.complete.obs")

# is.finite() is FALSE for NA, NaN and +/-Inf, so all(is.finite(.)) is TRUE
# exactly when the matrix holds none of them.
if (all(is.finite(correlation_matrix))) {
  # Hierarchical clustering on the distance 1 - correlation.
  dist_matrix <- as.dist(1 - correlation_matrix)
  hc <- hclust(dist_matrix, method = "complete")
  plot(hc) # Plot the dendrogram
} else {
  print("Correlation matrix still contains Inf or NA values")
}

# Optional: draw the correlation matrix, only in interactive sessions and only
# when the matrix is clean.
if (interactive() && all(is.finite(correlation_matrix))) {
  library(corrplot)
  corrplot(
    correlation_matrix,
    method = "number",
    type = "upper",
    order = "hclust",
    tl.col = "black", # Text label colour
    tl.srt = 45, # Text label rotation
    addCoef.col = "black" # Add correlation coefficients in black
  )
}
Correlation matrix - This is with all the data
# Variables we need to check before use:
#   "S.P500"
#   "BaaCorpBond"
#   "RealEstate"
#   "Shares_Outstanding_Total"

# Columns of interest
selected_data <- df[, c(
  "SIC_Code", "Market_Cap", "Revenue", "Net_Income",
  "Assets_Actual", "Assets_Total",
  "Debt_Actual", "Debt_Total", "Debt_LongTerm",
  "ROA_Actual", "ROE_Actual",
  "Common_Shares_Total",
  "Board_Committee",
  "Executives_Compensation", "Exec_Comp_To_Revenue",
  "CEO_Chairman_Duality_Current", "CEO_Chairman_Duality",
  "CEO_Board_Member", "CEO_Comp_Link_TSR",
  "Voting_Right_Share",
  "ESG_Score", "Governance_Score", "Shareholders_Score",
  "Comp_Controversies_Score",
  "Independent_Board", "Comp_LT_Objectives", "Audit_Expertise",
  "R.D_Expense", "R.D_Expense_Surprise", "Advertising_Expense",
  "PPE_Total", "Capex_Total",
  "Employees_FTE_Period_End",
  "Equity_Total", "Market_Value",
  "GDP", "PerCapitaPCE", "PerCapitaIncome", "PCE", "Personal_Income",
  "RealGDP_Millions", "RealPersonalIncome_Millions",
  "Total_Employment",
  "Leverage", "Solvency", "TobinsQ",
  "log_Assets", "log_Revenue"
)]

# Coerce every column to numeric (entries that cannot be converted become NA)
# and replace +/-Inf with NA, all in a single pass over the columns.
selected_data[] <- lapply(selected_data, function(column) {
  values <- as.numeric(as.character(column))
  values[is.infinite(values)] <- NA
  values
})

# Pairwise-complete correlations: each pair uses the rows where both are known.
correlation_matrix <- cor(selected_data, use = "pairwise.complete.obs")

# is.finite() is FALSE for NA, NaN and +/-Inf, so all(is.finite(.)) is TRUE
# exactly when the matrix holds none of them.
if (all(is.finite(correlation_matrix))) {
  # Hierarchical clustering on the distance 1 - correlation.
  dist_matrix <- as.dist(1 - correlation_matrix)
  hc <- hclust(dist_matrix, method = "complete")
  plot(hc) # Plot the dendrogram
} else {
  print("Correlation matrix still contains Inf or NA values")
}

# Optional: draw the correlation matrix, only in interactive sessions and only
# when the matrix is clean.
if (interactive() && all(is.finite(correlation_matrix))) {
  library(corrplot)
  corrplot(
    correlation_matrix,
    method = "number",
    type = "upper",
    order = "hclust",
    tl.col = "black", # Text label colour
    tl.srt = 45, # Text label rotation
    addCoef.col = "black" # Add correlation coefficients in black
  )
}
# Save the dendrogram to a PNG file, but only when the correlation matrix is
# clean (is.finite() is FALSE for NA, NaN and +/-Inf).
if (all(is.finite(correlation_matrix))) {
  dist_matrix <- as.dist(1 - correlation_matrix)
  hc <- hclust(dist_matrix, method = "complete")
  # Open a PNG device
  png("hierarchical_clustering_dendrogram.png", width = 1920, height = 1080)
  # `finally` closes the device even if plot() fails, so an error cannot leave
  # the PNG device open and capture later plots. invisible() keeps the value
  # of the expression (previously dev.off()'s "png 2") out of the report.
  invisible(tryCatch(plot(hc), finally = dev.off()))
}
## png
## 2
Correlation matrix - Dummies
# Columns of interest: the ICB industry dummies.
selected_data <- df[, c(
  "ICB_BasicMaterials",
  "ICB_ConsumerDiscretionary",
  "ICB_ConsumerStaples",
  "ICB_Energy",
  "ICB_Financials",
  "ICB_HealthCare",
  "ICB_Industrials",
  "ICB_RealEstate",
  "ICB_Technology",
  "ICB_Telecommunications",
  "ICB_Utilities"
)]

# Pearson correlation matrix (cor()'s default method), computed on the rows
# that have no missing value in any of the selected columns.
correlation_matrix <- cor(selected_data, use = "complete.obs")

# Matrix that will hold the pairwise Phi coefficients.
phi_matrix <- matrix(
  nrow = ncol(selected_data),
  ncol = ncol(selected_data),
  dimnames = list(colnames(selected_data), colnames(selected_data))
)

# Fill the diagonal and the upper triangle, mirroring each value into the
# lower triangle. seq_len() keeps the outer loop empty when there are no
# columns (1:0 would iterate over c(1, 0)); inside the loop i never exceeds
# ncol(selected_data), so i:ncol(selected_data) always counts upwards.
for (i in seq_len(ncol(selected_data))) {
  for (j in i:ncol(selected_data)) {
    if (i == j) {
      phi_matrix[i, j] <- 1 # The correlation of a variable with itself is 1
    } else {
      # Phi coefficient from the contingency table of the two columns
      temp_table <- table(selected_data[, i], selected_data[, j])
      phi_matrix[i, j] <- phi_matrix[j, i] <- assocstats(temp_table)$phi
    }
  }
}
# Display the Phi coefficient matrix
# print(phi_matrix)

# Visual inspection.
# Reuse the NA-aware Pearson matrix computed above. Recomputing it with cor()'s
# default NA handling (as was done before) turns every entry that involves a
# column with a missing value into NA, leaving empty tiles in the heatmap.
cor_matrix <- correlation_matrix

# Melt the correlation matrix into long format (Var1, Var2, value) for ggplot2
melted_cor_matrix <- melt(cor_matrix)

# Heatmap of the correlation matrix
ggplot(melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(midpoint = 0, low = "blue", high = "red", mid = "white") + # Adjust colour scale if needed
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # Angle the labels for readability
    axis.text.y = element_text(angle = 45, vjust = 1)
  )

Ignore from here forward for now
LASSO
# # Prepare data for glmnet
# x <- model.matrix(Market_Cap ~
# Employees_FTE_Period_End+
# Revenue+
# Net_Income+
# Assets_Actual+
# Debt_Actual+
# Debt_LongTerm+
# #ROA_Actual+ #Problem
# #ROE_Actual+ #Problem
# Equity_Total+
# PPE_Total+
# Capex_Total+
# R.D_Expense+
# #R.D_Expense_Surprise+ #Problem
# Advertising_Expense, data = df)[, -1] # Remove intercept
# y <- df$Market_Cap
#
# # Ridge Regression
# ridge_model <- glmnet(x, y, alpha = 0)
# cv_ridge <- cv.glmnet(x, y, alpha = 0)
# best_lambda_ridge <- cv_ridge$lambda.min
# ridge_model_final <- glmnet(x, y, alpha = 0, lambda = best_lambda_ridge)
# coef(ridge_model_final)
#
# # Running the model without one of the highly correlated variables
# linear_model <- lm(Market_Cap ~ Employees_FTE_Period_End + Revenue + Net_Income +
# Assets_Actual + Debt_Actual + Debt_LongTerm + ROA_Actual + ROE_Actual +
# Equity_Total + PPE_Total + Capex_Total + R.D_Expense + R.D_Expense_Surprise +
# Advertising_Expense, data = df)
# Check for VIF
#vif_values <- vif(linear_model)
#print(vif_values)
# Reviewing VIF values and deciding which variables to keep/remove
#Checking variables structure
#
# df<- pdata.frame(df, index = c("ID", "Year"))
# # df[is.na(df)] <- 0
# df[] <- lapply(df, function(x) if(is.factor(x)) as.numeric(x) else x)
# sapply(df,class)
# # colnames(df)
#
#
# # Assuming your data frame is named 'df'
# nan_counts <- sapply(df, function(x) sum(is.na(x)))
# print(nan_counts)
#
# summary_stats <- df %>%
# summarise(across(everything(), list(min = min,
# mean = mean,
# median = median,
# max = max,
# stdev = sd)))
Delete
# management <- plm(Market_Cap ~
# Employees_FTE_Period_End+
# Revenue+
# Net_Income+
# Assets_Actual+
# Debt_Actual+
# Debt_LongTerm+
# #ROA_Actual+ #Problem
# #ROE_Actual+ #Problem
# Equity_Total+
# PPE_Total+
# Capex_Total+
# R.D_Expense+
# #R.D_Expense_Surprise+ #Problem
# Advertising_Expense,
# data = df, effect="twoways", index=c("ID", "Year"), model="within") #“within”, “random”, “ht”, “between”, “pooling”,
# summary(management)
# coeftest(management, vcov=vcovHC, type="HC3")
LASSO
# # Prepare data for glmnet
# x <- model.matrix(Market_Cap ~
# Employees_FTE_Period_End+
# Revenue+
# Net_Income+
# Assets_Actual+
# Debt_Actual+
# Debt_LongTerm+
# #ROA_Actual+ #Problem
# #ROE_Actual+ #Problem
# Equity_Total+
# PPE_Total+
# Capex_Total+
# R.D_Expense+
# #R.D_Expense_Surprise+ #Problem
# Advertising_Expense, data = df)[, -1] # Remove intercept
# y <- df$Market_Cap
#
# # Ridge Regression
# ridge_model <- glmnet(x, y, alpha = 0)
# cv_ridge <- cv.glmnet(x, y, alpha = 0)
# best_lambda_ridge <- cv_ridge$lambda.min
# ridge_model_final <- glmnet(x, y, alpha = 0, lambda = best_lambda_ridge)
# coef(ridge_model_final)
DELETE
https://www.youtube.com/watch?v=v0PUi3S4l64 ## Dealing
with infinite values and NaN
# # Replace Inf/-Inf with NA in specific columns used in the model
# df$TobinsQ[df$TobinsQ == Inf | df$TobinsQ == -Inf] <- NA
# df$Leverage[df$Leverage == Inf | df$Leverage == -Inf] <- NA
# df$Revenue[df$Revenue == Inf | df$Revenue == -Inf] <- NA
#
#
# # Only omit rows with NA in the columns relevant to your model
# df <- df[!is.na(df$TobinsQ) & !is.na(df$Leverage) & !is.na(df$Revenue) & !is.na(df$ID) & !is.na(df$Year),]
# #df <- subset(df, !is.na(TobinsQ) & !is.na(Leverage) & !is.na(ID) & !is.na(Year) & !is.na(Revenue))
#
# # df <- df[!is.na(df$Leverage) & !is.na(df$Leverage) & !is.na(df$ID) & !is.na(df$Year), ]
# # df <- df[!is.na(df$Revenue) & !is.na(df$Revenue) & !is.na(df$ID) & !is.na(df$Year), ]
#
#
#
# # Only omit rows with NA in the columns relevant to your model
# #df <- df[!is.na(df$TobinsQ) & !is.na(df$Leverage) & !is.na(df$ID) & !is.na(df$Year), ]
#
Winsorizing
### ** Examples
#
# library(DescTools)
#
# ## generate data
# # set.seed(9128)
# # x <- round(runif(100) * 100, 1)
# #
# # (d.frm <- DescTools::Sort(data.frame(
# # x,
# # default = Winsorize(x),
# # quantile = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
# # fixed_val = Winsorize(x, val=c(15, 85)),
# # fixed_n = Winsorize(x, val=c(Small(x, k=3)[3], Large(x, k=3)[1])),
# # closest = Winsorize(x, val=unlist(Closest(x, c(30, 70))))
# # )))[c(1:10, 90:100), ]
#
# # df$Revenue
# df$W_Revenue <- Winsorize( df$Revenue, quantile( df$Revenue, probs=c(0.01,0.95), na.rm = FALSE))
# df$W_Leverage <- Winsorize( df$Leverage, quantile( df$Leverage, probs=c(0.01,0.95), na.rm = FALSE))
# df$W_TobinsQ <- Winsorize( df$TobinsQ, quantile( df$TobinsQ, probs=c(0.01,0.95), na.rm = FALSE))
#
# ### ** Examples
#
#
#
# library(DescTools)
#
# ## generate data
# set.seed(9128)
# x <- round(runif(100) * 100, 1)
#
# (d.frm <- DescTools::Sort(data.frame(
# x,
# default = Winsorize(x),
# quantile = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
# fixed_val = Winsorize(x, val=c(15, 85)),
# fixed_n = Winsorize(x, val=c(Small(x, k=3)[3], Large(x, k=3)[1])),
# closest = Winsorize(x, val=unlist(Closest(x, c(30, 70))))
# )))[c(1:10, 90:100), ]
#
#
# PlotLinesA(SetNames(d.frm, rownames=NULL), lwd=2, col=Pal("Tibco"),
# main="Winsorized Vector")
#
#
# z <- 0:10
# # twosided (default):
# Winsorize(z, val=c(2,8))
#
#
# # onesided:
# # ... replace all values > 8 with 8
# Winsorize(z, val=c(min(z), 8))
#
# # ... replace all values < 4 with 4
# Winsorize(z, val=c(4, max(z)))
#
#
#
# # Manual one-sided winsorization function to trim only upper values
# manual_trim_top <- function(x, lower=0.1, upper = 0.95) {
# upper_quantile <- quantile(x, probs = upper, na.rm = TRUE)
# x[x > upper_quantile] <- upper_quantile
# return(x)
# }
#
# # Applying the function to your dataset
# df$W_Revenue <- manual_trim_top(df$Revenue, upper = 0.99)
# df$W_Leverage <- manual_trim_top(df$Leverage, upper = 0.99)
# df$W_TobinsQ <- manual_trim_top(df$TobinsQ, upper = 0.99)
#
#
# quantile = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
#
#
# # Winsorize the variable at the 5th and 95th percentiles
# df$w_TobinsQ <- Winsorize(df$TobinsQ, limits = c(0, 0.01))
# #df$W_TobinsQ <- winsorize(df$TobinsQ, range = quantile(df$TobinsQ, probs = c(0, 0.99)))
# #df$W_Revenue <- winsorize(df$Revenue, range = quantile(df$Revenue, probs = c(0, 0.99)))
# df$W_Leverage <- winsorize(df$Leverage, range = quantile(df$Leverage, probs = c(0, 0.99), na.rm = FALSE))
#
# df$W_Leverage <- Winsorize(df$Leverage, minval = NULL, maxval = NULL, probs = c(0, 0.99), na.rm =TRUE, type =1)
#
# Winsorize(x, val = quantile(x, probs = c(0.05, 0.95), na.rm = FALSE))
#
#
#
# # Correct way to apply Winsorize with the minmax parameter
# df$W_TobinsQ <- Winsorize(df$TobinsQ, minmax = c(0.05, 0.95))
# df$W_Revenue <- Winsorize(df$Revenue, minmax = c(0.05, 0.95))
# df$W_Leverage <- Winsorize(df$Leverage, minmax = c(0.05, 0.95))
#
#
# # Test if any value in TobinsQ is greater than 10
# values_over_10 <- df$TobinsQ[df$TobinsQ > 10]
# result <- any(df$W_TobinsQ > 10)
# # Print the result
# print(values_over_10)
# #print(df$W_TobinsQ)
#
# summary(df$W_TobinsQ)
# summary(df$W_Revenue)
# summary(df$W_Leverage)
#
# sum(is.na(df$W_TobinsQ))
# sum(is.na(df$W_Revenue))
# sum(is.na(df$W_Leverage))
This is a check because I am having problems with these
variables
# # Correct usage of Winsorize with the range argument
# library(DescTools)
#
# df$W_TobinsQ <- Winsorize(df$TobinsQ, range = c(0.05, 0.95))
# df$W_Revenue <- Winsorize(df$Revenue, range = c(0.05, 0.95))
# df$W_Leverage <- Winsorize(df$Leverage, range = c(0.05, 0.95))
#
#
# library(car) # For vif function
# # Assuming W_Revenue and W_Leverage are numeric and no missing values
# fit_lm <- lm(W_TobinsQ ~ W_Revenue + W_Leverage, data = df)
# vif(fit_lm) # High VIF values indicate multicollinearity
#
# # Check for constant variables
# sapply(df[, c("W_TobinsQ", "W_Revenue", "W_Leverage")], function(x) length(unique(x)))