# Clear the workspace: remove every object that ls() lists in the current
# environment before the analysis starts.
rm(list = ls())

# Set the `tinytex.engine` option to pdflatex
# (presumably consulted when the report is rendered to PDF -- TODO confirm).
options(tinytex.engine = "pdflatex")



# TODO: check and remove packages we are not using.
# Install (when missing) and attach every package used by this analysis.
PackageNames <- c(
  "glmnet",
  "stargazer",
  "car",
  "lmtest",
  "plm",
  "ggplot2",
  "dplyr",
  "bslib",
  "sandwich",
  "corrplot",
  "reshape2",
  "vcd",
  "DescTools"
  # "robustHD"
)
for (pkg in PackageNames) {
  # Install only when the package cannot be found in the library paths.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE)
  }
  # library() stops with an error if the package still cannot be loaded,
  # whereas require() would only return FALSE and let the script continue.
  library(pkg, character.only = TRUE)
}
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 4.3.3
## Loading required package: Matrix
## Loaded glmnet 4.1-8
## Loading required package: stargazer
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Loading required package: car
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
## Loading required package: lmtest
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: plm
## Warning: package 'plm' was built under R version 4.3.3
## Loading required package: ggplot2
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plm':
## 
##     between, lag, lead
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: bslib
## 
## Attaching package: 'bslib'
## The following object is masked from 'package:utils':
## 
##     page
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.3.3
## Loading required package: corrplot
## corrplot 0.92 loaded
## Loading required package: reshape2
## Loading required package: vcd
## Warning: package 'vcd' was built under R version 4.3.3
## Loading required package: grid
## Loading required package: DescTools
## Warning: package 'DescTools' was built under R version 4.3.3
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
# Previous input path, kept for reference:
# df <- read.csv("F:/Usuário/Documents/GitHub/IPO/data/processed_2014.csv")
# Read the processed CSV from the hard-coded local path into `df`.
df <- read.csv(file = "C:/workspace/IPO/data/processed_2014.csv")

TobinsQ = firm size + leverage + industry dummies: are the regressors highly correlated?

NOTE: This model is reported with HC3 robust standard errors.

# cor(df$W_Leverage, df$W_Revenue)
# Standardise W_Leverage and W_Revenue to z-scores: (x - mean(x)) / sd(x).
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() drops those so each data-frame
# column stays a plain numeric vector.
df$W_Leverage <- as.numeric(scale(df$W_Leverage))
df$W_Revenue <- as.numeric(scale(df$W_Revenue))


 # Two-way ("twoways") within panel regression of W_TobinsQ on W_Leverage,
 # W_Revenue and the ICB industry dummies, with ID and Year as panel index.
 # NOTE(review): the rendered output lists only W_Leverage and W_Revenue, so
 # the ICB dummies appear to be dropped by the within transformation --
 # confirm that they are time-invariant.
 management <- plm(
   W_TobinsQ ~
     W_Leverage +
     W_Revenue +
     ICB_BasicMaterials +
     ICB_ConsumerDiscretionary +
     ICB_ConsumerStaples +
     ICB_Energy +
     ICB_Financials +
     ICB_HealthCare +
     ICB_Industrials +
     ICB_RealEstate +
     ICB_Technology +
     ICB_Telecommunications +
     ICB_Utilities,
   data = df,
   index = c("ID", "Year"),
   effect = "twoways",
   # Other values for `model`: "random", "ht", "between", "pooling".
   model = "within"
 )
 # summary(management)
 # Coefficient t-tests using the vcovHC covariance estimator. `vcov.` is the
 # full name of coeftest()'s argument; `type = "HC3"` is forwarded to
 # vcovHC() through `...`.
 coeftest(management, vcov. = vcovHC, type = "HC3")
## 
## t test of coefficients:
## 
##            Estimate Std. Error t value Pr(>|t|)   
## W_Leverage  0.41202    0.13642  3.0203 0.002593 **
## W_Revenue  -0.18856    0.22589 -0.8347 0.404091   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

I am running a few basic specifications so we can generate some insights. Note that the first model is TobinsQ = firm size + leverage + industry dummies with two-way fixed effects; the second model uses log revenue. The third and fourth use the winsorized data with the same structure.

NOTE: In the next iteration, all the models will be reported with HC3 robust standard errors (a more severe penalty for heteroskedasticity).

# Model 1: TobinsQ on Revenue, Leverage and the ICB industry dummies,
# fitted as a two-way ("twoways") within model on the ID/Year panel.
rev_1 <- plm(
  TobinsQ ~
    Revenue +
    Leverage +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)

# Model 2: TobinsQ on log_Revenue and the ICB industry dummies (the leverage
# term is commented out), fitted as a two-way within model on ID/Year.
rev_2 <- plm(
  TobinsQ ~
    log_Revenue +
    # leverage +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)

# Model 3: W_TobinsQ on W_Leverage, W_Revenue and the ICB industry dummies,
# fitted as a two-way within model on the ID/Year panel.
rev_3 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)
             
             
# Model 4: same regressors as Model 3, but fitted with model = "pooling"
# (the call still passes effect = "twoways").
rev_4 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "within", "random", "ht", "between".
  model = "pooling"
)
             
  
# stargazer(c_fe, c_tfe, c_lag_fe, c_lag_tfe, type = "text", summary = TRUE)
# Side-by-side text table of rev_1 .. rev_4.
stargazer(
  rev_1, rev_2, rev_3, rev_4,
  type = "text",
  summary = TRUE,
  title = "Comparisons: Fixed Effects Specification",
  # column.labels = c("Two-Way FE", "Two-Way FE", "Two-Way FE", "Pooled OLS"),
  # The note mirrors the plm() calls: rev_1, rev_2 and rev_3 are fitted with
  # effect = "twoways", model = "within"; rev_4 is fitted with
  # model = "pooling".
  notes = "Models 1-3 use two-way (ID and Year) fixed effects, while Model 4 is pooled OLS."
)
## 
## Comparisons: Fixed Effects Specification
## =========================================================================================================================
##                                                                 Dependent variable:                                      
##                           -----------------------------------------------------------------------------------------------
##                                              TobinsQ                                        W_TobinsQ                    
##                                     (1)                    (2)                    (3)                      (4)           
## -------------------------------------------------------------------------------------------------------------------------
## Revenue                           0.00000                                                                                
##                                  (0.00000)                                                                               
##                                                                                                                          
## Leverage                         10.667**                                                                                
##                                   (4.775)                                                                                
##                                                                                                                          
## log_Revenue                                              35.687                                                          
##                                                         (173.259)                                                        
##                                                                                                                          
## W_Leverage                                                                     0.412***                 0.661***         
##                                                                                 (0.086)                  (0.095)         
##                                                                                                                          
## W_Revenue                                                                       -0.189                  -0.374***        
##                                                                                 (0.356)                  (0.098)         
##                                                                                                                          
## ICB_BasicMaterials                                                                                      -4.661***        
##                                                                                                          (0.713)         
##                                                                                                                          
## ICB_ConsumerDiscretionary                                                                               -3.238***        
##                                                                                                          (0.453)         
##                                                                                                                          
## ICB_ConsumerStaples                                                                                      -0.603          
##                                                                                                          (0.682)         
##                                                                                                                          
## ICB_Energy                                                                                              -3.647***        
##                                                                                                          (0.564)         
##                                                                                                                          
## ICB_Financials                                                                                          -4.049***        
##                                                                                                          (0.451)         
##                                                                                                                          
## ICB_HealthCare                                                                                          -1.641***        
##                                                                                                          (0.405)         
##                                                                                                                          
## ICB_Industrials                                                                                         -3.298***        
##                                                                                                          (0.465)         
##                                                                                                                          
## ICB_RealEstate                                                                                          -4.324***        
##                                                                                                          (0.633)         
##                                                                                                                          
## ICB_Technology                                                                                          -1.280***        
##                                                                                                          (0.492)         
##                                                                                                                          
## ICB_Telecommunications                                                                                    0.731          
##                                                                                                          (0.961)         
##                                                                                                                          
## ICB_Utilities                                                                                           -3.175***        
##                                                                                                          (0.812)         
##                                                                                                                          
## Constant                                                                                                5.365***         
##                                                                                                          (0.370)         
##                                                                                                                          
## -------------------------------------------------------------------------------------------------------------------------
## Observations                       1,145                  1,145                  1,145                    1,145          
## R2                                 0.005                 0.00004                 0.024                    0.227          
## Adjusted R2                       -0.207                 -0.212                 -0.184                    0.219          
## F Statistic                2.530* (df = 2; 943)    0.042 (df = 1; 944)  11.614*** (df = 2; 943) 25.616*** (df = 13; 1131)
## =========================================================================================================================
## Note:                                                                                         *p<0.1; **p<0.05; ***p<0.01
##                              Models 2 and 4 use two-way fixed effects, while Models 1 and 3 use individual fixed effects.

## Including other variables

#variables to check 
# R.D_Expense
# PPE_Total
# GDP

# Model 1 (extended): W_TobinsQ on W_Leverage, W_Revenue and a first set of
# additional regressors, fitted as a two-way within model on ID/Year.
rev_1 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)

# Model 2 (extended): Model 1's regressors plus Audit_Expertise and
# CEO_Board_Member, fitted as a two-way within model on ID/Year.
rev_2 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment +
    Audit_Expertise +
    CEO_Board_Member,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)

# Model 3 (extended): Model 2's regressors plus Employees_FTE_Period_End,
# PCE, Shareholders_Score and Voting_Right_Share, fitted as a two-way within
# model on ID/Year.
rev_3 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    Board_Committee +
    ESG_Score +
    CEO_Chairman_Duality_Current +
    Independent_Board +
    Total_Employment +
    Audit_Expertise +
    CEO_Board_Member +
    Employees_FTE_Period_End +
    PCE +
    Shareholders_Score +
    Voting_Right_Share,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "random", "ht", "between", "pooling".
  model = "within"
)
             
             
# Model 4: W_TobinsQ on W_Leverage, W_Revenue and the ICB industry dummies,
# fitted with model = "pooling" (the call still passes effect = "twoways").
rev_4 <- plm(
  W_TobinsQ ~
    W_Leverage +
    W_Revenue +
    ICB_BasicMaterials +
    ICB_ConsumerDiscretionary +
    ICB_ConsumerStaples +
    ICB_Energy +
    ICB_Financials +
    ICB_HealthCare +
    ICB_Industrials +
    ICB_RealEstate +
    ICB_Technology +
    ICB_Telecommunications +
    ICB_Utilities,
  data = df,
  index = c("ID", "Year"),
  effect = "twoways",
  # Other values for `model`: "within", "random", "ht", "between".
  model = "pooling"
)
             
  
# stargazer(c_fe, c_tfe, c_lag_fe, c_lag_tfe, type = "text", summary = TRUE)
# Side-by-side text table of rev_1 .. rev_4.
stargazer(
  rev_1, rev_2, rev_3, rev_4,
  type = "text",
  summary = TRUE,
  title = "Comparisons: Fixed Effects Specification",
  # column.labels = c("Two-Way FE", "Two-Way FE", "Two-Way FE", "Pooled OLS"),
  # The note mirrors the plm() calls: rev_1, rev_2 and rev_3 are fitted with
  # effect = "twoways", model = "within"; rev_4 is fitted with
  # model = "pooling".
  notes = "Models 1-3 use two-way (ID and Year) fixed effects, while Model 4 is pooled OLS."
)
## 
## Comparisons: Fixed Effects Specification
## ============================================================================================================================
##                                                                    Dependent variable:                                      
##                              -----------------------------------------------------------------------------------------------
##                                                                         W_TobinsQ                                           
##                                        (1)                    (2)                    (3)                      (4)           
## ----------------------------------------------------------------------------------------------------------------------------
## W_Leverage                          14.077**                14.430**               13.989*                 0.661***         
##                                      (6.142)                (7.101)                (7.359)                  (0.095)         
##                                                                                                                             
## W_Revenue                             0.062                  0.059                  0.072                  -0.374***        
##                                      (0.363)                (0.364)                (0.376)                  (0.098)         
##                                                                                                                             
## Board_Committee                      -0.738                  -0.450                 -0.376                                  
##                                      (1.104)                (1.166)                (1.195)                                  
##                                                                                                                             
## ESG_Score                            -0.010                  -0.006                 -0.011                                  
##                                      (0.016)                (0.016)                (0.017)                                  
##                                                                                                                             
## CEO_Chairman_Duality_Current        -0.869**                 -0.673                 -0.678                                  
##                                      (0.401)                (0.433)                (0.442)                                  
##                                                                                                                             
## Independent_Board                     0.009                  0.014                  0.013                                   
##                                      (0.013)                (0.014)                (0.014)                                  
##                                                                                                                             
## Total_Employment                     0.0001                 -0.0001                -0.00001                                 
##                                     (248.780)              (242.866)               (42.162)                                 
##                                                                                                                             
## Audit_Expertise                                              -0.310                 -0.334                                  
##                                                             (0.525)                (0.578)                                  
##                                                                                                                             
## CEO_Board_Member                                            -1.637*                -1.564*                                  
##                                                             (0.867)                (0.899)                                  
##                                                                                                                             
## Employees_FTE_Period_End                                                           -0.00000                                 
##                                                                                   (0.00003)                                 
##                                                                                                                             
## PCE                                                                                0.00004                                  
##                                                                                   (240.166)                                 
##                                                                                                                             
## Shareholders_Score                                                                  0.004                                   
##                                                                                    (0.007)                                  
##                                                                                                                             
## Voting_Right_Share                                                                  0.565                                   
##                                                                                    (1.769)                                  
##                                                                                                                             
## ICB_BasicMaterials                                                                                         -4.661***        
##                                                                                                             (0.713)         
##                                                                                                                             
## ICB_ConsumerDiscretionary                                                                                  -3.238***        
##                                                                                                             (0.453)         
##                                                                                                                             
## ICB_ConsumerStaples                                                                                         -0.603          
##                                                                                                             (0.682)         
##                                                                                                                             
## ICB_Energy                                                                                                 -3.647***        
##                                                                                                             (0.564)         
##                                                                                                                             
## ICB_Financials                                                                                             -4.049***        
##                                                                                                             (0.451)         
##                                                                                                                             
## ICB_HealthCare                                                                                             -1.641***        
##                                                                                                             (0.405)         
##                                                                                                                             
## ICB_Industrials                                                                                            -3.298***        
##                                                                                                             (0.465)         
##                                                                                                                             
## ICB_RealEstate                                                                                             -4.324***        
##                                                                                                             (0.633)         
##                                                                                                                             
## ICB_Technology                                                                                             -1.280***        
##                                                                                                             (0.492)         
##                                                                                                                             
## ICB_Telecommunications                                                                                       0.731          
##                                                                                                             (0.961)         
##                                                                                                                             
## ICB_Utilities                                                                                              -3.175***        
##                                                                                                             (0.812)         
##                                                                                                                             
## Constant                                                                                                   5.365***         
##                                                                                                             (0.370)         
##                                                                                                                             
## ----------------------------------------------------------------------------------------------------------------------------
## Observations                           383                    381                    375                     1,145          
## R2                                    0.041                  0.054                  0.056                    0.227          
## Adjusted R2                          -0.454                  -0.449                 -0.483                   0.219          
## F Statistic                    1.524 (df = 7; 252)    1.575 (df = 9; 248)    1.094 (df = 13; 238)  25.616*** (df = 13; 1131)
## ============================================================================================================================
## Note:                                                                                            *p<0.1; **p<0.05; ***p<0.01
##                                 Models 2 and 4 use two-way fixed effects, while Models 1 and 3 use individual fixed effects.

Checking Correlations

Main Variables

# Keep only the columns that enter the main-variable correlation analysis.
selected_data <- df[, c(
  "TobinsQ",
  "log_Net_Income",
  "Leverage",
  "Revenue",
  "log_Market_Value",
  "log_Revenue",
  "Market_Cap",
  "log_Market_Cap",
  "W_Revenue",
  "W_Leverage",
  "W_TobinsQ"
)]

# Assumes `selected_data` is a data frame.

# Coerce every column to a plain numeric vector. Columns that are already
# numeric only pass through as.numeric(), so they are not round-tripped
# through character (as.character() keeps at most 15 significant digits).
# Any other column is converted via character; values that cannot be parsed
# become NA (with a coercion warning).
selected_data[] <- lapply(selected_data, function(x) {
  if (is.numeric(x)) {
    as.numeric(x)
  } else {
    as.numeric(as.character(x))
  }
})

# Replace infinite values, if any, with NA.
selected_data[] <- lapply(selected_data, function(x) {
  x[is.infinite(x)] <- NA
  x
})

# Correlation matrix; each pair of columns uses the rows where both values
# are present ("pairwise.complete.obs").
correlation_matrix <- cor(selected_data, use = "pairwise.complete.obs")

# Cluster the variables hierarchically, but only when the correlation matrix
# holds no NA and no infinite entries; otherwise report the problem.
if (!anyNA(correlation_matrix) && !any(is.infinite(correlation_matrix))) {
  # Use 1 - correlation as the dissimilarity between variables.
  dist_matrix <- as.dist(1 - correlation_matrix)
  hc <- hclust(dist_matrix, method = "complete")
  plot(hc) # dendrogram
} else {
  print("Correlation matrix still contains Inf or NA values")
}

# Optional: draw the correlation plot when the session is interactive and the
# matrix holds no NA or infinite entries.
if (interactive() &&
      !(anyNA(correlation_matrix) || any(is.infinite(correlation_matrix)))) {
  library(corrplot)
  corrplot(
    correlation_matrix,
    method = "number",
    type = "upper",
    order = "hclust",
    tl.col = "black",     # text label colour
    tl.srt = 45,          # text label rotation
    addCoef.col = "black" # correlation coefficients in black
  )
}

Correlation matrix - This is with all the data

# Variables we need to check before use
# "S.P500",
# "BaaCorpBond"
# "RealEstate"
# "Shares_Outstanding_Total"

# Columns of interest for the correlation screen over the full data set
selected_data <- df[, c(
  "SIC_Code",
  "Market_Cap",
  "Revenue",
  "Net_Income",
  "Assets_Actual",
  "Assets_Total",
  "Debt_Actual",
  "Debt_Total",
  "Debt_LongTerm",
  "ROA_Actual",
  "ROE_Actual",
  "Common_Shares_Total",
  "Board_Committee",
  "Executives_Compensation",
  "Exec_Comp_To_Revenue",
  "CEO_Chairman_Duality_Current",
  "CEO_Chairman_Duality",
  "CEO_Board_Member",
  "CEO_Comp_Link_TSR",
  "Voting_Right_Share",
  "ESG_Score",
  "Governance_Score",
  "Shareholders_Score",
  "Comp_Controversies_Score",
  "Independent_Board",
  "Comp_LT_Objectives",
  "Audit_Expertise",
  "R.D_Expense",
  "R.D_Expense_Surprise",
  "Advertising_Expense",
  "PPE_Total",
  "Capex_Total",
  "Employees_FTE_Period_End",
  "Equity_Total",
  "Market_Value",
  "GDP",
  "PerCapitaPCE",
  "PerCapitaIncome",
  "PCE",
  "Personal_Income",
  "RealGDP_Millions",
  "RealPersonalIncome_Millions",
  "Total_Employment",
  "Leverage",
  "Solvency",
  "TobinsQ",
  "log_Assets",
  "log_Revenue"
)]
# NOTE(review): SIC_Code looks like a categorical industry code; its numeric
# correlation with the other columns may not be meaningful -- confirm.

# Coerce every column to numeric (values that cannot be parsed become NA) and
# replace +/-Inf with NA so they do not distort the correlations.
selected_data[] <- lapply(selected_data, function(column) {
  values <- as.numeric(as.character(column))
  values[is.infinite(values)] <- NA
  values
})

# Pairwise deletion keeps as many observations as possible per coefficient
correlation_matrix <- cor(selected_data, use = "pairwise.complete.obs")

# Evaluate the "clean matrix" condition once; every step below depends on it.
# is.finite() is FALSE for NA, NaN and +/-Inf alike.
matrix_is_clean <- all(is.finite(correlation_matrix))

if (matrix_is_clean) {
  # Hierarchical clustering of the variables, using 1 - r as dissimilarity.
  # Computed once and reused for both the on-screen and the PNG dendrogram.
  dist_matrix <- as.dist(1 - correlation_matrix)
  hc <- hclust(dist_matrix, method = "complete")
  plot(hc)  # Plot the dendrogram on the current device
} else {
  print("Correlation matrix still contains Inf or NA values")
}

# Optional: plot the correlation matrix if interactive and clean
if (interactive() && matrix_is_clean) {
  library(corrplot)
  corrplot(correlation_matrix, method = "number", type = "upper",
           order = "hclust",
           tl.col = "black", tl.srt = 45,  # Text label color and rotation
           addCoef.col = "black")  # Add correlation coefficients in black
}

# Save a large copy of the same dendrogram to disk
if (matrix_is_clean) {
  png("hierarchical_clustering_dendrogram.png", width = 1920, height = 1080)
  plot(hc)
  dev.off()  # Close the PNG device so the file is written
}
## png 
##   2

Correlation matrix - Dummies

# Columns of interest: the ICB industry dummy variables
selected_data <- df[, c("ICB_BasicMaterials",
                        "ICB_ConsumerDiscretionary",
                        "ICB_ConsumerStaples",
                        "ICB_Energy",
                        "ICB_Financials",
                        "ICB_HealthCare",
                        "ICB_Industrials",
                        "ICB_RealEstate",
                        "ICB_Technology",
                        "ICB_Telecommunications",
                        "ICB_Utilities")]

# Pearson correlation matrix on complete cases (rows with any NA are dropped)
correlation_matrix <- cor(selected_data, use = "complete.obs")

# Phi coefficients for every pair of dummies. The matrix is preallocated as
# numeric with the diagonal set to 1 (a variable's association with itself).
n_dummies <- ncol(selected_data)
phi_matrix <- matrix(NA_real_, nrow = n_dummies, ncol = n_dummies,
                     dimnames = list(colnames(selected_data),
                                     colnames(selected_data)))
diag(phi_matrix) <- 1

# Only the upper triangle is computed; each value is mirrored to the lower one.
# seq_len() keeps the loop empty (instead of running over c(1, 0)) if there
# are no columns.
for (i in seq_len(n_dummies)) {
  for (j in i:n_dummies) {
    if (i == j) {
      next
    }
    # Cross-tabulate the pair; vcd::assocstats() reports phi for 2x2 tables
    # (see the vcd documentation -- it is NA for other table shapes).
    temp_table <- table(selected_data[, i], selected_data[, j])
    phi_matrix[i, j] <- phi_matrix[j, i] <- assocstats(temp_table)$phi
  }
}
# Display the Phi coefficient matrix
# print(phi_matrix)

# Visual inspection. Reuse the complete-case Pearson matrix computed above:
# calling cor() again without `use` would return NA for every pair involving
# a column with a missing value and leave blank tiles in the heatmap.
cor_matrix <- correlation_matrix

# Melt the correlation matrix to long format (Var1, Var2, value) for ggplot2
melted_cor_matrix <- melt(cor_matrix)

# Heatmap of the correlation matrix
ggplot(melted_cor_matrix, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  # Diverging scale centred on zero correlation
  scale_fill_gradient2(midpoint = 0, low = "blue", high = "red",
                       mid = "white") +
  theme_minimal() +
  # Rotate the axis labels for better visibility
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        axis.text.y = element_text(angle = 45, vjust = 1))

Ignore from here forward for now

LASSO

# # Prepare data for glmnet
# x <- model.matrix(Market_Cap ~                
#                 Employees_FTE_Period_End+
#                 Revenue+
#                 Net_Income+
#                 Assets_Actual+
#                 Debt_Actual+ 
#                 Debt_LongTerm+
#                 #ROA_Actual+ #Problem
#                 #ROE_Actual+ #Problem
#                 Equity_Total+
#                 PPE_Total+
#                 Capex_Total+
#                 R.D_Expense+
#                 #R.D_Expense_Surprise+ #Problem
#                 Advertising_Expense, data = df)[, -1]  # Remove intercept
# y <- df$Market_Cap
# 
# # Ridge Regression
# ridge_model <- glmnet(x, y, alpha = 0)
# cv_ridge <- cv.glmnet(x, y, alpha = 0)
# best_lambda_ridge <- cv_ridge$lambda.min
# ridge_model_final <- glmnet(x, y, alpha = 0, lambda = best_lambda_ridge)
# coef(ridge_model_final)
# 
# # Running the model without one of the highly correlated variables
# linear_model <- lm(Market_Cap ~ Employees_FTE_Period_End + Revenue + Net_Income +
#                    Assets_Actual + Debt_Actual + Debt_LongTerm + ROA_Actual + ROE_Actual +
#                    Equity_Total + PPE_Total + Capex_Total + R.D_Expense + R.D_Expense_Surprise +
#                    Advertising_Expense, data = df)

# Check for VIF
#vif_values <- vif(linear_model)
#print(vif_values)

# Reviewing VIF values and deciding which variables to keep/remove

#Checking variables structure

# 
# df<- pdata.frame(df, index = c("ID", "Year"))
# # df[is.na(df)] <- 0
# df[] <- lapply(df, function(x) if(is.factor(x)) as.numeric(x) else x)
# sapply(df,class)
# # colnames(df)
# 
# 
# # Assuming your data frame is named 'df'
# nan_counts <- sapply(df, function(x) sum(is.na(x)))
# print(nan_counts)
# 
# summary_stats <- df %>%
#   summarise(across(everything(), list(min = min, 
#                                       mean = mean, 
#                                       median = median, 
#                                       max = max, 
#                                       stdev = sd)))
Delete
# management <- plm(Market_Cap ~                
#                 Employees_FTE_Period_End+
#                 Revenue+
#                 Net_Income+
#                 Assets_Actual+
#                 Debt_Actual+ 
#                 Debt_LongTerm+
#                 #ROA_Actual+ #Problem
#                 #ROE_Actual+ #Problem
#                 Equity_Total+
#                 PPE_Total+
#                 Capex_Total+
#                 R.D_Expense+
#                 #R.D_Expense_Surprise+ #Problem
#                 Advertising_Expense,
#                 data = df, effect="twoways", index=c("ID", "Year"),  model="within") #“within”, “random”, “ht”, “between”, “pooling”,
# summary(management)
# coeftest(management, vcov=vcovHC, type="HC3") 

LASSO

# # Prepare data for glmnet
# x <- model.matrix(Market_Cap ~                
#                 Employees_FTE_Period_End+
#                 Revenue+
#                 Net_Income+
#                 Assets_Actual+
#                 Debt_Actual+ 
#                 Debt_LongTerm+
#                 #ROA_Actual+ #Problem
#                 #ROE_Actual+ #Problem
#                 Equity_Total+
#                 PPE_Total+
#                 Capex_Total+
#                 R.D_Expense+
#                 #R.D_Expense_Surprise+ #Problem
#                 Advertising_Expense, data = df)[, -1]  # Remove intercept
# y <- df$Market_Cap
# 
# # Ridge Regression
# ridge_model <- glmnet(x, y, alpha = 0)
# cv_ridge <- cv.glmnet(x, y, alpha = 0)
# best_lambda_ridge <- cv_ridge$lambda.min
# ridge_model_final <- glmnet(x, y, alpha = 0, lambda = best_lambda_ridge)
# coef(ridge_model_final)

DELETE

https://www.youtube.com/watch?v=v0PUi3S4l64 ## Dealing with infinite values and NaN

# # Replace Inf/-Inf with NA in specific columns used in the model
# df$TobinsQ[df$TobinsQ == Inf | df$TobinsQ == -Inf] <- NA
# df$Leverage[df$Leverage == Inf | df$Leverage == -Inf] <- NA
# df$Revenue[df$Revenue == Inf | df$Revenue == -Inf] <- NA
# 
# 
# # Only omit rows with NA in the columns relevant to your model
# df <- df[!is.na(df$TobinsQ) & !is.na(df$Leverage) & !is.na(df$Revenue) & !is.na(df$ID) & !is.na(df$Year),]
# #df <- subset(df, !is.na(TobinsQ) & !is.na(Leverage) & !is.na(ID) & !is.na(Year) & !is.na(Revenue))
# 
# # df <- df[!is.na(df$Leverage) & !is.na(df$Leverage) & !is.na(df$ID) & !is.na(df$Year), ]
# # df <- df[!is.na(df$Revenue) & !is.na(df$Revenue) & !is.na(df$ID) & !is.na(df$Year), ]
# 
# 
# 
# # Only omit rows with NA in the columns relevant to your model
# #df <- df[!is.na(df$TobinsQ) & !is.na(df$Leverage) & !is.na(df$ID) & !is.na(df$Year), ]
# 

Winsorizing

### ** Examples


# 
# library(DescTools)
# 
# ## generate data
# # set.seed(9128)
# # x <- round(runif(100) * 100, 1)
# # 
# # (d.frm <- DescTools::Sort(data.frame(
# #   x,
# #   default   = Winsorize(x),
# #   quantile  = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
# #   fixed_val = Winsorize(x, val=c(15, 85)),
# #   fixed_n   = Winsorize(x, val=c(Small(x, k=3)[3], Large(x, k=3)[1])),
# #   closest   = Winsorize(x, val=unlist(Closest(x, c(30, 70))))
# # )))[c(1:10, 90:100), ]
# 
# # df$Revenue
# df$W_Revenue <-  Winsorize( df$Revenue, quantile( df$Revenue, probs=c(0.01,0.95), na.rm = FALSE))
# df$W_Leverage <- Winsorize( df$Leverage, quantile( df$Leverage, probs=c(0.01,0.95), na.rm = FALSE))
# df$W_TobinsQ <- Winsorize( df$TobinsQ, quantile( df$TobinsQ, probs=c(0.01,0.95), na.rm = FALSE))
# 
# ### ** Examples
# 
# 
# 
# library(DescTools)
# 
# ## generate data
# set.seed(9128)
#  x <- round(runif(100) * 100, 1)
# 
#  (d.frm <- DescTools::Sort(data.frame(
#    x,
#    default   = Winsorize(x),
#    quantile  = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
#    fixed_val = Winsorize(x, val=c(15, 85)),
#    fixed_n   = Winsorize(x, val=c(Small(x, k=3)[3], Large(x, k=3)[1])),
#    closest   = Winsorize(x, val=unlist(Closest(x, c(30, 70))))
#  )))[c(1:10, 90:100), ]
# 
# 
#  PlotLinesA(SetNames(d.frm, rownames=NULL), lwd=2, col=Pal("Tibco"),
#            main="Winsorized Vector")
# 
# 
# z <- 0:10
# # twosided (default):
# Winsorize(z, val=c(2,8))
# 
# 
# # onesided:
# # ... replace all values > 8 with 8
# Winsorize(z, val=c(min(z), 8))
# 
# # ... replace all values < 4 with 4
# Winsorize(z, val=c(4, max(z)))
# 
# 
# 
# # Manual one-sided winsorization function to trim only upper values
# manual_trim_top <- function(x, lower=0.1, upper = 0.95) {
#   upper_quantile <- quantile(x, probs = upper, na.rm = TRUE)
#   x[x > upper_quantile] <- upper_quantile
#   return(x)
# }
# 
# # Applying the function to your dataset
# df$W_Revenue <- manual_trim_top(df$Revenue, upper = 0.99)
# df$W_Leverage <- manual_trim_top(df$Leverage, upper = 0.99)
# df$W_TobinsQ <- manual_trim_top(df$TobinsQ, upper = 0.99)
# 
# 
#   quantile  = Winsorize(x, quantile(x, probs=c(0.1, 0.8), na.rm = FALSE)),
# 
# 
# # Winsorize the variable at the 5th and 95th percentiles
# df$w_TobinsQ <- Winsorize(df$TobinsQ, limits = c(0, 0.01))
# #df$W_TobinsQ <- winsorize(df$TobinsQ, range = quantile(df$TobinsQ, probs = c(0, 0.99)))
# #df$W_Revenue <- winsorize(df$Revenue, range = quantile(df$Revenue, probs = c(0, 0.99)))
# df$W_Leverage <- winsorize(df$Leverage, range = quantile(df$Leverage, probs = c(0, 0.99), na.rm = FALSE))
# 
# df$W_Leverage <- Winsorize(df$Leverage, minval = NULL, maxval = NULL, probs = c(0, 0.99), na.rm =TRUE, type =1)
# 
# Winsorize(x, val = quantile(x, probs = c(0.05, 0.95), na.rm = FALSE))
# 
# 
# 
# # Correct way to apply Winsorize with the minmax parameter
# df$W_TobinsQ <- Winsorize(df$TobinsQ, minmax = c(0.05, 0.95))
# df$W_Revenue <- Winsorize(df$Revenue, minmax = c(0.05, 0.95))
# df$W_Leverage <- Winsorize(df$Leverage, minmax = c(0.05, 0.95))
# 
# 
# # Test if any value in TobinsQ is greater than 10
# values_over_10 <- df$TobinsQ[df$TobinsQ > 10]
# result <- any(df$W_TobinsQ > 10)
# # Print the result
# print(values_over_10) 
# #print(df$W_TobinsQ)
# 
# summary(df$W_TobinsQ)
# summary(df$W_Revenue)
# summary(df$W_Leverage)
# 
# sum(is.na(df$W_TobinsQ))
# sum(is.na(df$W_Revenue))
# sum(is.na(df$W_Leverage))

This is a check because I am having problems with these variables

# # Correct usage of Winsorize with the range argument
# library(DescTools)
# 
# df$W_TobinsQ <- Winsorize(df$TobinsQ, range = c(0.05, 0.95))
# df$W_Revenue <- Winsorize(df$Revenue, range = c(0.05, 0.95))
# df$W_Leverage <- Winsorize(df$Leverage, range = c(0.05, 0.95))
# 
# 
# library(car)  # For vif function
# # Assuming W_Revenue and W_Leverage are numeric and no missing values
# fit_lm <- lm(W_TobinsQ ~ W_Revenue + W_Leverage, data = df)
# vif(fit_lm)  # High VIF values indicate multicollinearity
# 
# # Check for constant variables
# sapply(df[, c("W_TobinsQ", "W_Revenue", "W_Leverage")], function(x) length(unique(x)))