LM_DIAGNOSTIC R Function

Brian Scott

R Script Description

This function allows a user to quickly run diagnostics on a regression model to determine whether linear regression is appropriate.

Reason for function creation

The summary output from R’s summary() function does not have enough information to fully diagnose the model. This function adds plots for checking homoscedasticity, an autocorrelation test, and a multicollinearity test. Additionally, the output is saved as data frames, so it can easily be exported to other programs (an export sketch follows the example call below).

Importing the data and renaming the file as DF

This data is masked; each variable was renamed var1, var2, and so on (a sketch of that renaming step follows the import code below).

library(readxl)
MaskedData <- read_excel("F:/Summer2022/MaskedData.xlsx", 
    col_types = c("date", rep("numeric", 33)))
DF <- MaskedData
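
In case the renaming needs to be reproduced in R, a minimal sketch of that masking step is shown below. The object RawData and its file path are illustrative stand-ins for the original, unmasked spreadsheet; the first column is assumed to be the date and is left as-is.

#Hypothetical masking sketch: keep the date column and rename every other column to var1, var2, ...
RawData <- read_excel("F:/Summer2022/RawData.xlsx")  #illustrative path, not the real file
names(RawData)[-1] <- paste0("var", seq_len(ncol(RawData) - 1))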

LM_DIAGNOSTIC Function

Inputs:

The function has three inputs.

  1. df: the data frame containing the variables for the model
  2. yVar: the name of the dependent variable for the regression model
  3. xVars: a character vector of independent variable names for the model

Outputs:

This function produces seven outputs.

  1. Variable Summary Table: including VIF values for multicollinearity testing
  2. Model Summary Table: including the Durbin-Watson statistic for autocorrelation testing
  3. Residuals Histogram: to check for normally distributed errors
  4. Fitted vs Residuals Plot: to test for heteroscedasticity
  5. Normal Q-Q Plot: to test for normal distribution in errors (similar to the histogram)
  6. Scale-Location Plot: additional testing for heteroscedasticity
  7. Residuals vs Leverage Plot: to help determine whether there are any outliers
LM_DIAGNOSTIC <- function(df, yVar, xVars){
  
  require(car)
  require(gridExtra)
  
  #Create Y variable for regression model
  depV <- paste(yVar, "~")
  
  #create X variable string for regression model
  indepV <- paste(xVars, collapse = " + ")
  
  #Combine to form the full model
  formula <- paste(depV, indepV)
  
  #Run regression model
  regM <- lm(as.formula(formula), data = df)
  
  #Collect P Values and coefficients and make a dataframe
  pcValues <- data.frame(summary(regM)$coefficients[,1],summary(regM)$coefficients[,4])
  names(pcValues)[1] <- "Estimate"
  names(pcValues)[2] <- "P Value"
  pcValues$VarSig <- ifelse(pcValues$`P Value` < .05, "True", "False")
  
  #Multicollinearity Test, VIF Values as a dataframe
  vifDF <- data.frame(car::vif(regM))
  names(vifDF)[1] <- "VIF"
  vifDF$NoMulticollinearity <- ifelse(vifDF$VIF < 5, "True", "False")
  
  #Summary Table for Variables
  variableSummary <- merge(pcValues,vifDF, by = 'row.names', all = TRUE)
  names(variableSummary)[1] <- "Model Variables"
  variableSummary <<- variableSummary
  variableSummary <- t(variableSummary)
  
  #Adjusted R Square as dataframe
  adjR <- data.frame(summary(regM)$adj.r.squared)
  names(adjR)[1] <- "AdjRSquare"
  row.names(adjR)[1] <- "Model Fit"
  
  #creates a new column that states the strength of the RSquare
  adjR$ModelFit <- ifelse(adjR$`AdjRSquare` > .9, "Strong", 
                                  ifelse(adjR$`AdjRSquare` > .75, "Moderate",
                                         ifelse(adjR$`AdjRSquare` > .5, "Weak", "Insignificant" )))
 
  
  #Autocorrelation Test, Durbin Watson as dataframe
  dw <- car::durbinWatsonTest(regM)
  dwStat <- data.frame(dw$dw)
  names(dwStat)[1] <- "DWStat"
  row.names(dwStat)[1] <- "Autocorrelation Info"
  
  #creates a column that states if the durbin watson test indicates if there is autocorrelation 
  dwStat$Autocorr <- ifelse(dwStat$`DWStat` < 1.7 || dwStat$`DWStat` > 2.3, "True", "False")
 
  
  #Extract FStats and make dataframe
  fstat <- summary(regM)$fstatistic[1]
  predictVars <- summary(regM)$fstatistic[2]
  degreesFreedom <- summary(regM)$fstatistic[3]
  modelPValue <- pf(fstat,predictVars,degreesFreedom, lower.tail=F)
 
  fStatTable <- data.frame(fstat,predictVars,degreesFreedom,modelPValue)
  names(fStatTable)[1] <- "FStat"
  names(fStatTable)[2] <- "NumXVars"
  names(fStatTable)[3] <- "DF"
  names(fStatTable)[4] <- "FStat-PVal"
  
  #Extract Residual Standard Error as dataframe
  rse <- data.frame(summary(regM)$sigma)
  names(rse)[1] <- "RSE"
  
  #Table for general model diagnostics
  modelSummary <- merge(dwStat,adjR)
  modelSummary <- merge(modelSummary,fStatTable)
  modelSummary <- merge(modelSummary,rse)
  row.names(modelSummary)[1] <- "Model Summary"
  modelSummary <<- modelSummary
  
  modelSummary <- t(modelSummary)
  
  vs <- gridExtra::tableGrob(variableSummary)
  ms <- gridExtra::tableGrob(modelSummary)
  
  gridExtra::grid.arrange(vs,ms)
  
  #Residuals Histogram, Check for approximately normal distribution in errors
  hist(residuals(regM), col ="steelblue")
  
  #Fitted vs Residual Plot, Normal QQ plot, Scale-Location Plot, Residuals vs Leverage Plot
  oldPar <- par(mfrow = c(2, 2))
  plot(regM)
  
  #Restore the user's previous plotting layout
  par(oldPar)
}
LM_DIAGNOSTIC(DF,"var1", c("var2","var3","var4"))
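
Because the function assigns variableSummary and modelSummary to the global environment, the saved data frames can be exported after the call. Below is a minimal sketch using write.csv; the file names are illustrative.

#Export the saved summary data frames so they can be opened in other programs
write.csv(variableSummary, "variable_summary.csv", row.names = FALSE)
write.csv(modelSummary, "model_summary.csv", row.names = TRUE)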

Output Explanation

Variable Summary

  1. P-values: only var2 has a p-value above .05, which means it is the only insignificant variable in the model
  2. VIF scores: all VIF scores are below 5, so multicollinearity is not a concern.

Model Summary

  1. Adjusted R-squared: .975, which is very strong. It means 97.5% of the variation in var1 can be explained by the model
  2. Durbin-Watson: the DW statistic was well below 2, which indicates a high level of positive autocorrelation

Residuals Histogram

  1. The histogram indicates the residuals are normally distributed, which is needed for linear regression

Fitted vs Residuals Plot

  1. The variance in the residuals should be constant throughout the plot for the model to be homoscedastic.
  2. The variance range is much larger at the end of the plot, indicating non-constant variance.
  3. A possible fix to this issue is to difference your variables, e.g. var1 - var1_lag (see the sketch after this list).
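
As a rough illustration of the differencing idea, the sketch below reruns the diagnostics on first differences of every variable. It assumes the first column of DF is the date and that a one-period difference is appropriate, so treat it as a starting point rather than a definitive fix.

#First-difference every non-date column; the date column is dropped since the model does not use it
DF_diff <- as.data.frame(lapply(DF[, -1], diff))
LM_DIAGNOSTIC(DF_diff, "var1", c("var2", "var3", "var4"))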

Normal Q-Q Plot

  1. This is another way to check if the residuals are normally distributed.
  2. If the residuals fall closely along the dotted grey line then they are normally distributed.
  3. I included this plot in addition to the histogram because it highlights individual observations that might not fit the distribution
  4. The observations that are farther from the dotted line can be tested for outliers

Scale-Location Plot

  1. This plot is also used to check for heteroscedasticity. We would like to see no pattern in the residuals.
  2. The plot again indicates the model is not homoscedastic

Residuals vs Leverage Plot

  1. This plot can be used to find potential outliers.
  2. Observations that fall outside the Cook’s distance lines are potential outliers.
  3. Additionally, this plot shows how much leverage an observation has on the model
  4. An observation farther to the right on this plot indicates higher leverage.
  5. This means that removing that observation would have a greater impact on the model’s coefficients
  6. There are no obvious points outside the Cook’s distance lines, but there are a few that are very close.
  7. These observations should be tested for outliers (a sketch follows this list)
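
One way to follow up on those borderline points is to refit the model outside the function and look at Cook’s distances directly. The 4/n cutoff below is a common rule of thumb, not part of LM_DIAGNOSTIC, so it is only a suggested starting point.

#Refit the same model so Cook's distances can be inspected directly
regM <- lm(var1 ~ var2 + var3 + var4, data = DF)
cd <- cooks.distance(regM)
which(cd > 4 / length(cd))  #flag observations above the 4/n rule of thumb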

Conclusion

  1. The heteroscedasticity needs to be addressed, either through data manipulation (such as differencing the variables) or by switching to a GLM
  2. The model has high autocorrelation, which might be fixed by using lagged variables (see the sketch after this list)
  3. Only one of the variables was insignificant, and it should be removed from the model
  4. The strongest metric of this model is its very high adjusted R-squared
  5. In its current form, the model does not satisfy the assumptions of linear regression
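
A minimal sketch of the lagged-variable idea from point 2 is shown below. Adding a one-period lag of the dependent variable is only one illustrative choice; lags of the predictors, or a time-series model, could also be considered.

#Add a one-period lag of the dependent variable and rerun the diagnostics
DF_lag <- DF[-1, ]                     #drop the first row, which has no lagged value
DF_lag$var1_lag <- DF$var1[-nrow(DF)]  #previous period's value of var1
LM_DIAGNOSTIC(DF_lag, "var1", c("var2", "var3", "var4", "var1_lag"))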