Brian Scott
This function lets a user quickly run diagnostics on a regression model to determine whether linear regression is appropriate.
The summary output from R's summary() function does not contain enough information to fully diagnose the model. This function adds plots for checking homoscedasticity, an autocorrelation test, and a multicollinearity test. Additionally, the output is saved as a data frame, so it can easily be exported to other programs.
The data is masked; each variable was renamed var1, var2, and so on.
library(readxl)
#Load the masked data: one date column followed by 33 numeric columns
MaskedData <- read_excel("F:/Summer2022/MaskedData.xlsx",
    col_types = c("date", rep("numeric", 33)))
DF <- MaskedData
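If the original Excel file is not available, a small simulated data frame with the same masked naming scheme can stand in for testing. This is only an illustrative sketch: DF_sim, the sample size, and the relationship between var1 and the other variables are made up here, not taken from the real masked data.
#Illustrative only: fake "masked" data for trying out the function
set.seed(42)
n <- 100
DF_sim <- data.frame(var2 = rnorm(n), var3 = rnorm(n), var4 = rnorm(n))
#var1 is built from the other variables plus noise so the regression has a signal to find
DF_sim$var1 <- 2 * DF_sim$var2 - DF_sim$var3 + 0.5 * DF_sim$var4 + rnorm(n)
Once the function below is defined, DF_sim could be passed in place of DF.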
The function has three inputs: a data frame (df), the name of the dependent variable (yVar), and a character vector of independent variable names (xVars). It produces seven outputs: a variable summary table (coefficient estimates, p-values, significance flags, and VIF values), a model summary table (Durbin-Watson statistic, adjusted R-squared, F-statistic, and residual standard error), a histogram of the residuals, and the four base R diagnostic plots (residuals vs. fitted, normal Q-Q, scale-location, and residuals vs. leverage).
LM_DIAGNOSTIC <- function(df, yVar, xVars){
require(car)
require(gridExtra)
#Create Y variable for regression model
depV <- paste(yVar, "~")
#create X variable string for regression model
indepV <- paste(xVars, collapse = " + ")
#Combine to form the full model
modelFormula <- as.formula(paste(depV, indepV))
#Run regression model
regM <- lm(modelFormula, data = df)
#Collect P Values and coefficients and make a dataframe
pcValues <- data.frame(summary(regM)$coefficients[,1],summary(regM)$coefficients[,4])
names(pcValues)[1] <- "Estimate"
names(pcValues)[2] <- "P Value"
pcValues$VarSig <- ifelse(pcValues$`P Value` < .05, "True", "False")
#Multicollinearity Test, VIF Values as a dataframe
vifDF <- data.frame(car::vif(regM))
names(vifDF)[1] <- "VIF"
vifDF$NoMulticollinearity <- ifelse(vifDF$VIF < 5, "True", "False")
#Summary Table for Variables
variableSummary <- merge(pcValues,vifDF, by = 'row.names', all = TRUE)
names(variableSummary)[1] <- "Model Variables"
#Save the variable summary to the global environment so it can be exported
variableSummary <<- variableSummary
variableSummary <- t(variableSummary)
#Adjusted R Square as dataframe
adjR <- data.frame(summary(regM)$adj.r.squared)
names(adjR)[1] <- "AdjRSquare"
row.names(adjR)[1] <- "Model Fit"
#creates a new column that states the strength of the RSquare
adjR$ModelFit <- ifelse(adjR$`AdjRSquare` > .9, "Strong",
ifelse(adjR$`AdjRSquare` > .75, "Moderate",
ifelse(adjR$`AdjRSquare` > .5, "Weak", "Insignificant" )))
#Autocorrelation Test, Durbin Watson as dataframe
dw <- car::durbinWatsonTest(regM)
dwStat <- data.frame(dw$dw)
names(dwStat)[1] <- "DWStat"
row.names(dwStat)[1] <- "Autocorrelation Info"
#creates a column that states whether the Durbin-Watson test indicates autocorrelation
dwStat$Autocorr <- ifelse(dwStat$`DWStat` < 1.7 | dwStat$`DWStat` > 2.3, "True", "False")
#Extract FStats and make dataframe
fstat <- summary(regM)$fstatistic[1]
predictVars <- summary(regM)$fstatistic[2]
degreesFreedom <- summary(regM)$fstatistic[3]
modelPValue <- pf(fstat,predictVars,degreesFreedom, lower.tail=F)
fStatTable <- data.frame(fstat,predictVars,degreesFreedom,modelPValue)
names(fStatTable)[1] <- "FStat"
names(fStatTable)[2] <- "NumXVars"
names(fStatTable)[3] <- "DF"
names(fStatTable)[4] <- "FStat-PVal"
#Extract Residual Standard Error as dataframe
rse <- data.frame(summary(regM)$sigma)
names(rse)[1] <- "RSE"
#Table for general model diagnostics
modelSummary <- merge(dwStat,adjR)
modelSummary <- merge(modelSummary,fStatTable)
modelSummary <- merge(modelSummary,rse)
row.names(modelSummary)[1] <- "Model Summary"
#Save the model summary to the global environment so it can be exported
modelSummary <<- modelSummary
modelSummary <- t(modelSummary)
vs <- gridExtra::tableGrob(variableSummary)
ms <- gridExtra::tableGrob(modelSummary)
gridExtra::grid.arrange(vs,ms)
#Residuals Histogram, Check for approximately normal distribution in errors
hist(residuals(regM), col ="steelblue")
#Fitted vs Residual Plot, Normal QQ plot, Scale-Location Plot, Residuals vs Leverage Plot
oldpar <- par(mfrow = c(2, 2))
plot(regM)
#Restore the previous plotting layout
par(oldpar)
}
LM_DIAGNOSTIC(DF,"var1", c("var2","var3","var4"))
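As a rough usage sketch, the function can also be run on a built-in dataset such as mtcars; the variable names below are illustrative and have nothing to do with the masked data. Because the function writes variableSummary and modelSummary to the global environment, they can then be exported with write.csv() for use in other programs.
#Illustrative call on a built-in dataset
LM_DIAGNOSTIC(mtcars, "mpg", c("wt", "hp", "disp"))
#Export the saved summary data frames (the file names here are placeholders)
write.csv(variableSummary, "variableSummary.csv")
write.csv(modelSummary, "modelSummary.csv")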