Requirements for simple linear regression

Response Requirements:

  1. n(response==1)
  2. response!=binary & response!=count
  1. IF response is an average value at a level of predictor, and the standard devaition can be calculated, use wls_given_variance.Rmd

Predictor Requirements

  1. n(predictor==1)
  1. predictor!=factor & predictor!=binary

if (response==count){ hist(respone)=approx_normal }

Exploratory Data Analysis

FUNCTIONS:

GGally::ggpairs(data) - check correlation

corrplot::corrplot(cor(data))

Patterns to look for

see common_transformation_patterns.Rmd

Build model list

model_list<-list()
x<-seq(1,20)
y<-x^2
y_error<-50*runif(20)*sample(-1:1,20,replace = TRUE)
y<-y+y_error

df_exp<-data.frame(x,y)%>%mutate(y=ifelse(y<0,0,y))
ggplot(df_exp,aes(x=x,y=y))+geom_point()

model_list[['base']]<-lm(y~x,data=df_exp)
summary(model_list[['base']])
## 
## Call:
## lm(formula = y ~ x, data = df_exp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.238 -17.025   4.600   7.394  50.156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -70.2311    11.9416  -5.881 1.44e-05 ***
## x            21.0750     0.9969  21.141 3.69e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 25.71 on 18 degrees of freedom
## Multiple R-squared:  0.9613, Adjusted R-squared:  0.9591 
## F-statistic: 446.9 on 1 and 18 DF,  p-value: 3.688e-14

Check for heteroscedasticity

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
lmtest::bptest(model_list[['base']])
## 
##  studentized Breusch-Pagan test
## 
## data:  model_list[["base"]]
## BP = 0.52668, df = 1, p-value = 0.468

H_0: homoscedasitcity is present (residuals are distributed with equal variance)

H_A: heteroscedasticity is present (residuals are not distributed with equal variance) USE homoscedasticity_FAIL.Rmd

if lmtest::bptest(model)==H_A perform Weighted least squares

Examine residuals

library(ggResidpanel)
## Warning: package 'ggResidpanel' was built under R version 4.1.3
ggResidpanel::resid_panel(model_list[['base']])

use powerTransform to get the optimal power to raise the response to, and run model again

(p1<-car::powerTransform(df_exp+.001))
## Estimated transformation parameters 
##         x         y 
## 0.9949228 0.6516880
model_list[['transform']]<-lm(y^p1$lambda[[2]]~x,data = df_exp)

summary(model_list[['transform']])
## 
## Call:
## lm(formula = y^p1$lambda[[2]] ~ x, data = df_exp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6996 -1.5684  0.2527  1.7144  3.3977 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -4.3891     1.1105  -3.952 0.000934 ***
## x             2.6551     0.0927  28.643  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.39 on 18 degrees of freedom
## Multiple R-squared:  0.9785, Adjusted R-squared:  0.9773 
## F-statistic: 820.4 on 1 and 18 DF,  p-value: < 2.2e-16
resid_panel(model_list[['transform']])

Comparing models

#function for looking at adjusted r squared values
lapply(model_list, function(f) list(adj.r.squared=summary(f)$adj.r.squared,beta_estimate=f$coefficient[[2]],confidence_interval=confint(f)))
## $base
## $base$adj.r.squared
## [1] 0.9591353
## 
## $base$beta_estimate
## [1] 21.07498
## 
## $base$confidence_interval
##                 2.5 %    97.5 %
## (Intercept) -95.31959 -45.14268
## x            18.98064  23.16932
## 
## 
## $transform
## $transform$adj.r.squared
## [1] 0.9773376
## 
## $transform$beta_estimate
## [1] 2.655149
## 
## $transform$confidence_interval
##                 2.5 %    97.5 %
## (Intercept) -6.722059 -2.056074
## x            2.460394  2.849903