Response Requirements:
n(response==1)response!=binary & response!=countresponse==binary - use simple_logistic_regression.Rmdresponse==count - use simple_count_regression.Rmdresponse is an average value at a level of predictor, and the standard devaition can be calculated, use wls_given_variance.RmdPredictor Requirements
n(predictor==1)n(predictor!=1) use multivariate_lin_regression.Rmdpredictor!=factor & predictor!=binarypredictor==factor use simple_lin_categorical_predictor.Rmdpredictor==binaryuse simple_lin_dummy.Rmdif (response==count){ hist(respone)=approx_normal }
FUNCTIONS:
GGally::ggpairs(data) - check correlation
corrplot::corrplot(cor(data))
see common_transformation_patterns.Rmd
model_list<-list()
x<-seq(1,20)
y<-x^2
y_error<-50*runif(20)*sample(-1:1,20,replace = TRUE)
y<-y+y_error
df_exp<-data.frame(x,y)%>%mutate(y=ifelse(y<0,0,y))
ggplot(df_exp,aes(x=x,y=y))+geom_point()
model_list[['base']]<-lm(y~x,data=df_exp)
summary(model_list[['base']])
##
## Call:
## lm(formula = y ~ x, data = df_exp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.238 -17.025 4.600 7.394 50.156
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -70.2311 11.9416 -5.881 1.44e-05 ***
## x 21.0750 0.9969 21.141 3.69e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25.71 on 18 degrees of freedom
## Multiple R-squared: 0.9613, Adjusted R-squared: 0.9591
## F-statistic: 446.9 on 1 and 18 DF, p-value: 3.688e-14
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
lmtest::bptest(model_list[['base']])
##
## studentized Breusch-Pagan test
##
## data: model_list[["base"]]
## BP = 0.52668, df = 1, p-value = 0.468
H_0: homoscedasitcity is present (residuals are distributed with equal variance)
H_A: heteroscedasticity is present (residuals are not distributed with equal variance) USE homoscedasticity_FAIL.Rmd
if lmtest::bptest(model)==H_A perform Weighted least squares
library(ggResidpanel)
## Warning: package 'ggResidpanel' was built under R version 4.1.3
ggResidpanel::resid_panel(model_list[['base']])
use powerTransform to get the optimal power to raise the response to, and run model again
(p1<-car::powerTransform(df_exp+.001))
## Estimated transformation parameters
## x y
## 0.9949228 0.6516880
model_list[['transform']]<-lm(y^p1$lambda[[2]]~x,data = df_exp)
summary(model_list[['transform']])
##
## Call:
## lm(formula = y^p1$lambda[[2]] ~ x, data = df_exp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6996 -1.5684 0.2527 1.7144 3.3977
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.3891 1.1105 -3.952 0.000934 ***
## x 2.6551 0.0927 28.643 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.39 on 18 degrees of freedom
## Multiple R-squared: 0.9785, Adjusted R-squared: 0.9773
## F-statistic: 820.4 on 1 and 18 DF, p-value: < 2.2e-16
resid_panel(model_list[['transform']])
#function for looking at adjusted r squared values
lapply(model_list, function(f) list(adj.r.squared=summary(f)$adj.r.squared,beta_estimate=f$coefficient[[2]],confidence_interval=confint(f)))
## $base
## $base$adj.r.squared
## [1] 0.9591353
##
## $base$beta_estimate
## [1] 21.07498
##
## $base$confidence_interval
## 2.5 % 97.5 %
## (Intercept) -95.31959 -45.14268
## x 18.98064 23.16932
##
##
## $transform
## $transform$adj.r.squared
## [1] 0.9773376
##
## $transform$beta_estimate
## [1] 2.655149
##
## $transform$confidence_interval
## 2.5 % 97.5 %
## (Intercept) -6.722059 -2.056074
## x 2.460394 2.849903