Example 1

Simulating data:

library(pacman)
p_load(tidyverse)
p_load(kableExtra)
p_load(knitr)
p_load(ggrepel)
p_load(plotly)
p_load(scatterplot3d)
p_load(mvtnorm)
p_load(ggcorrplot)
p_load(mctest)
set.seed(150)

n<-250
Sex<-sample(c("F","M"),n,replace = T)
Age<-runif(n,18,100)
z1<-data.frame(Sex=factor(Sex),Age,mm=NA)
z1[z1$Sex %in% "F","mm"]<-77+1.2*z1[z1$Sex %in% "F","Age"]+rnorm(nrow(z1[z1$Sex %in% "F",]),0,sd=70)
z1[z1$Sex %in% "M","mm"]<-77+18+(1.2+6.6)*z1[z1$Sex %in% "M","Age"]+rnorm(nrow(z1[z1$Sex %in% "M",]),0,sd=70)

z1$mm[z1$mm<0]<-0
summary(z1)
##  Sex          Age              mm       
##  F:119   Min.   :18.61   Min.   :  0.0  
##  M:131   1st Qu.:38.01   1st Qu.:161.7  
##          Median :57.19   Median :279.0  
##          Mean   :58.69   Mean   :357.4  
##          3rd Qu.:78.22   3rd Qu.:555.2  
##          Max.   :99.70   Max.   :986.9
scatterplots of Age vs length (in mm) and sex of beetles.

scatterplots of Age vs length (in mm) and sex of beetles.

Age vs mm by sex

Fitting without centering:

m0int<-lm(mm~Sex*Age,data=z1)
(summ0<-summary(m0int))
## 
## Call:
## lm(formula = mm ~ Sex * Age, data = z1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -205.01  -48.15   -4.54   48.98  175.17 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  57.0912    17.7783   3.211   0.0015 ** 
## SexM         35.5481    24.1651   1.471   0.1426    
## Age           1.6462     0.2792   5.897 1.22e-08 ***
## SexM:Age      6.0888     0.3829  15.903  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.52 on 246 degrees of freedom
## Multiple R-squared:  0.9201, Adjusted R-squared:  0.9191 
## F-statistic:   944 on 3 and 246 DF,  p-value: < 2.2e-16

Fitting centering age:

m1int<-lm(mm~Sex*I(Age-mean(z1$Age)),data=z1)
(summ1<-summary(m1int))
## 
## Call:
## lm(formula = mm ~ Sex * I(Age - mean(z1$Age)), data = z1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -205.01  -48.15   -4.54   48.98  175.17 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                153.7135     6.3767  24.105  < 2e-16 ***
## SexM                       392.9143     8.8086  44.606  < 2e-16 ***
## I(Age - mean(z1$Age))        1.6462     0.2792   5.897 1.22e-08 ***
## SexM:I(Age - mean(z1$Age))   6.0888     0.3829  15.903  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.52 on 246 degrees of freedom
## Multiple R-squared:  0.9201, Adjusted R-squared:  0.9191 
## F-statistic:   944 on 3 and 246 DF,  p-value: < 2.2e-16

Comparison

As it can be seen the only values that change as result of centering are the intercept and sex, while the parameter related to age and the interaction term remain the same.

res<-rbind(summ0$coefficients,summ1$coefficients) %>% data.frame()
res$centering<-c(rep(F,nrow(summ0$coefficients)),rep(T,nrow(summ1$coefficients)))
res$terms<-rep(rownames(summ0$coefficients),2)
rownames(res)<-NULL
colnames(res)[c(2,4)]<-c("Std.Error","P-value")
kableres<-kable(res[order(res$terms),c(6,1:5)],caption="comparison of fitted model with and without centering Age",row.names =F)
kable_styling(kableres,"striped", position = "center",full_width = F)
comparison of fitted model with and without centering Age
terms Estimate Std.Error t.value P-value centering
(Intercept) 57.091155 17.7782557 3.211291 0.0014975 FALSE
(Intercept) 153.713504 6.3767196 24.105420 0.0000000 TRUE
Age 1.646242 0.2791719 5.896876 0.0000000 FALSE
Age 1.646242 0.2791719 5.896876 0.0000000 TRUE
SexM 35.548148 24.1650675 1.471055 0.1425542 FALSE
SexM 392.914315 8.8086428 44.605545 0.0000000 TRUE
SexM:Age 6.088769 0.3828696 15.902984 0.0000000 FALSE
SexM:Age 6.088769 0.3828696 15.902984 0.0000000 TRUE

Overall Multicollinearity Diagnostics

omcdiag(m0int)
## 
## Call:
## omcdiag(mod = m0int)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.1176         0
## Farrar Chi-Square:       529.1390         1
## Red Indicator:             0.5367         1
## Sum of Lambda Inverse:    18.1688         1
## Theil's Method:            0.4416         0
## Condition Number:         13.3050         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
omcdiag(m1int)
## 
## Call:
## omcdiag(mod = m1int)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.4677         0
## Farrar Chi-Square:       187.8420         1
## Red Indicator:             0.4216         0
## Sum of Lambda Inverse:     5.2745         0
## Theil's Method:           -0.7752         0
## Condition Number:          2.5738         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test

However, there is a difference in multicollinearity diagnostics between the 2 models

Example 2

my_mean<-c(1500,3500)
mycors<-.6
sd_vec<-c(150,300)
temp_cor<-matrix(c(1,mycors,
                   mycors,1),
                 byrow = T,ncol=2)
V<-sd_vec %*% t(sd_vec) *temp_cor
n<-500
z0<-rmvnorm(n,my_mean,V)
z0<-data.frame(z0)
colnames(z0)<-c("EGF","TGF")
z0$EGF<-round(z0$EGF,0)
z0$TGF<-round(z0$TGF,0)
z0<-rbind(z0,
          data.frame(EGF=runif(n,1000,2000) %>% round(0),
                TGF=runif(n,2300,4500)%>% round(0))
)
z0$CD0<-dmvnorm(z0,my_mean,V) %>% {.*10^9} %>% sqrt
sd_coefs<-solve(matrix(c(1,0,1,63),nrow=2,byrow = T),c(log(3),log(9)))
z0$CD<-map_dbl(z0$CD0,~.+rnorm(1,0,exp(sd_coefs%*%c(1,.))))
z0$CD[z0$CD<0]<-0
z0$CD0<-NULL

Next we present the 3d scatterplot of this data set

plot_ly(z0,x=~TGF, y=~EGF, z=~CD, type="scatter3d", mode="markers",color=~CD,size =1)

Grafico de dispersión en 3d (Datos actualizados)

Fitting without centering:

m0pl3<-lm(CD~TGF+I(TGF^2)+EGF+I(EGF^2)+I(TGF*EGF)+I((TGF*EGF)^2), data=z0)
(sumpl3<-summary(m0pl3))
## 
## Call:
## lm(formula = CD ~ TGF + I(TGF^2) + EGF + I(EGF^2) + I(TGF * EGF) + 
##     I((TGF * EGF)^2), data = z0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.617  -9.822  -0.414   9.517  61.967 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -6.433e+02  1.081e+02  -5.949 3.73e-09 ***
## TGF               2.129e-01  4.300e-02   4.950 8.73e-07 ***
## I(TGF^2)         -3.647e-05  3.415e-06 -10.681  < 2e-16 ***
## EGF               4.020e-01  9.926e-02   4.050 5.53e-05 ***
## I(EGF^2)         -1.633e-04  1.808e-05  -9.034  < 2e-16 ***
## I(TGF * EGF)      3.355e-05  2.832e-05   1.185    0.236    
## I((TGF * EGF)^2) -7.106e-13  1.374e-12  -0.517    0.605    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.22 on 993 degrees of freedom
## Multiple R-squared:  0.6216, Adjusted R-squared:  0.6193 
## F-statistic: 271.9 on 6 and 993 DF,  p-value: < 2.2e-16
omcdiag(m0pl3)
## 
## Call:
## omcdiag(mod = m0pl3)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.0000         1
## Farrar Chi-Square:     21150.2540         1
## Red Indicator:             0.6945         1
## Sum of Lambda Inverse: 12886.7179         1
## Theil's Method:            2.8874         1
## Condition Number:       1673.7470         1
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test

Fitting centering:

#centering data
z1<-z0
z1$EGF<-z1$EGF-mean(z1$EGF)
z1$TGF<-z1$TGF-mean(z1$TGF)
m0pl4<-lm(CD~I(TGF)+I(TGF^2)+EGF+I(EGF^2)+I(TGF*EGF)+I((TGF*EGF)^2), data=z1)
(sumpl4<-summary(m0pl4))
## 
## Call:
## lm(formula = CD ~ I(TGF) + I(TGF^2) + EGF + I(EGF^2) + I(TGF * 
##     EGF) + I((TGF * EGF)^2), data = z1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.797  -7.434   0.271   7.652  33.358 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.232e+01  5.853e-01  89.381  < 2e-16 ***
## I(TGF)           -6.233e-04  7.586e-04  -0.822   0.4115    
## I(TGF^2)         -6.064e-05  1.605e-06 -37.789  < 2e-16 ***
## EGF               3.144e-03  1.641e-03   1.916   0.0556 .  
## I(EGF^2)         -2.919e-04  8.123e-06 -35.937  < 2e-16 ***
## I(TGF * EGF)      2.322e-05  2.804e-06   8.284 3.82e-16 ***
## I((TGF * EGF)^2)  3.634e-10  1.663e-11  21.855  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.68 on 993 degrees of freedom
## Multiple R-squared:  0.7444, Adjusted R-squared:  0.7429 
## F-statistic: 482.1 on 6 and 993 DF,  p-value: < 2.2e-16
omcdiag(m0pl4)
## 
## Call:
## omcdiag(mod = m0pl4)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.2879         0
## Farrar Chi-Square:      1240.4988         1
## Red Indicator:             0.2536         0
## Sum of Lambda Inverse:    10.0227         0
## Theil's Method:           -2.0098         0
## Condition Number:          4.8442         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test

Comparison

Here meaningfull changes in parameter estimation can be appreciated.

res2<-rbind(sumpl3$coefficients,sumpl4$coefficients) %>% data.frame()
res2$centering<-c(rep(F,nrow(sumpl3$coefficients)),rep(T,nrow(sumpl4$coefficients)))
res2$terms<-rep(rownames(sumpl3$coefficients),2)
rownames(res2)<-NULL
colnames(res2)[c(2,4)]<-c("Std.Error","P-value")
kableres2<-kable(res2[order(res2$terms),c(6,1:5)],caption="comparison of fitted model with and without centering",row.names =F)
kable_styling(kableres2,"striped", position = "center",full_width = F)
comparison of fitted model with and without centering
terms Estimate Std.Error t.value P-value centering
(Intercept) -643.3130163 108.1365030 -5.9490829 0.0000000 FALSE
(Intercept) 52.3160328 0.5853125 89.3813745 0.0000000 TRUE
EGF 0.4019921 0.0992639 4.0497293 0.0000553 FALSE
EGF 0.0031437 0.0016406 1.9161962 0.0556270 TRUE
I((TGF * EGF)^2) 0.0000000 0.0000000 -0.5171688 0.6051535 FALSE
I((TGF * EGF)^2) 0.0000000 0.0000000 21.8545065 0.0000000 TRUE
I(EGF^2) -0.0001633 0.0000181 -9.0344551 0.0000000 FALSE
I(EGF^2) -0.0002919 0.0000081 -35.9365890 0.0000000 TRUE
I(TGF * EGF) 0.0000335 0.0000283 1.1846090 0.2364555 FALSE
I(TGF * EGF) 0.0000232 0.0000028 8.2841572 0.0000000 TRUE
I(TGF^2) -0.0000365 0.0000034 -10.6814684 0.0000000 FALSE
I(TGF^2) -0.0000606 0.0000016 -37.7885974 0.0000000 TRUE
TGF 0.2128521 0.0430040 4.9495937 0.0000009 FALSE
TGF -0.0006233 0.0007586 -0.8216315 0.4114839 TRUE

Overall Multicollinearity Diagnostics

omcdiag(m0pl3)
## 
## Call:
## omcdiag(mod = m0pl3)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.0000         1
## Farrar Chi-Square:     21150.2540         1
## Red Indicator:             0.6945         1
## Sum of Lambda Inverse: 12886.7179         1
## Theil's Method:            2.8874         1
## Condition Number:       1673.7470         1
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test
omcdiag(m0pl4)
## 
## Call:
## omcdiag(mod = m0pl4)
## 
## 
## Overall Multicollinearity Diagnostics
## 
##                        MC Results detection
## Determinant |X'X|:         0.2879         0
## Farrar Chi-Square:      1240.4988         1
## Red Indicator:             0.2536         0
## Sum of Lambda Inverse:    10.0227         0
## Theil's Method:           -2.0098         0
## Condition Number:          4.8442         0
## 
## 1 --> COLLINEARITY is detected by the test 
## 0 --> COLLINEARITY is not detected by the test

Huge improvement of Multicollinearity Diagnostics when centering.