Simulating data:
library(pacman)
p_load(tidyverse)
p_load(kableExtra)
p_load(knitr)
p_load(ggrepel)
p_load(plotly)
p_load(scatterplot3d)
p_load(mvtnorm)
p_load(ggcorrplot)
p_load(mctest)
set.seed(150)
n<-250
Sex<-sample(c("F","M"),n,replace = T)
Age<-runif(n,18,100)
z1<-data.frame(Sex=factor(Sex),Age,mm=NA)
z1[z1$Sex %in% "F","mm"]<-77+1.2*z1[z1$Sex %in% "F","Age"]+rnorm(nrow(z1[z1$Sex %in% "F",]),0,sd=70)
z1[z1$Sex %in% "M","mm"]<-77+18+(1.2+6.6)*z1[z1$Sex %in% "M","Age"]+rnorm(nrow(z1[z1$Sex %in% "M",]),0,sd=70)
z1$mm[z1$mm<0]<-0
summary(z1)
## Sex Age mm
## F:119 Min. :18.61 Min. : 0.0
## M:131 1st Qu.:38.01 1st Qu.:161.7
## Median :57.19 Median :279.0
## Mean :58.69 Mean :357.4
## 3rd Qu.:78.22 3rd Qu.:555.2
## Max. :99.70 Max. :986.9
scatterplots of Age vs length (in mm) and sex of beetles.
Age vs mm by sex
m0int<-lm(mm~Sex*Age,data=z1)
(summ0<-summary(m0int))
##
## Call:
## lm(formula = mm ~ Sex * Age, data = z1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -205.01 -48.15 -4.54 48.98 175.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 57.0912 17.7783 3.211 0.0015 **
## SexM 35.5481 24.1651 1.471 0.1426
## Age 1.6462 0.2792 5.897 1.22e-08 ***
## SexM:Age 6.0888 0.3829 15.903 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.52 on 246 degrees of freedom
## Multiple R-squared: 0.9201, Adjusted R-squared: 0.9191
## F-statistic: 944 on 3 and 246 DF, p-value: < 2.2e-16
m1int<-lm(mm~Sex*I(Age-mean(z1$Age)),data=z1)
(summ1<-summary(m1int))
##
## Call:
## lm(formula = mm ~ Sex * I(Age - mean(z1$Age)), data = z1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -205.01 -48.15 -4.54 48.98 175.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 153.7135 6.3767 24.105 < 2e-16 ***
## SexM 392.9143 8.8086 44.606 < 2e-16 ***
## I(Age - mean(z1$Age)) 1.6462 0.2792 5.897 1.22e-08 ***
## SexM:I(Age - mean(z1$Age)) 6.0888 0.3829 15.903 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 69.52 on 246 degrees of freedom
## Multiple R-squared: 0.9201, Adjusted R-squared: 0.9191
## F-statistic: 944 on 3 and 246 DF, p-value: < 2.2e-16
As it can be seen the only values that change as result of centering are the intercept and sex, while the parameter related to age and the interaction term remain the same.
res<-rbind(summ0$coefficients,summ1$coefficients) %>% data.frame()
res$centering<-c(rep(F,nrow(summ0$coefficients)),rep(T,nrow(summ1$coefficients)))
res$terms<-rep(rownames(summ0$coefficients),2)
rownames(res)<-NULL
colnames(res)[c(2,4)]<-c("Std.Error","P-value")
kableres<-kable(res[order(res$terms),c(6,1:5)],caption="comparison of fitted model with and without centering Age",row.names =F)
kable_styling(kableres,"striped", position = "center",full_width = F)
terms | Estimate | Std.Error | t.value | P-value | centering |
---|---|---|---|---|---|
(Intercept) | 57.091155 | 17.7782557 | 3.211291 | 0.0014975 | FALSE |
(Intercept) | 153.713504 | 6.3767196 | 24.105420 | 0.0000000 | TRUE |
Age | 1.646242 | 0.2791719 | 5.896876 | 0.0000000 | FALSE |
Age | 1.646242 | 0.2791719 | 5.896876 | 0.0000000 | TRUE |
SexM | 35.548148 | 24.1650675 | 1.471055 | 0.1425542 | FALSE |
SexM | 392.914315 | 8.8086428 | 44.605545 | 0.0000000 | TRUE |
SexM:Age | 6.088769 | 0.3828696 | 15.902984 | 0.0000000 | FALSE |
SexM:Age | 6.088769 | 0.3828696 | 15.902984 | 0.0000000 | TRUE |
Overall Multicollinearity Diagnostics
omcdiag(m0int)
##
## Call:
## omcdiag(mod = m0int)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.1176 0
## Farrar Chi-Square: 529.1390 1
## Red Indicator: 0.5367 1
## Sum of Lambda Inverse: 18.1688 1
## Theil's Method: 0.4416 0
## Condition Number: 13.3050 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
omcdiag(m1int)
##
## Call:
## omcdiag(mod = m1int)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.4677 0
## Farrar Chi-Square: 187.8420 1
## Red Indicator: 0.4216 0
## Sum of Lambda Inverse: 5.2745 0
## Theil's Method: -0.7752 0
## Condition Number: 2.5738 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
However, there is a difference in multicollinearity diagnostics between the 2 models
my_mean<-c(1500,3500)
mycors<-.6
sd_vec<-c(150,300)
temp_cor<-matrix(c(1,mycors,
mycors,1),
byrow = T,ncol=2)
V<-sd_vec %*% t(sd_vec) *temp_cor
n<-500
z0<-rmvnorm(n,my_mean,V)
z0<-data.frame(z0)
colnames(z0)<-c("EGF","TGF")
z0$EGF<-round(z0$EGF,0)
z0$TGF<-round(z0$TGF,0)
z0<-rbind(z0,
data.frame(EGF=runif(n,1000,2000) %>% round(0),
TGF=runif(n,2300,4500)%>% round(0))
)
z0$CD0<-dmvnorm(z0,my_mean,V) %>% {.*10^9} %>% sqrt
sd_coefs<-solve(matrix(c(1,0,1,63),nrow=2,byrow = T),c(log(3),log(9)))
z0$CD<-map_dbl(z0$CD0,~.+rnorm(1,0,exp(sd_coefs%*%c(1,.))))
z0$CD[z0$CD<0]<-0
z0$CD0<-NULL
Next we present the 3d scatterplot of this data set
plot_ly(z0,x=~TGF, y=~EGF, z=~CD, type="scatter3d", mode="markers",color=~CD,size =1)
Grafico de dispersión en 3d (Datos actualizados)
m0pl3<-lm(CD~TGF+I(TGF^2)+EGF+I(EGF^2)+I(TGF*EGF)+I((TGF*EGF)^2), data=z0)
(sumpl3<-summary(m0pl3))
##
## Call:
## lm(formula = CD ~ TGF + I(TGF^2) + EGF + I(EGF^2) + I(TGF * EGF) +
## I((TGF * EGF)^2), data = z0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.617 -9.822 -0.414 9.517 61.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.433e+02 1.081e+02 -5.949 3.73e-09 ***
## TGF 2.129e-01 4.300e-02 4.950 8.73e-07 ***
## I(TGF^2) -3.647e-05 3.415e-06 -10.681 < 2e-16 ***
## EGF 4.020e-01 9.926e-02 4.050 5.53e-05 ***
## I(EGF^2) -1.633e-04 1.808e-05 -9.034 < 2e-16 ***
## I(TGF * EGF) 3.355e-05 2.832e-05 1.185 0.236
## I((TGF * EGF)^2) -7.106e-13 1.374e-12 -0.517 0.605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.22 on 993 degrees of freedom
## Multiple R-squared: 0.6216, Adjusted R-squared: 0.6193
## F-statistic: 271.9 on 6 and 993 DF, p-value: < 2.2e-16
omcdiag(m0pl3)
##
## Call:
## omcdiag(mod = m0pl3)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.0000 1
## Farrar Chi-Square: 21150.2540 1
## Red Indicator: 0.6945 1
## Sum of Lambda Inverse: 12886.7179 1
## Theil's Method: 2.8874 1
## Condition Number: 1673.7470 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
#centering data
z1<-z0
z1$EGF<-z1$EGF-mean(z1$EGF)
z1$TGF<-z1$TGF-mean(z1$TGF)
m0pl4<-lm(CD~I(TGF)+I(TGF^2)+EGF+I(EGF^2)+I(TGF*EGF)+I((TGF*EGF)^2), data=z1)
(sumpl4<-summary(m0pl4))
##
## Call:
## lm(formula = CD ~ I(TGF) + I(TGF^2) + EGF + I(EGF^2) + I(TGF *
## EGF) + I((TGF * EGF)^2), data = z1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.797 -7.434 0.271 7.652 33.358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.232e+01 5.853e-01 89.381 < 2e-16 ***
## I(TGF) -6.233e-04 7.586e-04 -0.822 0.4115
## I(TGF^2) -6.064e-05 1.605e-06 -37.789 < 2e-16 ***
## EGF 3.144e-03 1.641e-03 1.916 0.0556 .
## I(EGF^2) -2.919e-04 8.123e-06 -35.937 < 2e-16 ***
## I(TGF * EGF) 2.322e-05 2.804e-06 8.284 3.82e-16 ***
## I((TGF * EGF)^2) 3.634e-10 1.663e-11 21.855 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.68 on 993 degrees of freedom
## Multiple R-squared: 0.7444, Adjusted R-squared: 0.7429
## F-statistic: 482.1 on 6 and 993 DF, p-value: < 2.2e-16
omcdiag(m0pl4)
##
## Call:
## omcdiag(mod = m0pl4)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.2879 0
## Farrar Chi-Square: 1240.4988 1
## Red Indicator: 0.2536 0
## Sum of Lambda Inverse: 10.0227 0
## Theil's Method: -2.0098 0
## Condition Number: 4.8442 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
Here meaningfull changes in parameter estimation can be appreciated.
res2<-rbind(sumpl3$coefficients,sumpl4$coefficients) %>% data.frame()
res2$centering<-c(rep(F,nrow(sumpl3$coefficients)),rep(T,nrow(sumpl4$coefficients)))
res2$terms<-rep(rownames(sumpl3$coefficients),2)
rownames(res2)<-NULL
colnames(res2)[c(2,4)]<-c("Std.Error","P-value")
kableres2<-kable(res2[order(res2$terms),c(6,1:5)],caption="comparison of fitted model with and without centering",row.names =F)
kable_styling(kableres2,"striped", position = "center",full_width = F)
terms | Estimate | Std.Error | t.value | P-value | centering |
---|---|---|---|---|---|
(Intercept) | -643.3130163 | 108.1365030 | -5.9490829 | 0.0000000 | FALSE |
(Intercept) | 52.3160328 | 0.5853125 | 89.3813745 | 0.0000000 | TRUE |
EGF | 0.4019921 | 0.0992639 | 4.0497293 | 0.0000553 | FALSE |
EGF | 0.0031437 | 0.0016406 | 1.9161962 | 0.0556270 | TRUE |
I((TGF * EGF)^2) | 0.0000000 | 0.0000000 | -0.5171688 | 0.6051535 | FALSE |
I((TGF * EGF)^2) | 0.0000000 | 0.0000000 | 21.8545065 | 0.0000000 | TRUE |
I(EGF^2) | -0.0001633 | 0.0000181 | -9.0344551 | 0.0000000 | FALSE |
I(EGF^2) | -0.0002919 | 0.0000081 | -35.9365890 | 0.0000000 | TRUE |
I(TGF * EGF) | 0.0000335 | 0.0000283 | 1.1846090 | 0.2364555 | FALSE |
I(TGF * EGF) | 0.0000232 | 0.0000028 | 8.2841572 | 0.0000000 | TRUE |
I(TGF^2) | -0.0000365 | 0.0000034 | -10.6814684 | 0.0000000 | FALSE |
I(TGF^2) | -0.0000606 | 0.0000016 | -37.7885974 | 0.0000000 | TRUE |
TGF | 0.2128521 | 0.0430040 | 4.9495937 | 0.0000009 | FALSE |
TGF | -0.0006233 | 0.0007586 | -0.8216315 | 0.4114839 | TRUE |
Overall Multicollinearity Diagnostics
omcdiag(m0pl3)
##
## Call:
## omcdiag(mod = m0pl3)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.0000 1
## Farrar Chi-Square: 21150.2540 1
## Red Indicator: 0.6945 1
## Sum of Lambda Inverse: 12886.7179 1
## Theil's Method: 2.8874 1
## Condition Number: 1673.7470 1
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
omcdiag(m0pl4)
##
## Call:
## omcdiag(mod = m0pl4)
##
##
## Overall Multicollinearity Diagnostics
##
## MC Results detection
## Determinant |X'X|: 0.2879 0
## Farrar Chi-Square: 1240.4988 1
## Red Indicator: 0.2536 0
## Sum of Lambda Inverse: 10.0227 0
## Theil's Method: -2.0098 0
## Condition Number: 4.8442 0
##
## 1 --> COLLINEARITY is detected by the test
## 0 --> COLLINEARITY is not detected by the test
Huge improvement of Multicollinearity Diagnostics when centering.