library(data.table)
library(ggplot2)
library(caret)
library(jtools)
library(scales)
library(dplyr)
library(lattice)
library(MASS)
library(faraway)
library(cvTools)
library(lmtest)
library(latex2exp)
Loading the dataset
df<- data.table(read.table('DNAH1.txt', header = T, sep = " " , encoding = 'Latin-1'))
df<-subset(df, select = -c(X))
Variable transformation
df$position = as.numeric(df$position)
class(df$position)
## [1] "numeric"
df$base1 = as.factor(df$base)
df$base1 = as.numeric(df$base1)
class(df$base1)
## [1] "numeric"
df$height = as.numeric(df$height)
class(df$height)
## [1] "numeric"
New dichotomous (dummy) variables
df$base_a <- as.numeric(df$base1 == 1)
df$base_c <- as.numeric(df$base1 == 2)
df$base_g <- as.numeric(df$base1 == 3)
df$base_t <- as.numeric(df$base1 == 4)
lr=lm(height~position ,data = df)
plot(lr)
summary(lr)
##
## Call:
## lm(formula = height ~ position, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20891.8 -4171.7 -26.6 4390.7 28020.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36750.027 693.435 53.0 <2e-16 ***
## position -28.454 2.474 -11.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7517 on 478 degrees of freedom
## Multiple R-squared: 0.2167, Adjusted R-squared: 0.2151
## F-statistic: 132.3 on 1 and 478 DF, p-value: < 2.2e-16
ggplot(lr, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo 1") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
##QQplot Base
qqnorm(df$base1, main = "QQ Plot - Base")
qqline(df$base1)
## QQplot Position
qqnorm(df$position, main = "QQ Plot - Position")
qqline(df$position)
##QQplot Height
qqnorm(df$height, main = "QQ Plot - Height")
qqline(df$height)
##QQplot Normal
z <- rnorm(1000)  # reuse the same simulated sample for the points and the reference line
qqnorm(z, main = "QQ Plot - Normal")
qqline(z)
From the initial exploration of the data, the response variable (height) appears to follow an approximately normal distribution. A Box-Cox transformation may therefore prove unfruitful for the analysis, since it would likely leave the variable we are trying to estimate essentially unchanged.
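As a quick, informal check of that claim, a minimal sketch (assuming df is loaded as above): a Shapiro-Wilk test and a histogram with an overlaid normal density for the raw response.
# Informal normality check of height before applying any transformation
shapiro.test(df$height)
hist(df$height, breaks = 30, freq = FALSE, main = "Height", xlab = "height")
curve(dnorm(x, mean = mean(df$height), sd = sd(df$height)), add = TRUE, col = "red")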
#fit linear regression model
model <- lm(height~position, data = df)
#find optimal lambda for Box-Cox transformation
bc <- boxcox(df$height~df$position)
lambda <- bc$x[which.max(bc$y)]
lambda
## [1] 0.7474747
model_bc = lm(((height^lambda)-1)/(lambda) ~ position, data = df)
plot(model_bc)
summary(model_bc)
##
## Call:
## lm(formula = ((height^lambda) - 1)/(lambda) ~ position, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1641.71 -315.37 14.22 336.94 1883.91
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3453.6190 51.6807 66.83 <2e-16 ***
## position -2.1366 0.1844 -11.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 560.2 on 478 degrees of freedom
## Multiple R-squared: 0.2193, Adjusted R-squared: 0.2177
## F-statistic: 134.3 on 1 and 478 DF, p-value: < 2.2e-16
ggplot(model_bc, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo Box-Cox") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Therefore, applying the Box-Cox transformation produces no apparent change in the distribution or behaviour of the residuals of the response variable. This indicates that the transformation did not have the desired effect, given the distribution found earlier, and further supports the conclusion that the response variable, height, is approximately normally distributed.
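To make this comparison explicit, a small sketch reusing the model and model_bc objects fitted above, with the residual QQ plots side by side:
par(mfrow = c(1, 2))
qqnorm(resid(model), main = "Residuals - original scale"); qqline(resid(model))
qqnorm(resid(model_bc), main = "Residuals - Box-Cox scale"); qqline(resid(model_bc))
par(mfrow = c(1, 1))  # restore the plotting layout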
ggplot() +
geom_point(data = df, mapping = aes(x = position, y = height, color = base)) +
geom_smooth(data = subset(df, base == "T"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#D284F9") +
geom_smooth(data = subset(df, base == "A"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#E77C00") +
geom_smooth(data = subset(df, base == "C"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#42A11B") +
geom_smooth(data = subset(df, base == "G"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#0CACC4") +
theme_minimal() +
labs(x = 'Position', y = 'Height', color = NULL) +
theme(legend.position = 'bottom')
reg1<-lm(df$height~df$position)
df[,height1 := predict(reg1)]
summary(reg1)
##
## Call:
## lm(formula = df$height ~ df$position)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20891.8 -4171.7 -26.6 4390.7 28020.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36750.027 693.435 53.0 <2e-16 ***
## df$position -28.454 2.474 -11.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7517 on 478 degrees of freedom
## Multiple R-squared: 0.2167, Adjusted R-squared: 0.2151
## F-statistic: 132.3 on 1 and 478 DF, p-value: < 2.2e-16
ggplot() +
geom_segment(df, mapping = aes(x=df$position, xend=df$position, y=df$height, yend=df$height1)) +
geom_point(df, mapping = aes(x=df$position, y=df$height, color='Valores reales')) +
geom_smooth(df, mapping = aes(x=df$position, y=df$height1, color='Valores predichos'), method='loess',se=T, fullrange = T) +
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title="Modelo 1") +
theme(legend.position = 'bottom') +
scale_y_continuous(labels = number_format(scale = 1))
ggplot(reg1, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo 1") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(reg1,1, main="Modelo 1")
bptest(reg1)
##
## studentized Breusch-Pagan test
##
## data: reg1
## BP = 12.557, df = 1, p-value = 0.0003948
## Check independence of the residuals
dwtest(reg1)
##
## Durbin-Watson test
##
## data: reg1
## DW = 1.4019, p-value = 1.895e-11
## alternative hypothesis: true autocorrelation is greater than 0
## Normality of the residuals
shapiro.test(reg1$residuals)
##
## Shapiro-Wilk normality test
##
## data: reg1$residuals
## W = 0.98808, p-value = 0.0005897
mean((df$height - predict(reg1))^2)
## [1] 56265714
cvFit(reg1,data=df,y=df$height,x=df$position,K=10,cost=mspe,seed=2000)
## 10-fold CV results:
## CV
## 87982569
From the analysis of the sequence and the positions associated with the gene, we can conclude that the relationship has a negative slope. The relationship does not appear to be linear, however: the residuals show large variability and stray far from zero, so the model cannot fully explain the variance present in the data. In addition, the diagnostic tests, gathered in the sketch below, give further grounds against the simple linear fit, which does not hold up as a good model for explaining the population data.
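A minimal sketch (reusing reg1 and the lmtest functions loaded above; the table name is illustrative) that collects the three diagnostic p-values in one place:
diagnostics_reg1 <- data.frame(
  test    = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
  p_value = c(bptest(reg1)$p.value,
              dwtest(reg1)$p.value,
              shapiro.test(reg1$residuals)$p.value)
)
diagnostics_reg1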
Regarding the interaction between height and the bases, there does seem to be a relationship between this variable and the explanation of height. Even so, the supporting plot shows only minor differences between the bases and in the curves fitted to each of them. The base therefore does not appear to be the only variable involved in the overall relationship, but it may contribute information to the estimation, as the nested-model check below suggests.
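A quick way to probe that last point, as a sketch with illustrative model names (lm treats the character column base as a factor): a partial F test comparing the position-only model with one that also includes base.
m_pos      <- lm(height ~ position, data = df)
m_pos_base <- lm(height ~ position + base, data = df)
anova(m_pos, m_pos_base)  # does adding base significantly reduce the residual sum of squares?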
reg2<-lm(height~ position + I(position^2) ,data=df)
df[,height1 := predict(reg2)]
summary(reg2)
##
## Call:
## lm(formula = height ~ position + I(position^2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21847.6 -4215.0 -10.7 3998.8 28392.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.143e+04 1.010e+03 31.128 < 2e-16 ***
## position 3.614e+01 9.553e+00 3.783 0.000174 ***
## I(position^2) -1.325e-01 1.899e-02 -6.978 1.01e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7168 on 477 degrees of freedom
## Multiple R-squared: 0.2893, Adjusted R-squared: 0.2863
## F-statistic: 97.07 on 2 and 477 DF, p-value: < 2.2e-16
ggplot() +
geom_segment(df, mapping = aes(x=df$position, xend=df$position, y=df$height, yend=df$height1)) +
geom_point(df, mapping = aes(x=df$position, y=df$height, color='Valores reales')) +
geom_smooth(df, mapping = aes(x=df$position, y=df$height1, color='Valores predichos'), method='loess',se=T, fullrange = T) +
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title = "Modelo 2") +
theme(legend.position = 'bottom') +
scale_y_continuous(labels = number_format(scale = 1))
ggplot(reg2, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo 2") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(reg2,1, main="Modelo 2")
bptest(reg2)
##
## studentized Breusch-Pagan test
##
## data: reg2
## BP = 3.033, df = 2, p-value = 0.2195
## Check independence of the residuals
dwtest(reg2)
##
## Durbin-Watson test
##
## data: reg2
## DW = 1.5449, p-value = 1.809e-07
## alternative hypothesis: true autocorrelation is greater than 0
## Normality of the residuals
shapiro.test(reg2$residuals)
##
## Shapiro-Wilk normality test
##
## data: reg2$residuals
## W = 0.98467, p-value = 5.966e-05
mean((df$height - predict(reg2))^2)
## [1] 51054148
cvFit(reg2,data=df,y=df$height,x=df$position,K=10,cost=mspe,seed=2000)
## 10-fold CV results:
## CV
## 51794413
reg3<-lm(height ~ position + I(position^3), data=df)
df[,height1 := predict(reg3)]
summary(reg3)
##
## Call:
## lm(formula = height ~ position + I(position^3), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21365.1 -4061.8 45.8 4218.4 28535.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.324e+04 9.044e+02 36.754 < 2e-16 ***
## position 3.549e+00 6.026e+00 0.589 0.556
## I(position^3) -1.503e-04 2.598e-05 -5.787 1.3e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7274 on 477 degrees of freedom
## Multiple R-squared: 0.2681, Adjusted R-squared: 0.265
## F-statistic: 87.37 on 2 and 477 DF, p-value: < 2.2e-16
ggplot() +
geom_segment(df, mapping = aes(x=df$position, xend=df$position, y=df$height, yend=df$height1)) +
geom_point(df, mapping = aes(x=df$position, y=df$height, color='Valores reales')) +
geom_smooth(df, mapping = aes(x=df$position, y=df$height1, color='Valores predichos'), method='loess',se=T, fullrange = T) +
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title = "Modelo 3") +
theme(legend.position = 'bottom') +
scale_y_continuous(labels = number_format(scale = 1))
ggplot(reg3, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo 3") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(reg3,1, main="Modelo 3")
bptest(reg3)
##
## studentized Breusch-Pagan test
##
## data: reg3
## BP = 5.4089, df = 2, p-value = 0.06691
## Check independence of the residuals
dwtest(reg3)
##
## Durbin-Watson test
##
## data: reg3
## DW = 1.5003, p-value = 1.215e-08
## alternative hypothesis: true autocorrelation is greater than 0
## Normality of the residuals
shapiro.test(reg3$residuals)
##
## Shapiro-Wilk normality test
##
## data: reg3$residuals
## W = 0.98511, p-value = 7.891e-05
mean((df$height - predict(reg3))^2)
## [1] 52574684
cvFit(reg3,data=df,y=df$height,x=df$position,K=10,cost=mspe,seed=2000)
## 10-fold CV results:
## CV
## 53335272
reg4<-lm(height ~ position + log(position), data=df)
df[,height1 := predict(reg4)]
summary(reg4)
##
## Call:
## lm(formula = height ~ position + log(position), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22015.2 -3899.6 -154.2 3611.1 26318.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6438.334 3050.301 2.111 0.0353 *
## position -75.588 5.156 -14.660 <2e-16 ***
## log(position) 8010.523 788.759 10.156 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6823 on 477 degrees of freedom
## Multiple R-squared: 0.356, Adjusted R-squared: 0.3533
## F-statistic: 131.8 on 2 and 477 DF, p-value: < 2.2e-16
ggplot() +
geom_segment(df, mapping = aes(x=df$position, xend=df$position, y=df$height, yend=df$height1)) +
geom_point(df, mapping = aes(x=df$position, y=df$height, color='Valores reales')) +
geom_smooth(df, mapping = aes(x=df$position, y=df$height1, color='Valores predichos'), method='loess',se=T, fullrange = T, span = 0.09) +
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title = "Modelo 4") +
theme(legend.position = 'bottom') +
scale_y_continuous(labels = number_format(scale = 1))
ggplot(reg4, aes(x = .fitted, y = .resid)) + geom_point(color = "#11CACF", size = 1)+ geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
geom_smooth(method = "loess", se = T, color = "#BE11CF", linewidth = 1) +
labs(x = "Fitted values", y = "Residuals", title = "Plot de residuos Modelo 4") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
plot(reg4,1, main="Modelo 4")
bptest(reg4)
##
## studentized Breusch-Pagan test
##
## data: reg4
## BP = 4.0078, df = 2, p-value = 0.1348
## Check independence of the residuals
dwtest(reg4)
##
## Durbin-Watson test
##
## data: reg4
## DW = 1.7048, p-value = 0.0004307
## alternative hypothesis: true autocorrelation is greater than 0
## Normality of the residuals
shapiro.test(reg4$residuals)
##
## Shapiro-Wilk normality test
##
## data: reg4$residuals
## W = 0.97938, p-value = 2.571e-06
mean((df$height - predict(reg4))^2)
## [1] 46262420
cvFit(reg4,data=df,y=df$height,x=df$position,K=10,cost=mspe,seed=2000)
## 10-fold CV results:
## CV
## 46821497
Model 4 is therefore the best of the models generated for predicting the height variable. Its fitted residuals stay close to zero, and it minimises the MSPE relative to the other models, both in terms of the in-sample mean squared error and under 10-fold cross-validation (see the comparison sketch below).
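A comparison sketch, assuming reg1 through reg4 are fitted as above and that the cross-validation estimate is stored in the cv component of the cvFit result, as its printed output suggests (the list name is illustrative):
models <- list(m1 = reg1, m2 = reg2, m3 = reg3, m4 = reg4)
sapply(models, function(m) mean((df$height - fitted(m))^2))             # in-sample MSE
sapply(models, function(m) cvFit(m, data = df, y = df$height,
                                 K = 10, cost = mspe, seed = 2000)$cv)  # 10-fold CV error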
reg_b<-lm(height ~ position + log(position) + base_t*position + base_c*position + base_g*position , data=df)
df[,height1 := predict(reg_b)]
summary(reg_b)
##
## Call:
## lm(formula = height ~ position + log(position) + base_t * position +
## base_c * position + base_g * position, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22571.2 -3479.7 258.1 3748.6 23684.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9904.163 2998.391 3.303 0.00103 **
## position -74.693 5.804 -12.869 < 2e-16 ***
## log(position) 7386.516 761.017 9.706 < 2e-16 ***
## base_t -3146.165 1517.605 -2.073 0.03871 *
## base_c -9651.219 2017.717 -4.783 2.31e-06 ***
## base_g 3390.076 1581.369 2.144 0.03256 *
## position:base_t 8.720 5.384 1.620 0.10595
## position:base_c 41.978 7.536 5.570 4.29e-08 ***
## position:base_g -15.459 5.633 -2.744 0.00630 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6488 on 471 degrees of freedom
## Multiple R-squared: 0.425, Adjusted R-squared: 0.4152
## F-statistic: 43.51 on 8 and 471 DF, p-value: < 2.2e-16
ggplot() +
geom_segment(df, mapping = aes(x=df$position, xend=df$position, y=df$height, yend=df$height1)) +
geom_point(df, mapping = aes(x=df$position, y=df$height, color=df$base)) +
geom_smooth(df %>% filter(base_a == 1), mapping = aes(x=position, y=height1), color='orange', method='loess',se=F, fullrange = T, span = 0.09) +
geom_smooth(df %>% filter(base_t == 1), mapping = aes(x=position, y=height1), color='purple', method='loess',se=F, fullrange = T, span = 0.09)+
geom_smooth(df %>% filter(base_c == 1), mapping = aes(x=position, y=height1), color='darkgreen', method='loess',se=F, fullrange = T, span = 0.09)+
geom_smooth(df %>% filter(base_g == 1), mapping = aes(x=position, y=height1), color='darkcyan', method='loess',se=F, fullrange = T, span = 0.09)+
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title = "Modelo con interacción") +
theme(legend.position = 'bottom')
reg_c<-lm(height ~ position + log(position) + base_c*position, data=df)
df[,height_c := predict(reg_c)]
summary(reg_c)
##
## Call:
## lm(formula = height ~ position + log(position) + base_c * position,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22640.2 -3728.5 130.5 3871.8 25746.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9434.299 3002.093 3.143 0.00178 **
## position -77.078 4.984 -15.465 < 2e-16 ***
## log(position) 7500.195 767.442 9.773 < 2e-16 ***
## base_c -9579.102 1930.395 -4.962 9.73e-07 ***
## position:base_c 43.588 7.156 6.091 2.32e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6584 on 475 degrees of freedom
## Multiple R-squared: 0.4028, Adjusted R-squared: 0.3978
## F-statistic: 80.11 on 4 and 475 DF, p-value: < 2.2e-16
reg_a<-lm(height ~ position + log(position) + base_a*position, data=df)
df[,height_a := predict(reg_a)]
summary(reg_a)
##
## Call:
## lm(formula = height ~ position + log(position) + base_a * position,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21397.1 -3784.3 -154.2 3709.3 26920.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5473.368 3117.677 1.756 0.0798 .
## position -73.770 5.374 -13.727 <2e-16 ***
## log(position) 8055.397 789.235 10.207 <2e-16 ***
## base_a 1927.952 1282.692 1.503 0.1335
## position:base_a -4.973 4.677 -1.063 0.2882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6820 on 475 degrees of freedom
## Multiple R-squared: 0.3593, Adjusted R-squared: 0.3539
## F-statistic: 66.6 on 4 and 475 DF, p-value: < 2.2e-16
reg_t<-lm(height ~ position + log(position) + base_t*position, data=df)
df[,height_t := predict(reg_t)]
summary(reg_t)
##
## Call:
## lm(formula = height ~ position + log(position) + base_t * position,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21546.1 -3833.7 -107.7 3579.8 25804.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6962.094 3059.583 2.276 0.0233 *
## position -78.058 5.408 -14.433 <2e-16 ***
## log(position) 8057.373 788.815 10.215 <2e-16 ***
## base_t -2770.467 1447.146 -1.914 0.0562 .
## position:base_t 8.298 5.016 1.654 0.0987 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6811 on 475 degrees of freedom
## Multiple R-squared: 0.3609, Adjusted R-squared: 0.3555
## F-statistic: 67.06 on 4 and 475 DF, p-value: < 2.2e-16
reg_g<-lm(height ~ position + log(position) + base_g*position, data=df)
df[,height_g := predict(reg_g)]
summary(reg_g)
##
## Call:
## lm(formula = height ~ position + log(position) + base_g * position,
## data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22659.1 -3904.2 -110.7 3568.5 23819.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6216.538 2990.355 2.079 0.038166 *
## position -68.090 5.317 -12.806 < 2e-16 ***
## log(position) 7730.280 778.146 9.934 < 2e-16 ***
## base_g 5720.025 1487.196 3.846 0.000136 ***
## position:base_g -23.928 5.182 -4.617 5.01e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6687 on 475 degrees of freedom
## Multiple R-squared: 0.3841, Adjusted R-squared: 0.3789
## F-statistic: 74.04 on 4 and 475 DF, p-value: < 2.2e-16
ggplot() +
geom_point(data = df, mapping = aes(x = position, y = height, color = base)) +
geom_smooth(df, mapping = aes(x = position, y = height_t), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#D284F9") +
geom_smooth(df, mapping = aes(x = position, y = height_a), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#E77C00") +
geom_smooth(df, mapping = aes(x = position, y = height_c), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#42A11B") +
geom_smooth(df, mapping = aes(x = position, y = height_g), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#0CACC4") +
theme_minimal() +
labs(x = 'Position', y = 'Height', color = NULL) +
theme(legend.position = 'bottom')
By incorporating the “base” variable we can fit adjusted models under different specifications. The first fitted model interacts the “position” variable with the binary base indicators, which yields a model with different slopes that incorporates the information carried by the base. The second approach instead fits a separate estimate for each base; in that case, the regression lines differ only slightly from one another.
Given these differing fits, the difference between the two approaches lies in how they incorporate the base variable into the estimation. The interaction model explains the data better than the set of separate multiple regressions, and it also improves the fit relative to the model it builds on. From the standpoint of R^2 and of the graphical representation, the better model is therefore the interaction model, since it gives weight to the base variables, which carry an explanation that the previous models left out (see the nested-model test below).
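Since at this point reg_b (the interaction model) nests reg4, this claim can be checked with a partial F test; a minimal sketch:
anova(reg4, reg_b)  # do the base terms and their interactions with position add explanatory power?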
ggplot() +
geom_point(data = df, mapping = aes(x = position, y = height, color = base)) +
geom_smooth(data = subset(df, base == "T"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#D284F9") +
geom_smooth(data = subset(df, base == "A"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#E77C00") +
geom_smooth(data = subset(df, base == "C"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#42A11B") +
geom_smooth(data = subset(df, base == "G"), mapping = aes(x = position, y = height), method = 'loess', se = FALSE, fullrange = TRUE, linetype = "dashed", color = "#0CACC4") +
theme_minimal() +
labs(x = 'Position', y = 'Height', color = NULL) +
theme(legend.position = 'bottom')
reg_b<-lm(height ~ position + log(position) + I(position^2)*base + I(position^3)*base + I(position^4)*base + I(position^5)*base, data=df)
df[,height1 := predict(reg_b)]
summary(reg_b)
##
## Call:
## lm(formula = height ~ position + log(position) + I(position^2) *
## base + I(position^3) * base + I(position^4) * base + I(position^5) *
## base, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25357.4 -2834.2 71.7 3447.4 21275.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.885e+04 8.438e+03 3.419 0.000685 ***
## position 6.433e+02 2.554e+02 2.519 0.012108 *
## log(position) -3.141e+03 4.241e+03 -0.741 0.459208
## I(position^2) -6.216e+00 2.148e+00 -2.894 0.003984 **
## baseC -1.001e+04 2.655e+03 -3.772 0.000183 ***
## baseG -4.001e+03 2.533e+03 -1.579 0.114917
## baseT -5.327e+03 2.382e+03 -2.236 0.025812 *
## I(position^3) 2.616e-02 9.486e-03 2.758 0.006049 **
## I(position^4) -5.101e-05 1.982e-05 -2.574 0.010376 *
## I(position^5) 3.742e-08 1.551e-08 2.413 0.016216 *
## I(position^2):baseC 1.179e+00 8.702e-01 1.354 0.176263
## I(position^2):baseG 1.878e+00 6.131e-01 3.063 0.002321 **
## I(position^2):baseT 2.711e-01 6.182e-01 0.438 0.661281
## baseC:I(position^3) -8.726e-03 7.510e-03 -1.162 0.245851
## baseG:I(position^3) -1.526e-02 5.059e-03 -3.016 0.002700 **
## baseT:I(position^3) -6.282e-04 5.098e-03 -0.123 0.901972
## baseC:I(position^4) 2.477e-05 2.171e-05 1.141 0.254381
## baseG:I(position^4) 4.118e-05 1.425e-05 2.890 0.004033 **
## baseT:I(position^4) -3.969e-07 1.437e-05 -0.028 0.977976
## baseC:I(position^5) -2.390e-08 2.055e-08 -1.163 0.245478
## baseG:I(position^5) -3.677e-08 1.329e-08 -2.767 0.005879 **
## baseT:I(position^5) 1.189e-09 1.342e-08 0.089 0.929464
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6339 on 458 degrees of freedom
## Multiple R-squared: 0.4662, Adjusted R-squared: 0.4417
## F-statistic: 19.05 on 21 and 458 DF, p-value: < 2.2e-16
ggplot() +
geom_point(df, mapping = aes(x=df$position, y=df$height, color=df$base)) +
geom_smooth(df %>% filter(base1 == 1), mapping = aes(x=position, y=height1 ), color='orange', method='loess',se=F, fullrange = T, linetype='dashed', span = 0.09) +
geom_smooth(df %>% filter(base1 == 4), mapping = aes(x=position, y=height1), color='purple', method='loess',se=F, fullrange = T, linetype='dashed', span = 0.09)+
geom_smooth(df %>% filter(base1 == 2), mapping = aes(x=position, y=height1), color='#42A11B', method='loess',se=F, fullrange = T, linetype='dashed', span = 0.09)+
geom_smooth(df %>% filter(base1 == 3), mapping = aes(x=position, y=height1), color='darkcyan', method='loess',se=F, fullrange = T, linetype='dashed', span = 0.09)+
theme_minimal() +
labs(x='Position',y='Height',color=NULL, title = "Modelo Ajustado") +
theme(legend.position = 'bottom')
To carry out the requested fit, we consider a polynomial specification of position interacting with the base. The first plot shows the fitted curves separated by base, which gives an idea of what the final plot will look like once the regressions are pushed towards overfitting. The second plot shows the model fitted with polynomial terms up to degree 5, which produces a new fit on top of the original model. The new model improves on the R^2 and the residual sum of squares (SCR), and has a lower F statistic. However, it has a clear problem with the statistical significance of its parameters, since several of the terms added to the model are not statistically significant; the cross-validation sketch below probes whether the extra flexibility pays off out of sample.
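An out-of-sample check of this concern, as a sketch: refit the earlier interaction model under a new, hypothetical name (reg_int, since reg_b has been overwritten above) and compare its 10-fold cross-validation error with that of the degree-5 model.
reg_int <- lm(height ~ position + log(position) + base_t*position +
                base_c*position + base_g*position, data = df)  # reg_int is an illustrative name
cvFit(reg_int, data = df, y = df$height, K = 10, cost = mspe, seed = 2000)  # interaction model
cvFit(reg_b,   data = df, y = df$height, K = 10, cost = mspe, seed = 2000)  # degree-5 polynomial model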