library(tidyverse)
library(readxl)
library(janitor)
library(tidymodels)
library(GGally)
library(plotly)
library(ggeffects)
library(scatterplot3d)
library(car)
library(lmtest)
library(plot3D)
datos <- read_excel("Simpson-Calidad-Leche.xls") %>%
clean_names() %>%
mutate(across(c(fat_g_100_ml, protein_g_100_ml, lactose_g_100_ml,
inv_simpson), as.numeric))
datos
## # A tibble: 63 x 12
## id herd_id alimentation calving_date race somatic_cell_co~
## <chr> <chr> <chr> <dttm> <chr> <dbl>
## 1 BERT-L10-pl2~ TF1 traditional 2015-01-07 00:00:00 Fris~ 17
## 2 BERT-L2-pl2-~ TF1 traditional 2015-12-29 00:00:00 Fris~ 6
## 3 BERT-L4-pl2-~ TF1 traditional 2014-10-14 00:00:00 Fris~ 11
## 4 BERT-L5-pl2-~ TF1 traditional 2015-03-17 00:00:00 Brun~ 28
## 5 BERT-L7-pl2-~ TF1 traditional 2014-11-07 00:00:00 Fris~ 35
## 6 BERT-L8-pl2-~ TF1 traditional 2014-10-02 00:00:00 Fris~ 58
## 7 BRO-L1-pl2-E3 TF2 traditional 2015-01-15 00:00:00 Brun~ 111
## 8 BRO-L2-pl2-F3 TF2 traditional 2014-10-15 00:00:00 Brun~ 53
## 9 BRO-L5-pl2-A4 TF2 traditional 2014-10-10 00:00:00 Brun~ 88
## 10 BRO-L8-pl2-C4 TF2 traditional 2014-11-14 00:00:00 Brun~ 126
## # ... with 53 more rows, and 6 more variables: cbt_ufc_ml_x_1000 <dbl>,
## # fat_g_100_ml <dbl>, protein_g_100_ml <dbl>, lactose_g_100_ml <dbl>,
## # observed_otu <dbl>, inv_simpson <dbl>
datos %>%
glimpse()
## Rows: 63
## Columns: 12
## $ id <chr> "BERT-L10-pl2-F9", "BERT-L2-pl2-F8",~
## $ herd_id <chr> "TF1", "TF1", "TF1", "TF1", "TF1", "~
## $ alimentation <chr> "traditional", "traditional", "tradi~
## $ calving_date <dttm> 2015-01-07, 2015-12-29, 2014-10-14,~
## $ race <chr> "Frisona Italiana", "Frisona Italian~
## $ somatic_cell_count_cell_ml_x_1000 <dbl> 17, 6, 11, 28, 35, 58, 111, 53, 88, ~
## $ cbt_ufc_ml_x_1000 <dbl> 5, 2, 3, 4, 3, 4, 4, 6, 6, 5, 3, 5, ~
## $ fat_g_100_ml <dbl> 1.79, 1.17, 1.55, 2.30, 3.11, 1.98, ~
## $ protein_g_100_ml <dbl> 3.43, 3.60, 3.17, 4.13, 3.76, 3.63, ~
## $ lactose_g_100_ml <dbl> 5.39, 5.25, 5.06, 5.41, 4.84, 5.11, ~
## $ observed_otu <dbl> 254, 135, 296, 83, 169, 76, 127, 174~
## $ inv_simpson <dbl> 16.78, 16.91, 36.13, 1.92, 12.20, 1.~
datos %>% names()
## [1] "id" "herd_id"
## [3] "alimentation" "calving_date"
## [5] "race" "somatic_cell_count_cell_ml_x_1000"
## [7] "cbt_ufc_ml_x_1000" "fat_g_100_ml"
## [9] "protein_g_100_ml" "lactose_g_100_ml"
## [11] "observed_otu" "inv_simpson"
datos %>%
ggplot(aes(x = inv_simpson, fill = race)) +
geom_density(alpha = 0.5)
datos %>%
group_by(race) %>%
summarise(promedio = mean(inv_simpson),
desviacion = sd(inv_simpson)) %>%
ungroup() %>%
ggplot(aes(
x = race,
y = promedio,
ymin = promedio - desviacion,
ymax = promedio + desviacion
)) +
geom_point() +
geom_errorbar(width = 0.1)
promedio_general <- datos %>%
pull(inv_simpson) %>%
mean()
datos %>%
group_by(race) %>%
summarise(promedio = mean(inv_simpson),
desviacion = sd(inv_simpson)) %>%
ungroup() %>%
ggplot(aes(
x = race,
y = promedio,
ymin = promedio - desviacion,
ymax = promedio + desviacion
)) +
geom_point() +
geom_errorbar(width = 0.1) +
geom_hline(yintercept = promedio_general, lty = 2, color = "red")
datos %>%
group_by(race, alimentation) %>%
summarise(promedio = mean(inv_simpson, na.rm = TRUE)) %>%
ggplot(aes(x = race, y = promedio, color = alimentation)) +
geom_point() +
geom_line(aes(group = alimentation))
datos %>%
ggplot(aes(x = fat_g_100_ml, y = inv_simpson)) +
geom_point() +
geom_smooth(method = "lm")
\[y = mX + b\]
\[y = \beta_0 + \beta_1 X + \epsilon\]
\[m = \frac{\sum_{i = 1}^{n}(x_i - \bar{x})(y_i - \bar{y)}}{\sum_{i=1}^{n}(x_i - \bar{x})^2}\]
\[b = \bar{y} - m\ \bar{x}\]
promedio_x <- datos %>% pull(fat_g_100_ml) %>% mean()
promedio_y <- datos %>% pull(inv_simpson) %>% mean()
numerador <- sum( (datos$fat_g_100_ml - promedio_x) * (datos$inv_simpson - promedio_y) )
denominador <- sum( (datos$fat_g_100_ml - promedio_x)^2 )
pendiente <- numerador / denominador
pendiente
## [1] 2.989569
intercepto <- promedio_y - (pendiente * promedio_x)
intercepto
## [1] 47.32435
cor(datos$fat_g_100_ml, datos$inv_simpson) * (sd(datos$inv_simpson) / sd(datos$fat_g_100_ml) )
## [1] 2.989569
\[y = \beta X + \epsilon\]
\[y = \beta_0 + \beta_1 X + \epsilon\]
\[\beta = (X^{T}X)^{-1}X^{T}y\]
variable_x <- datos %>% pull(fat_g_100_ml)
mtx_intercepto <- rep(1, length(variable_x))
matriz_x <- cbind(mtx_intercepto, variable_x)
variable_y <- datos %>% pull(inv_simpson)
betas <- solve((t(matriz_x) %*% matriz_x)) %*% t(matriz_x) %*% variable_y
betas
## [,1]
## mtx_intercepto 47.324354
## variable_x 2.989569
modelo <- lm(inv_simpson ~ fat_g_100_ml, data = datos)
summary(modelo)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.922 -23.417 -2.524 24.259 70.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.324 8.115 5.832 2.23e-07 ***
## fat_g_100_ml 2.990 1.461 2.046 0.0451 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.76 on 61 degrees of freedom
## Multiple R-squared: 0.0642, Adjusted R-squared: 0.04886
## F-statistic: 4.185 on 1 and 61 DF, p-value: 0.0451
\[Inverso \ Simpson = 47.324354 + 2.989569 \times Fat + \epsilon\]
shapiro.test(datos$fat_g_100_ml)
##
## Shapiro-Wilk normality test
##
## data: datos$fat_g_100_ml
## W = 0.91921, p-value = 0.0005137
shapiro.test(datos$inv_simpson)
##
## Shapiro-Wilk normality test
##
## data: datos$inv_simpson
## W = 0.9781, p-value = 0.3223
cor(datos$fat_g_100_ml, datos$inv_simpson, method = "spearman")
## [1] 0.2478879
cor.test(datos$fat_g_100_ml, datos$inv_simpson, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: datos$fat_g_100_ml and datos$inv_simpson
## S = 31336, p-value = 0.05037
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.2478879
\[H_0: \beta_1 = 0 \\ H_1: \beta_1 \neq 0\]
modelo <- lm(inv_simpson ~ fat_g_100_ml, data = datos)
summary(modelo)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.922 -23.417 -2.524 24.259 70.728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.324 8.115 5.832 2.23e-07 ***
## fat_g_100_ml 2.990 1.461 2.046 0.0451 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.76 on 61 degrees of freedom
## Multiple R-squared: 0.0642, Adjusted R-squared: 0.04886
## F-statistic: 4.185 on 1 and 61 DF, p-value: 0.0451
confint(modelo, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 31.0983172 63.550391
## fat_g_100_ml 0.0674321 5.911707
par(mfrow = c(2, 2))
plot(modelo)
residuales <- residuals(modelo)
predichos <- fitted.values(modelo)
\[MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y_i)^2}\]
\[RMSE = \sqrt{MSE}\]
rmse_modelo <- rmse_vec(truth = datos$inv_simpson, estimate = predichos)
rmse_modelo
## [1] 31.25398
\[R^2 = \frac{\sum_{i=1}^{n}(\hat{y_i} - \bar{y})^2}{\sum_{i=1}^{n}(y_i - \bar{y})^2}\]
rsq_vec(truth = datos$inv_simpson, estimate = predichos)
## [1] 0.06420425
cor(x = datos$fat_g_100_ml, y = datos$inv_simpson, method = "pearson")^2
## [1] 0.06420425
set.seed(123)
particion <- initial_split(data = datos, prop = 0.8)
train <- training(particion)
test <- testing(particion)
range(train$fat_g_100_ml)
## [1] 1.14 10.65
range(test$fat_g_100_ml)
## [1] 1.26 8.42
modelo_lm <- lm(inv_simpson ~ fat_g_100_ml, data = train)
summary(modelo_lm)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.073 -22.351 -2.456 24.339 63.604
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.753 9.065 4.937 9.99e-06 ***
## fat_g_100_ml 3.167 1.583 2.000 0.0512 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.55 on 48 degrees of freedom
## Multiple R-squared: 0.07693, Adjusted R-squared: 0.0577
## F-statistic: 4 on 1 and 48 DF, p-value: 0.05117
predicciones_test <- predict(object = modelo_lm, newdata = test)
regresion <- linear_reg() %>%
set_engine("lm")
modelo_tidy <- regresion %>%
fit(inv_simpson ~ fat_g_100_ml, data = train)
modelo_tidy
## parsnip model object
##
##
## Call:
## stats::lm(formula = inv_simpson ~ fat_g_100_ml, data = data)
##
## Coefficients:
## (Intercept) fat_g_100_ml
## 44.753 3.167
predicciones_tidy_train <- predict(object = modelo_tidy, new_data = train)
predicciones_tidy_test <- predict(object = modelo_tidy, new_data = test)
rmse_train <- rmse_vec(truth = train$inv_simpson, estimate = predicciones_tidy_train$.pred)
rmse_test <- rmse_vec(truth = test$inv_simpson, estimate = predicciones_tidy_test$.pred)
rmse_train
## [1] 29.93056
rmse_test
## [1] 36.10619
set.seed(123)
submuestras <- vfold_cv(data = datos, v = 10)
recipesreceta1 <- recipe(inv_simpson ~ fat_g_100_ml, data = train)
modelo_regresion <- linear_reg() %>%
set_engine("lm")
# Opcional
parametros <- function (x) {
extract_fit_parsnip(x)
}
# Opcional
configuracion <- control_resamples(extract = parametros)
# Ajuste
resultado_receta1 <-
fit_resamples(modelo_regresion, receta1, submuestras,
control = configuracion)
collect_metrics(resultado_receta1)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 31.2 10 2.49 Preprocessor1_Model1
## 2 rsq standard 0.190 10 0.0398 Preprocessor1_Model1
# Opcional
parametros <- function(x) {
intercepto = NULL
pendiente = NULL
for (i in 1:10) {
intercepto[i] = x$.extracts[[i]]$.extracts[[1]][[3]][[1]][[1]]
pendiente[i] = x$.extracts[[i]]$.extracts[[1]][[3]][[1]][[2]]
}
res = data.frame(
intercepto = intercepto,
pendiente = pendiente
)
return(res)
}
parametros(x = resultado_receta1)
## intercepto pendiente
## 1 49.37120 2.611449
## 2 50.78098 2.297365
## 3 42.99229 4.058517
## 4 46.49610 3.296893
## 5 44.05488 3.254708
## 6 47.27489 2.934588
## 7 45.02721 3.437308
## 8 47.63553 3.064024
## 9 49.24848 2.364639
## 10 50.05766 2.695095
receta2 <- recipe(inv_simpson ~ fat_g_100_ml, data = train) %>%
step_log(fat_g_100_ml)
resultado_receta2 <-
fit_resamples(modelo_regresion, receta2, submuestras,
control = configuracion)
collect_metrics(resultado_receta2)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 rmse standard 31.1 10 2.36 Preprocessor1_Model1
## 2 rsq standard 0.197 10 0.0489 Preprocessor1_Model1
# Opcional
parametros(x = resultado_receta2)
## intercepto pendiente
## 1 45.80454 11.655483
## 2 48.41620 9.810375
## 3 39.87751 16.361077
## 4 44.99017 12.721149
## 5 41.53824 13.232614
## 6 44.80592 12.154279
## 7 42.47652 14.061353
## 8 44.81469 12.883684
## 9 48.13595 9.112506
## 10 46.91406 11.707854
datos_regmul <- datos %>%
select(somatic_cell_count_cell_ml_x_1000:lactose_g_100_ml, inv_simpson)
ggpairs(data = datos_regmul, lower = list(continuous = "smooth"))
scatterplot3d(x = datos_regmul$fat_g_100_ml,
y = datos_regmul$lactose_g_100_ml,
z = datos_regmul$inv_simpson,
pch = 16,
type = "h")
plot_ly(x = ~fat_g_100_ml, y = ~ lactose_g_100_ml, z = ~inv_simpson,
type = "scatter3d", data = datos_regmul)
modelo_nulo <- lm(inv_simpson ~ 1, data = datos_regmul)
modelo_simple1 <- lm(inv_simpson ~ fat_g_100_ml, data = datos_regmul)
modelo_simple2 <- lm(inv_simpson ~ lactose_g_100_ml, data = datos_regmul)
modelo_multiple <- lm(inv_simpson ~ fat_g_100_ml + lactose_g_100_ml, data = datos_regmul)
logLik(modelo_nulo)
## 'log Lik.' -308.3386 (df=2)
logLik(modelo_simple1)
## 'log Lik.' -306.2484 (df=3)
logLik(modelo_simple2)
## 'log Lik.' -305.3626 (df=3)
logLik(modelo_multiple)
## 'log Lik.' -305.3061 (df=4)
AIC(modelo_nulo, modelo_simple1, modelo_simple2, modelo_multiple)
## df AIC
## modelo_nulo 2 620.6773
## modelo_simple1 3 618.4967
## modelo_simple2 3 616.7252
## modelo_multiple 4 618.6122
BIC(modelo_nulo, modelo_simple1, modelo_simple2, modelo_multiple)
## df BIC
## modelo_nulo 2 624.9635
## modelo_simple1 3 624.9261
## modelo_simple2 3 623.1546
## modelo_multiple 4 627.1847
anova(modelo_simple2, modelo_multiple)
## Analysis of Variance Table
##
## Model 1: inv_simpson ~ lactose_g_100_ml
## Model 2: inv_simpson ~ fat_g_100_ml + lactose_g_100_ml
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 61 59833
## 2 60 59726 1 107.21 0.1077 0.7439
summary(modelo_multiple)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml + lactose_g_100_ml, data = datos_regmul)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.148 -23.529 -2.826 22.859 75.837
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 194.2863 109.1763 1.780 0.0802 .
## fat_g_100_ml 0.7276 2.2171 0.328 0.7439
## lactose_g_100_ml -27.2555 20.1925 -1.350 0.1822
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.55 on 60 degrees of freedom
## Multiple R-squared: 0.09178, Adjusted R-squared: 0.06151
## F-statistic: 3.032 on 2 and 60 DF, p-value: 0.05568
par(mfrow = c(2, 2))
plot(modelo_multiple)
vif(modelo_multiple)
## fat_g_100_ml lactose_g_100_ml
## 2.332753 2.332753
shapiro.test(residuals(modelo_multiple))
##
## Shapiro-Wilk normality test
##
## data: residuals(modelo_multiple)
## W = 0.97947, p-value = 0.3736
bptest(modelo_multiple)
##
## studentized Breusch-Pagan test
##
## data: modelo_multiple
## BP = 0.12571, df = 2, p-value = 0.9391
harvtest(modelo_multiple)
##
## Harvey-Collier test
##
## data: modelo_multiple
## HC = 3.9452, df = 59, p-value = 0.0002146
residualPlots(modelo_multiple)
## Test stat Pr(>|Test stat|)
## fat_g_100_ml -0.3312 0.74163
## lactose_g_100_ml -1.7092 0.09266 .
## Tukey test -1.8581 0.06316 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
marginalModelPlots(modelo_multiple)
ggpredict(modelo_multiple)
## $fat_g_100_ml
## # Predicted values of inv_simpson
##
## fat_g_100_ml | Predicted | 95% CI
## -----------------------------------------
## 1 | 58.98 | [40.60, 77.36]
## 2 | 59.71 | [45.15, 74.27]
## 4 | 61.16 | [52.58, 69.75]
## 5 | 61.89 | [54.06, 69.71]
## 6 | 62.62 | [53.32, 71.92]
## 7 | 63.34 | [51.11, 75.57]
## 8 | 64.07 | [48.25, 79.89]
## 11 | 66.25 | [38.34, 94.17]
##
## Adjusted for:
## * lactose_g_100_ml = 4.99
##
## $lactose_g_100_ml
## # Predicted values of inv_simpson
##
## lactose_g_100_ml | Predicted | 95% CI
## ----------------------------------------------
## 4.40 | 77.88 | [53.22, 102.53]
## 4.50 | 75.15 | [54.21, 96.09]
## 4.70 | 69.70 | [55.79, 83.61]
## 4.80 | 66.97 | [56.12, 77.83]
## 4.90 | 64.25 | [55.66, 72.83]
## 5.10 | 58.80 | [49.89, 67.70]
## 5.20 | 56.07 | [44.71, 67.43]
## 5.50 | 47.90 | [26.30, 69.49]
##
## Adjusted for:
## * fat_g_100_ml = 4.83
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "modelo_multiple"
ggpredict(modelo_multiple) %>% plot()
## $fat_g_100_ml
##
## $lactose_g_100_ml
# Cargando la función para graficar hiperplanos 3D
source("plot3d_regresion.R")
plot3d_regresion(
var_x1 = "fat_g_100_ml",
var_x2 = "lactose_g_100_ml",
var_rta = "inv_simpson",
tipo = "interactivo",
datos = datos_regmul,
modelo = modelo_multiple
) %>%
layout(width = 700)
plot3d_regresion(
var_x1 = "fat_g_100_ml",
var_x2 = "lactose_g_100_ml",
var_rta = "inv_simpson",
tipo = "otro",
datos = datos_regmul,
modelo = modelo_multiple
)
I(): sólo incorpora al modelo el
término deseadopoly(): incorpora todos los términos
hasta cierto gradomodelo_cuadratico <-
lm(inv_simpson ~ fat_g_100_ml + lactose_g_100_ml + I(lactose_g_100_ml ^ 2),
data = datos_regmul)
summary(modelo_cuadratico)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml + lactose_g_100_ml +
## I(lactose_g_100_ml^2), data = datos_regmul)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.788 -24.785 2.112 20.566 72.695
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2010.467 1294.374 -1.553 0.1257
## fat_g_100_ml 1.192 2.199 0.542 0.5899
## lactose_g_100_ml 854.553 516.290 1.655 0.1032
## I(lactose_g_100_ml^2) -87.942 51.451 -1.709 0.0927 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.06 on 59 degrees of freedom
## Multiple R-squared: 0.1346, Adjusted R-squared: 0.09063
## F-statistic: 3.06 on 3 and 59 DF, p-value: 0.03507
residualPlots(modelo_cuadratico)
## Test stat Pr(>|Test stat|)
## fat_g_100_ml 0.1258 0.9003
## lactose_g_100_ml -0.7542 0.4538
## I(lactose_g_100_ml^2) -1.4537 0.1514
## Tukey test -1.2141 0.2247
marginalModelPlots(modelo_cuadratico)
AIC(modelo_multiple, modelo_cuadratico)
## df AIC
## modelo_multiple 4 618.6122
## modelo_cuadratico 5 617.5674
anova(modelo_multiple, modelo_cuadratico)
## Analysis of Variance Table
##
## Model 1: inv_simpson ~ fat_g_100_ml + lactose_g_100_ml
## Model 2: inv_simpson ~ fat_g_100_ml + lactose_g_100_ml + I(lactose_g_100_ml^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 60 59726
## 2 59 56908 1 2817.9 2.9215 0.09266 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot3d_regresion(
var_x1 = "fat_g_100_ml",
var_x2 = "lactose_g_100_ml",
var_rta = "inv_simpson",
tipo = "interactivo",
datos = datos_regmul,
modelo = modelo_cuadratico
) %>%
layout(width = 700)
modelo_cat1 <- lm(inv_simpson ~ fat_g_100_ml + race, data = datos)
summary(modelo_cat1)
##
## Call:
## lm(formula = inv_simpson ~ fat_g_100_ml + race, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.864 -23.221 -5.681 22.128 68.233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.9053 11.2673 3.985 0.000191 ***
## fat_g_100_ml 3.5700 1.5934 2.240 0.028905 *
## raceFrisona Italiana 2.5404 9.3698 0.271 0.787253
## raceMixed Race -25.4940 17.4724 -1.459 0.149932
## racePezzata Rossa Italiana -0.5933 14.8150 -0.040 0.968195
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.87 on 58 degrees of freedom
## Multiple R-squared: 0.1043, Adjusted R-squared: 0.04258
## F-statistic: 1.689 on 4 and 58 DF, p-value: 0.1648
model.matrix(inv_simpson ~ fat_g_100_ml + race, data = datos)
## (Intercept) fat_g_100_ml raceFrisona Italiana raceMixed Race
## 1 1 1.79 1 0
## 2 1 1.17 1 0
## 3 1 1.55 1 0
## 4 1 2.30 0 0
## 5 1 3.11 1 0
## 6 1 1.98 1 0
## 7 1 8.11 0 0
## 8 1 8.79 0 0
## 9 1 10.65 0 0
## 10 1 5.74 0 0
## 11 1 2.12 1 0
## 12 1 2.54 1 0
## 13 1 2.15 1 0
## 14 1 2.29 1 0
## 15 1 2.23 1 0
## 16 1 2.40 1 0
## 17 1 1.25 1 0
## 18 1 2.78 1 0
## 19 1 1.89 1 0
## 20 1 6.71 0 0
## 21 1 5.07 0 0
## 22 1 7.90 0 0
## 23 1 7.85 0 0
## 24 1 5.39 0 0
## 25 1 6.28 0 0
## 26 1 3.42 0 0
## 27 1 6.24 0 0
## 28 1 9.64 0 0
## 29 1 4.07 0 0
## 30 1 6.62 0 0
## 31 1 3.77 1 0
## 32 1 9.61 0 0
## 33 1 7.64 0 0
## 34 1 7.24 0 0
## 35 1 4.08 0 1
## 36 1 6.92 1 0
## 37 1 7.51 1 0
## 38 1 1.26 1 0
## 39 1 8.02 1 0
## 40 1 1.61 1 0
## 41 1 4.09 1 0
## 42 1 6.44 1 0
## 43 1 6.30 1 0
## 44 1 5.55 1 0
## 45 1 6.84 1 0
## 46 1 7.05 1 0
## 47 1 2.66 0 0
## 48 1 2.58 1 0
## 49 1 1.53 0 0
## 50 1 1.14 0 0
## 51 1 2.27 0 0
## 52 1 3.00 0 0
## 53 1 1.42 0 0
## 54 1 1.16 1 0
## 55 1 1.80 1 0
## 56 1 7.11 0 1
## 57 1 9.65 0 1
## 58 1 6.05 0 0
## 59 1 8.42 1 0
## 60 1 7.79 1 0
## 61 1 8.04 0 0
## 62 1 5.95 1 0
## 63 1 5.80 0 1
## racePezzata Rossa Italiana
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
## 7 0
## 8 0
## 9 0
## 10 0
## 11 0
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## 18 0
## 19 0
## 20 0
## 21 1
## 22 1
## 23 1
## 24 1
## 25 1
## 26 0
## 27 0
## 28 0
## 29 0
## 30 0
## 31 0
## 32 0
## 33 0
## 34 0
## 35 0
## 36 0
## 37 0
## 38 0
## 39 0
## 40 0
## 41 0
## 42 0
## 43 0
## 44 0
## 45 0
## 46 0
## 47 0
## 48 0
## 49 0
## 50 0
## 51 0
## 52 0
## 53 0
## 54 0
## 55 0
## 56 0
## 57 0
## 58 1
## 59 0
## 60 0
## 61 0
## 62 0
## 63 0
## attr(,"assign")
## [1] 0 1 2 2 2
## attr(,"contrasts")
## attr(,"contrasts")$race
## [1] "contr.treatment"
ggpredict(modelo_cat1)
## $fat_g_100_ml
## # Predicted values of inv_simpson
##
## fat_g_100_ml | Predicted | 95% CI
## ------------------------------------------
## 1 | 51.02 | [36.89, 65.14]
## 2 | 54.59 | [42.16, 67.01]
## 4 | 61.73 | [50.67, 72.78]
## 5 | 65.30 | [53.66, 76.93]
## 6 | 68.87 | [55.90, 81.83]
## 7 | 72.44 | [57.59, 87.28]
## 8 | 76.01 | [58.91, 93.10]
## 11 | 86.72 | [61.73, 111.70]
##
## Adjusted for:
## * race = Frisona Italiana
##
## $race
## # Predicted values of inv_simpson
##
## race | Predicted | 95% CI
## ---------------------------------------------------
## Frisona Italiana | 64.69 | [53.21, 76.17]
## Bruna Italiana | 62.15 | [48.33, 75.97]
## Pezzata Rossa Italiana | 61.56 | [35.58, 87.54]
## Mixed Race | 36.66 | [ 4.91, 68.40]
##
## Adjusted for:
## * fat_g_100_ml = 4.83
##
## attr(,"class")
## [1] "ggalleffects" "list"
## attr(,"model.name")
## [1] "modelo_cat1"
ejemplo <- ggpredict(modelo_cat1, terms = c("fat_g_100_ml", "race"))
ejemplo
## # Predicted values of inv_simpson
##
## # race = Frisona Italiana
##
## fat_g_100_ml | Predicted | 95% CI
## ------------------------------------------
## 1 | 51.02 | [36.89, 65.14]
## 3 | 58.16 | [46.82, 69.49]
## 5 | 65.30 | [53.66, 76.93]
## 7 | 72.44 | [57.59, 87.28]
## 11 | 86.72 | [61.73, 111.70]
##
## # race = Bruna Italiana
##
## fat_g_100_ml | Predicted | 95% CI
## ------------------------------------------
## 1 | 48.48 | [28.75, 68.20]
## 3 | 55.62 | [39.81, 71.42]
## 5 | 62.76 | [49.01, 76.50]
## 7 | 69.90 | [55.55, 84.24]
## 11 | 84.18 | [62.40, 105.95]
##
## # race = Pezzata Rossa Italiana
##
## fat_g_100_ml | Predicted | 95% CI
## ------------------------------------------
## 1 | 47.88 | [17.27, 78.49]
## 3 | 55.02 | [27.37, 82.67]
## 5 | 62.16 | [36.28, 88.04]
## 7 | 69.30 | [43.74, 94.86]
## 11 | 83.58 | [54.35, 112.81]
##
## # race = Mixed Race
##
## fat_g_100_ml | Predicted | 95% CI
## ------------------------------------------
## 1 | 22.98 | [-12.90, 58.87]
## 3 | 30.12 | [ -3.13, 63.38]
## 5 | 37.26 | [ 5.60, 68.92]
## 7 | 44.40 | [ 13.15, 75.65]
## 11 | 58.68 | [ 24.64, 92.72]
ejemplo %>%
ggplot(aes(x = x, y = predicted, color = group)) +
geom_line()
datos_completo <- datos %>%
select(-c(id, herd_id, calving_date, observed_otu))
modelo_global <- lm(inv_simpson ~ ., data = datos_completo)
summary(modelo_global)
##
## Call:
## lm(formula = inv_simpson ~ ., data = datos_completo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.374 -20.903 -3.046 19.717 70.574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 145.22107 120.28110 1.207 0.2327
## alimentationunifeed 22.61681 11.55803 1.957 0.0556 .
## raceFrisona Italiana -16.48994 12.52004 -1.317 0.1935
## raceMixed Race -37.70526 18.78129 -2.008 0.0498 *
## racePezzata Rossa Italiana -18.65634 18.40622 -1.014 0.3154
## somatic_cell_count_cell_ml_x_1000 0.01966 0.11628 0.169 0.8664
## cbt_ufc_ml_x_1000 2.48534 1.63476 1.520 0.1344
## fat_g_100_ml -0.04271 2.40554 -0.018 0.9859
## protein_g_100_ml -2.90644 12.33787 -0.236 0.8147
## lactose_g_100_ml -17.50667 23.00388 -0.761 0.4500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.8 on 53 degrees of freedom
## Multiple R-squared: 0.2353, Adjusted R-squared: 0.1055
## F-statistic: 1.812 on 9 and 53 DF, p-value: 0.08763
modelo_backward <- stats::step(modelo_global, direction = "backward", trace = 0)
modelo_backward
##
## Call:
## lm(formula = inv_simpson ~ alimentation + cbt_ufc_ml_x_1000 +
## lactose_g_100_ml, data = datos_completo)
##
## Coefficients:
## (Intercept) alimentationunifeed cbt_ufc_ml_x_1000
## 165.642 11.883 2.518
## lactose_g_100_ml
## -24.740
modelo_forward <- stats::step(modelo_global, direction = "forward", trace = 0)
modelo_forward
##
## Call:
## lm(formula = inv_simpson ~ alimentation + race + somatic_cell_count_cell_ml_x_1000 +
## cbt_ufc_ml_x_1000 + fat_g_100_ml + protein_g_100_ml + lactose_g_100_ml,
## data = datos_completo)
##
## Coefficients:
## (Intercept) alimentationunifeed
## 145.22107 22.61681
## raceFrisona Italiana raceMixed Race
## -16.48994 -37.70526
## racePezzata Rossa Italiana somatic_cell_count_cell_ml_x_1000
## -18.65634 0.01966
## cbt_ufc_ml_x_1000 fat_g_100_ml
## 2.48534 -0.04271
## protein_g_100_ml lactose_g_100_ml
## -2.90644 -17.50667
modelo_both <- stats::step(modelo_global, direction = "both", trace = 0)
modelo_both
##
## Call:
## lm(formula = inv_simpson ~ alimentation + cbt_ufc_ml_x_1000 +
## lactose_g_100_ml, data = datos_completo)
##
## Coefficients:
## (Intercept) alimentationunifeed cbt_ufc_ml_x_1000
## 165.642 11.883 2.518
## lactose_g_100_ml
## -24.740
AIC(modelo_backward, modelo_forward, modelo_both)
## df AIC
## modelo_backward 5 615.1520
## modelo_forward 11 621.7747
## modelo_both 5 615.1520
summary(modelo_backward)
##
## Call:
## lm(formula = inv_simpson ~ alimentation + cbt_ufc_ml_x_1000 +
## lactose_g_100_ml, data = datos_completo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.458 -22.312 -0.299 23.248 62.247
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.642 68.520 2.417 0.0187 *
## alimentationunifeed 11.883 7.788 1.526 0.1324
## cbt_ufc_ml_x_1000 2.518 1.478 1.703 0.0937 .
## lactose_g_100_ml -24.740 13.176 -1.878 0.0654 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30.47 on 59 degrees of freedom
## Multiple R-squared: 0.1672, Adjusted R-squared: 0.1248
## F-statistic: 3.948 on 3 and 59 DF, p-value: 0.01238