This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
Maestría en sanidad animal Disciplina: Epidemiología avanzada Fonte: Dohoo et. al (2003) Veterinary Epidemiologic Reseach dataset: daisy2
# Vamos a trabajar analizando cómo podemos evaluar la importancia de las variables predictoras. Veremos el efecto de la producción de leche, parto, placenta retenida y flujo vaginal en el período de espera de las vacas.
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning in register(): Can't find generic `scale_type` in package ggplot2 to
## register S3 method.
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Archivos originales en:
# "http://projects.upei.ca/ver/files/2014/11/ver2_data_R.zip"
setwd("~/Dropbox/1.UNL/1 Modelización/Laboratorio Regresión linear/1.1 Regresion linear/")
# Leyendo los datos
load("daisy2.RData")
# labels = c(region = 'Region',
# herd = 'Herd number',
# cow = 'Cow number',
# study_lact = 'Study lactation number',
# herd_size = 'Herd size',
# mwp = "Minimum wait period for herd",
# parity = 'Lactation number',
# milk120 = 'Milk volume in first 120 days of lactation',
# calv_dt = 'Calving date',
# cf = 'Calving to first service interval',
# fs = 'Conception at first service',
# cc = 'Calving to conception interval',
# wpc = 'Interval from wait period to conception',
# spc = 'Services to conception',
# twin = 'Twins born',
# dyst = 'Dystocia at calving',
# rp = 'Retained placenta at calving',
# vag_disch = 'Vaginal discharge observed',
# h7 = 'Indicator for 7 herd subset'),
# levels = list(fs = list('No' = 0, 'Yes' = 1),
# twin = list('No' = 0, 'Yes' = 1),
# dyst = list('No' = 0, 'Yes' = 1),
# rp = list('No' = 0, 'Yes' = 1),
# vag_disch = list('No' = 0, 'Yes' = 1)),
# units = c(milk120 = "litres"))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
#Eliminar las observaciones faltantes en producción de leche
daisy2 <- daisy2[!is.na(daisy2$milk120), ]
# Investiguemos una variable para comprobar su unidad
mean(daisy2$milk120)/120 #(litros)
## [1] 24.64322
hist(daisy2$milk120/120)
#Es razonable producción media diaria de 26 litros
summary(daisy2$milk120, na.rm = TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 989 2459 2922 2957 3400 6713
# Min. 1st Qu. Median Mean 3rd Qu. Max.
# 1110 2742 3226 3225 3693 5630
sd(daisy2$milk120, na.rm = TRUE)/120
## [1] 5.967396
lm.milk <- lm(milk120 ~ parity, data = daisy2)
summary(lm.milk)
##
## Call:
## lm(formula = milk120 ~ parity, data = daisy2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2010.3 -454.6 -36.3 413.5 3842.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2478.286 17.176 144.28 <2e-16 ***
## parity 195.807 5.385 36.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 667.8 on 6608 degrees of freedom
## (1485 observations deleted due to missingness)
## Multiple R-squared: 0.1667, Adjusted R-squared: 0.1666
## F-statistic: 1322 on 1 and 6608 DF, p-value: < 2.2e-16
plot(daisy2$parity, daisy2$milk120, cex=0.2)
# Grafico de la relación linear
lm.milk <- lm(milk120 ~ parity, data = daisy2)
confidence <- data.frame(parity = 1:7)
# Confidence bands
confidence.band <- predict(lm.milk, int = "c" , newdata = confidence)
confidence.band <- as.data.frame(cbind(confidence.band, parity = c(1:7)))
# Prediction bands
prediction.band <- predict(lm.milk, int = "p" , newdata = confidence)
prediction.band <- as.data.frame(cbind(prediction.band, parity = c(1:7)))
ggplot(data = daisy2, aes(x = as.numeric(parity), y = milk120)) +
geom_point(size=0.1) +
geom_smooth(data = confidence.band, aes(x = parity, y = lwr), method = lm,
se = FALSE, colour = "red", size=0.5) +
geom_smooth(data = confidence.band, aes(x = parity, y = upr), method = lm,
se = FALSE, colour = "red", size=0.5) +
geom_smooth(data = prediction.band, aes(x = parity, y = lwr), method = lm,
se = FALSE, colour = "blue", size=0.5) +
geom_smooth(data = prediction.band, aes(x = parity, y = upr), method = lm,
se = FALSE, colour = "blue", size=0.5) +
xlab("Número de lactancia") +
ylab("Volumen de leche en los 120 primeros dias de lactación")+
xlim(0,8)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 1502 rows containing missing values (geom_point).
# Transformación de variables
daisy2$milk120.sq <- daisy2$milk120^2
plot(daisy2$milk120, cex=0.2)
plot(daisy2$milk120.sq, cex=0.2)
table(daisy2$parity)
##
## 1 2 3 4 5 6 7 8 9 10 11
## 1648 1493 1376 1137 670 204 49 16 10 6 1
daisy2$parity.cat[as.numeric(daisy2$parity) >= 3] <- "3+"
daisy2$parity.cat[daisy2$parity == 2] <- "2"
daisy2$parity.cat[as.numeric(daisy2$parity) < 2] <- "1"
table(daisy2$parity.cat)
##
## 1 2 3+
## 1648 1493 3469
Algunas variables de interes
wpc = ‘Interval from wait period to conception’
rp = ‘Retained placenta at calving’
vag_disch = ‘Vaginal discharge observed’
parity = ‘Lactation number’,
wpc = ‘Intervalo desde el período de espera hasta la concepción’
rp = ‘placenta retenida al parto’
vag_disch = ‘Flujo vaginal observado’
parity = ‘Número de lactancia’,
lm.wpc <- lm(wpc ~ milk120 + milk120.sq + as.factor(parity.cat) + rp + vag_disch, data = daisy2)
summary(lm.wpc)
##
## Call:
## lm(formula = wpc ~ milk120 + milk120.sq + as.factor(parity.cat) +
## rp + vag_disch, data = daisy2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.65 -38.90 -15.61 25.41 250.07
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.804e+01 9.183e+00 9.587 < 2e-16 ***
## milk120 -1.547e-02 5.989e-03 -2.583 0.009827 **
## milk120.sq 2.994e-06 9.249e-07 3.237 0.001215 **
## as.factor(parity.cat)2 2.094e+00 2.123e+00 0.986 0.324052
## as.factor(parity.cat)3+ -8.191e-01 1.918e+00 -0.427 0.669332
## rp 9.818e+00 2.932e+00 3.349 0.000818 ***
## vag_disch 1.027e+01 4.830e+00 2.126 0.033583 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.2 on 5750 degrees of freedom
## (2338 observations deleted due to missingness)
## Multiple R-squared: 0.00744, Adjusted R-squared: 0.006404
## F-statistic: 7.183 on 6 and 5750 DF, p-value: 1.191e-07
# Ajustando un modelo de producción de leche evaluando importancia de
# variables predictoras
# Mejor modelo con mayor Adjusted R2 y variables
# Pista: conjunción entre linea 70 y linea 109
# Utilize los criterios para realizar una análisis linear múltiple
# Variables categoricas serán utilizadas como factores en este análisis
# Realice categorización de variables categoricas