Richard Maclehose, Dennis Byrnes and the staff at Lakeside dairy, and the anonymous peer reviewers
library(knitr)
## Warning: package 'knitr' was built under R version 4.0.2
library(readr)
knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)
load(file="UHS-CM.Rdata")
load(file="uhsdata.Rdata")
load(file="UHS-CM-Cow.Rdata")
The video below shows an example of the video footage (viewed at 16x speed) that was used to conduct udder hygiene scoring.
subdays <- subdays[order(subdays$cow),]
head(subdays,20)
print(summarytools::dfSummary(subdays,valid.col=FALSE, graph.col=F, style="grid"))
## Data Frame Summary
## subdays
## Dimensions: 31383 x 26
## Duplicates: 28
##
## +----+---------------------+----------------------------+---------------------+----------+
## | No | Variable | Stats / Values | Freqs (% of Valid) | Missing |
## +====+=====================+============================+=====================+==========+
## | 1 | StudyDay | Mean (sd) : 43.1 (25.5) | 89 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 1 < 42 < 89 | | |
## | | | IQR (CV) : 44 (0.6) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 2 | Cow Number | 1. 25 | 92 ( 0.3%) | 0 |
## | | [character] | 2. 110 | 89 ( 0.3%) | (0%) |
## | | | 3. 1104 | 89 ( 0.3%) | |
## | | | 4. 1111 | 89 ( 0.3%) | |
## | | | 5. 1116 | 89 ( 0.3%) | |
## | | | 6. 1128 | 89 ( 0.3%) | |
## | | | 7. 1129 | 89 ( 0.3%) | |
## | | | 8. 113 | 89 ( 0.3%) | |
## | | | 9. 1148 | 89 ( 0.3%) | |
## | | | 10. 115 | 89 ( 0.3%) | |
## | | | [ 494 others ] | 30490 (97.2%) | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 3 | Tx | Min : 0 | 0 : 16266 (51.8%) | 0 |
## | | [numeric] | Mean : 0.5 | 1 : 15117 (48.2%) | (0%) |
## | | | Max : 1 | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 4 | CM | Min : 0 | 0 : 27993 (89.2%) | 0 |
## | | [numeric] | Mean : 0.1 | 1 : 3390 (10.8%) | (0%) |
## | | | Max : 1 | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 5 | Daysatrisk | Mean (sd) : 79.3 (17.6) | 56 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 4 < 89 < 89 | | |
## | | | IQR (CV) : 13 (0.2) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 6 | Age | Mean (sd) : 50.6 (20.4) | 412 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 11.3 < 45 < 117.4 | | |
## | | | IQR (CV) : 28.5 (0.4) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 7 | Parity | 1. 1 | 11226 (35.8%) | 0 |
## | | [factor] | 2. 2 | 7916 (25.2%) | (0%) |
## | | | 3. 3 | 6215 (19.8%) | |
## | | | 4. 4 | 6026 (19.2%) | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 8 | DIM at enrolment | Mean (sd) : 124.7 (102) | 209 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 0 < 120 < 421 | | |
## | | | IQR (CV) : 171 (0.8) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 9 | Enrolment date | min : 2016-01-16 | 49 distinct values | 0 |
## | | [Date] | med : 2016-01-16 | | (0%) |
## | | | max : 2016-04-10 | | |
## | | | range : 2m 25d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 10 | Exit date | min : 2016-01-17 | 62 distinct values | 0 |
## | | [Date] | med : 2016-04-14 | | (0%) |
## | | | max : 2016-04-14 | | |
## | | | range : 2m 28d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 11 | BirthDate | min : 2006-04-08 | 390 distinct values | 0 |
## | | [POSIXct, POSIXt] | med : 2012-04-24 | | (0%) |
## | | | max : 2015-02-05 | | |
## | | | range : 8y 9m 28d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 12 | CalveDate | min : 2014-11-21 | 257 distinct values | 0 |
## | | [Date] | med : 2015-09-18 | | (0%) |
## | | | max : 2016-04-10 | | |
## | | | range : 1y 4m 20d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 13 | Clinical case date | min : 2016-01-17 | 59 distinct values | 27993 |
## | | [Date] | med : 2016-03-08 | | (89.2%) |
## | | | max : 2016-04-14 | | |
## | | | range : 2m 28d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 14 | CMDIM | Mean (sd) : 161.4 (95.1) | 89 distinct values | 28007 |
## | | [numeric] | min < med < max: | | (89.24%) |
## | | | 0 < 148 < 348 | | |
## | | | IQR (CV) : 169 (0.6) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 15 | pdirty | Mean (sd) : 3.9 (1.2) | 89 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 1.3 < 3.8 < 6.8 | | |
## | | | IQR (CV) : 1.9 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 16 | Date | min : 2016-01-16 | 89 distinct values | 0 |
## | | [Date] | med : 2016-02-26 | | (0%) |
## | | | max : 2016-04-13 | | |
## | | | range : 2m 28d | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 17 | pdirty_1 | Mean (sd) : 3.9 (1.2) | 88 distinct values | 381 |
## | | [numeric] | min < med < max: | | (1.21%) |
## | | | 1.3 < 3.8 < 6.8 | | |
## | | | IQR (CV) : 1.9 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 18 | pdirty_2 | Mean (sd) : 3.9 (1.2) | 87 distinct values | 762 |
## | | [numeric] | min < med < max: | | (2.43%) |
## | | | 1.3 < 3.8 < 6.8 | | |
## | | | IQR (CV) : 1.9 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 19 | pdirty_3 | Mean (sd) : 3.9 (1.2) | 86 distinct values | 1143 |
## | | [numeric] | min < med < max: | | (3.64%) |
## | | | 1.3 < 3.9 < 6.8 | | |
## | | | IQR (CV) : 2.1 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 20 | pdirty_4 | Mean (sd) : 3.9 (1.2) | 85 distinct values | 1524 |
## | | [numeric] | min < med < max: | | (4.86%) |
## | | | 1.3 < 3.9 < 6.8 | | |
## | | | IQR (CV) : 2.1 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 21 | DIM | Mean (sd) : 162 (105.4) | 459 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 0 < 157 < 458 | | |
## | | | IQR (CV) : 169 (0.7) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 22 | cow | Mean (sd) : 1324.3 (1237) | 504 distinct values | 0 |
## | | [numeric] | min < med < max: | | (0%) |
## | | | 1 < 1316 < 9688 | | |
## | | | IQR (CV) : 471 (0.9) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 23 | CMD | Min : 0 | 0 : 31266 (99.6%) | 0 |
## | | [numeric] | Mean : 0 | 1 : 117 ( 0.4%) | (0%) |
## | | | Max : 1 | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 24 | cowday | 1 distinct value | 1 : 31383 (100.0%) | 0 |
## | | [numeric] | | | (0%) |
## +----+---------------------+----------------------------+---------------------+----------+
## | 25 | pdirty3_1 | Mean (sd) : 3.9 (1.2) | 86 distinct values | 1143 |
## | | [numeric] | min < med < max: | | (3.64%) |
## | | | 1.7 < 3.9 < 6.2 | | |
## | | | IQR (CV) : 1.9 (0.3) | | |
## +----+---------------------+----------------------------+---------------------+----------+
## | 26 | cmhx | Min : 0 | 0 : 13510 (43.0%) | 0 |
## | | [numeric] | Mean : 0.6 | 1 : 17873 (57.0%) | (0%) |
## | | | Max : 1 | | |
## +----+---------------------+----------------------------+---------------------+----------+
#print(summarytools::dfSummary(subdays,valid.col=FALSE, graph.magnif=0.8, style="grid"), method = "render")
#Describe subjects in study
library(dplyr)
colnames(Enrollcow)
## [1] "Cow Number" "Mutli lact" "Tx"
## [4] "CM" "CMCode" "Daysatrisk"
## [7] "Age" "Parity" "DIM at enrolment"
## [10] "Enrolment date" "Exit date" "BirthDate"
## [13] "CalveDate" "Clinical case date" "Pathogen"
## [16] "CMDIM" "calveposten" "dayin"
## [19] "dayout" "Delete"
library(table1)
table1(~ Age + factor(Parity) + `DIM at enrolment` + factor(Tx) + factor(CM), data=Enrollcow)
|                   | Overall (N=504)  |
|-------------------|------------------|
| Age               |                  |
| Mean (SD)         | 52.1 (21.4)      |
| Median [Min, Max] | 46.8 [11.3, 117] |
| factor(Parity)    |                  |
| 1                 | 170 (33.7%)      |
| 2                 | 121 (24.0%)      |
| 3                 | 106 (21.0%)      |
| 4                 | 52 (10.3%)       |
| 5                 | 29 (5.8%)        |
| 6                 | 18 (3.6%)        |
| 7                 | 7 (1.4%)         |
| 8                 | 1 (0.2%)         |
| DIM at enrolment  |                  |
| Mean (SD)         | 124 (112)        |
| Median [Min, Max] | 107 [2.00, 421]  |
| factor(Tx)        |                  |
| 0                 | 251 (49.8%)      |
| 1                 | 253 (50.2%)      |
| factor(CM)        |                  |
| 0                 | 388 (77.0%)      |
| 1                 | 116 (23.0%)      |
hist(subdays$pdirty_1)
library(geepack)
library(emmeans)
library(ggplot2)
subdays <- subdays[order(subdays$cow),]
#Using GEE
#GEE with clustering within Study Day. I have no reason to expect a particular pattern of covariance among cows within a study day, so I will use an independence working correlation structure.
subdays <- subdays[order(subdays$StudyDay),]
#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.107055 0.396936 320.581 < 2e-16 ***
## pdirty_1 0.340176 0.064194 28.082 1.16e-07 ***
## Parity2 0.285218 0.252675 1.274 0.25899
## Parity3 0.486700 0.306613 2.520 0.11243
## Parity4 0.653415 0.283786 5.301 0.02131 *
## DIM -0.003389 0.001078 9.882 0.00167 **
## Tx 0.516520 0.202302 6.519 0.01067 *
## cmhx -0.017775 0.208664 0.007 0.93211
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.9521 1.744
## Number of clusters: 88 Maximum cluster size: 391
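One informal way to compare working-correlation choices is the QIC criterion. A minimal sketch, assuming the QIC() method for geeglm objects that ships with recent versions of geepack:
#Sketch: QIC for the independence working-correlation fit
#(QIC() for geeglm objects is available in recent geepack versions)
QIC(m1)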
#Model 2 (using UHS from 2 days prior to day at risk)
m2 <- geeglm(CMD ~ pdirty_2 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_2)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m2)
##
## Call:
## geeglm(formula = CMD ~ pdirty_2 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_2)), id = StudyDay)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.13529 0.41523 295.29 < 2e-16 ***
## pdirty_2 0.35822 0.06179 33.61 6.7e-09 ***
## Parity2 0.25610 0.25483 1.01 0.3149
## Parity3 0.46238 0.31177 2.20 0.1380
## Parity4 0.67708 0.28510 5.64 0.0176 *
## DIM -0.00352 0.00110 10.28 0.0013 **
## Tx 0.48942 0.20281 5.82 0.0158 *
## cmhx -0.04752 0.20977 0.05 0.8208
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.958 1.93
## Number of clusters: 87 Maximum cluster size: 391
#Model 3 (using UHS from 3 days prior to day at risk)
m3 <- geeglm(CMD ~ pdirty_3 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_3)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m3)
##
## Call:
## geeglm(formula = CMD ~ pdirty_3 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_3)), id = StudyDay)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.01383 0.43631 258.42 < 2e-16 ***
## pdirty_3 0.33268 0.06686 24.76 6.5e-07 ***
## Parity2 0.25549 0.25479 1.01 0.3160
## Parity3 0.45997 0.31182 2.18 0.1402
## Parity4 0.67599 0.28530 5.61 0.0178 *
## DIM -0.00352 0.00110 10.24 0.0014 **
## Tx 0.49189 0.20255 5.90 0.0152 *
## cmhx -0.04222 0.20992 0.04 0.8406
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.97 1.96
## Number of clusters: 86 Maximum cluster size: 391
library(broom) #tidy() is from broom, which had not been loaded above
tab <- rbind(tidy(m1,conf.int=T)[2,],
tidy(m2,conf.int=T)[2,],
tidy(m3,conf.int=T)[2,]) %>% select(term,estimate,std.error,conf.low, conf.high)
tab1 <- rbind(tidy(m1,conf.int=T,exp=T)[2,],
tidy(m2,conf.int=T,exp=T)[2,],
tidy(m3,conf.int=T,exp=T)[2,]) %>% select(term,estimate,std.error,conf.low, conf.high)
tab1$RR <- paste0(round(tab1$estimate,1)," (",round(tab1$conf.low,1),", ",round(tab1$conf.high,1),")")
tab1 <- tab1 %>% select(term,RR)
tab <- merge(tab,tab1,by="term")
tab %>% select(term,estimate,std.error,RR)
Given that estimates are very similar when using UHS from 1-3 days prior to day at risk, I will only create a plot for UHS 1 day before day at risk.
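The code below rescales the emmeans output for plotting: predicted rates are per cow-day, so multiplying by 3000 (30 days x 100 cows) expresses them per 100 cow-months, and dividing pdirty by 10 converts it from its stored 0-10 scale to a proportion. A minimal arithmetic illustration with a hypothetical rate:
#Hypothetical illustration of the scaling applied below
rate_per_cow_day <- 0.003
rate_per_cow_day * 30 * 100 #= 9 cases per 100 cow-months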
emx <- emmeans(m1,~pdirty_1,type="response",at = list(pdirty_1=c(0:6))) %>% tidy
emx$pdirty <- emx$pdirty_1
emx$rate <- emx$rate * 3000
emx$asymp.LCL <- emx$asymp.LCL * 3000
emx$asymp.UCL <- emx$asymp.UCL * 3000
emx$pdirty <- emx$pdirty / 10
emx$std.error <- NULL
emx$pdirty_1 <- NULL
emx$df <- NULL
emx[, c(4,1,2,3)]
curve1 <- ggplot(emx) + aes(pdirty,rate) +
  coord_cartesian(xlim = c(0, 0.6), ylim = c(0,25)) +
  geom_smooth(size=1.2, se=F) +
  geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL),linetype=0,alpha=0.2) +
  labs(x = "Herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)",
       y = "Mastitis cases per 100 cow-months at risk") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        axis.text=element_text(size=12,family="Times"),
        axis.title=element_text(size=14,face="bold",family="Times"),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size = 0.02, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0.02, linetype = 'solid',colour = "grey"),
        legend.position="bottom",
        legend.text = element_text(colour="black", size=12,face="bold",family="Times")) +
  scale_color_brewer(palette="Set1")
curve1
#tiff("curve1.tiff", units="in", width=7, height=7, res=300)
#curve1
#dev.off()
Sensitivity analysis - use of quadratic and cubic terms
#Quadratic term
mm <- geeglm(CMD ~ pdirty_1 + I(pdirty_1^2) + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay)
summary(mm)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + I(pdirty_1^2) + Parity + DIM +
## Tx + cmhx, family = poisson, data = subdays %>% filter(!is.na(pdirty_1)),
## id = StudyDay)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -8.84288 0.92082 92.22 <2e-16 ***
## pdirty_1 1.19281 0.41937 8.09 0.0045 **
## I(pdirty_1^2) -0.09726 0.04729 4.23 0.0397 *
## Parity2 0.28219 0.25305 1.24 0.2648
## Parity3 0.48389 0.30675 2.49 0.1147
## Parity4 0.65406 0.28388 5.31 0.0212 *
## DIM -0.00338 0.00108 9.88 0.0017 **
## Tx 0.51540 0.20236 6.49 0.0109 *
## cmhx -0.01512 0.20890 0.01 0.9423
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.941 1.77
## Number of clusters: 88 Maximum cluster size: 391
#Cubic term (note: this reuses the name m2, overwriting the Model 2 object fitted earlier)
m2 <- geeglm(CMD ~ pdirty_1 + I(pdirty_1^2) + I(pdirty_1^3) +
Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m2)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + I(pdirty_1^2) + I(pdirty_1^3) +
## Parity + DIM + Tx + cmhx, family = poisson, data = subdays %>%
## filter(!is.na(pdirty_1)), id = StudyDay)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -12.41900 2.64145 22.10 2.6e-06 ***
## pdirty_1 3.96368 1.92315 4.25 0.0393 *
## I(pdirty_1^2) -0.76862 0.45042 2.91 0.0879 .
## I(pdirty_1^3) 0.05131 0.03343 2.36 0.1248
## Parity2 0.28142 0.25276 1.24 0.2655
## Parity3 0.48312 0.30670 2.48 0.1152
## Parity4 0.65365 0.28401 5.30 0.0214 *
## DIM -0.00338 0.00108 9.88 0.0017 **
## Tx 0.51552 0.20241 6.49 0.0109 *
## cmhx -0.01294 0.20898 0.00 0.9506
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.931 1.71
## Number of clusters: 88 Maximum cluster size: 391
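The nested linear, quadratic, and cubic fits can also be compared formally with Wald tests. A sketch, assuming geepack's anova method for geeglm objects applies here (m1 is the linear model fitted above, mm the quadratic, and m2 here the cubic):
#Sketch: Wald tests comparing nested GEE fits (anova.geeglm from geepack)
anova(mm, m1) #does the quadratic term improve on the linear model?
anova(m2, mm) #does the cubic term improve on the quadratic model?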
atx <- unique(subdays$pdirty_1)
emx <- emmeans(m1,~pdirty_1,type="response",at = list(pdirty_1=atx)) %>% tidy
emx$pdirty <- emx$pdirty_1
emx$std.error <- NULL
emx$pdirty_1 <- NULL
emx$df <- NULL
emx[, c(4,1,2,3)]
emx$asymp.LCL <- NULL
emx$asymp.UCL <- NULL
emx$curve <- "Main Model"
emx4 <- emmeans(mm,~pdirty_1,type="response",at = list(pdirty_1=atx)) %>% tidy
emx4$pdirty <- emx4$pdirty_1
emx4$std.error <- NULL
emx4$pdirty_1 <- NULL
emx4$df <- NULL
emx4[, c(4,1,2,3)]
emx4$asymp.LCL <- NULL
emx4$asymp.UCL <- NULL
emx4$curve <- "Model with quadratic term"
emx <- rbind(emx,emx4)
#Loess smoother of the raw binary outcome against pdirty_1, evaluated at each observation
smooth_vals <- predict(loess(as.integer(CMD)~pdirty_1, data=subdays, span=0.4), subdays$pdirty_1)
subdays$rate <- smooth_vals
emx2 <- subdays %>% select(pdirty_1,rate)
emx2$pdirty <- emx2$pdirty_1
emx2$pdirty_1 <- NULL
emx2$curve <- "Raw data with loess smoother"
emx3 <- rbind(emx,emx2)
curve1 <- ggplot(emx3) + aes(pdirty,rate,color=curve) + geom_line(size=1.2) +
  coord_cartesian(xlim = c(1, 6)) +
  labs(x = "Herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)",
       y = "Mastitis cases per cow-day at risk") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        axis.text=element_text(size=12,family="Times"),
        axis.title=element_text(size=14,face="bold",family="Times"),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size = 0.02, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0.02, linetype = 'solid',colour = "grey"),
        legend.position="bottom",
        legend.text = element_text(colour="black", size=12,face="bold",family="Times")) +
  scale_color_brewer(palette="Set1")
curve1
Check for predicted probabilities outside the range of 0 to 1 (with a log link, fitted probabilities are not constrained to stay below 1).
subdays$predict <- predict(m1,subdays,type="response")
hist(subdays$predict)
range(subdays$predict, na.rm=T)
## [1] 0.000272 0.026380
All predicted probabilities were between 0 and 1 for each row in the current dataset.
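A programmatic guard could make this check explicit; the sketch below simply errors if any fitted value falls outside [0, 1]:
#Sketch: fail loudly if any predicted probability is outside [0, 1]
stopifnot(all(subdays$predict >= 0 & subdays$predict <= 1, na.rm = TRUE))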
Sensitivity analysis - use compound symmetry (exchangeable) and AR(1) covariance structures
subdays <- subdays[order(subdays$cow),]
subdays <- subdays[order(subdays$StudyDay),]
#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)),
corstr = "exchangeable",
family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay,
## corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.10692 0.39636 321.50 <2e-16 ***
## pdirty_1 0.34000 0.06384 28.37 1e-07 ***
## Parity2 0.28676 0.25266 1.29 0.2564
## Parity3 0.48705 0.30655 2.52 0.1121
## Parity4 0.65377 0.28360 5.31 0.0212 *
## DIM -0.00339 0.00108 9.90 0.0017 **
## Tx 0.51667 0.20216 6.53 0.0106 *
## cmhx -0.01625 0.20854 0.01 0.9379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = exchangeable
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.951 1.74
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha -0.000236 0.00061
## Number of clusters: 88 Maximum cluster size: 391
#Model 1 refitted with an AR(1) working correlation structure
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)),
corstr = "ar1",
family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay,
## corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.10594 0.39701 320.36 < 2e-16 ***
## pdirty_1 0.34009 0.06412 28.13 1.1e-07 ***
## Parity2 0.28399 0.25305 1.26 0.2617
## Parity3 0.48930 0.30653 2.55 0.1104
## Parity4 0.65690 0.28328 5.38 0.0204 *
## DIM -0.00340 0.00108 9.94 0.0016 **
## Tx 0.51832 0.20228 6.57 0.0104 *
## cmhx -0.01946 0.20859 0.01 0.9257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = ar1
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.953 1.75
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha -0.00415 0.00793
## Number of clusters: 88 Maximum cluster size: 391
Sensitivity analysis 2 - control for clustering of cow-days within cows.
subdays <- subdays[order(subdays$cow),]
#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx,
data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=cow) # %>% tidy(exp=T,conf.int=T)
summary(m1)
##
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson,
## data = subdays %>% filter(!is.na(pdirty_1)), id = cow)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -7.10705 0.40598 306.45 <2e-16 ***
## pdirty_1 0.34018 0.06856 24.62 7e-07 ***
## Parity2 0.28522 0.26973 1.12 0.2903
## Parity3 0.48670 0.28823 2.85 0.0913 .
## Parity4 0.65341 0.30631 4.55 0.0329 *
## DIM -0.00339 0.00110 9.46 0.0021 **
## Tx 0.51652 0.19722 6.86 0.0088 **
## cmhx -0.01778 0.22383 0.01 0.9367
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 0.952 1.91
## Number of clusters: 504 Maximum cluster size: 91
Results are very similar.
Sensitivity analysis 3 - replicate the analysis for model 1 using a generalized linear mixed model.
library(lme4)
#Using Poisson - convergence issues, but estimates are very similar to GEE
glmm <- glmer(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | cow) + (1 | Date),
data = subdays %>% filter(!is.na(pdirty_1)), family=poisson)
summary(glmm)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: poisson ( log )
## Formula: CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | cow) + (1 |
## Date)
## Data: subdays %>% filter(!is.na(pdirty_1))
##
## AIC BIC logLik deviance df.resid
## 1466 1550 -723 1446 30992
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.409 -0.037 -0.027 -0.021 16.511
##
## Random effects:
## Groups Name Variance Std.Dev.
## cow (Intercept) 4.8393 2.200
## Date (Intercept) 0.0618 0.249
## Number of obs: 31002, groups: cow, 504; Date, 88
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.51181 0.63873 -13.33 < 2e-16 ***
## pdirty_1 0.37564 0.08403 4.47 7.8e-06 ***
## Parity2 0.34863 0.43613 0.80 0.4241
## Parity3 0.58890 0.46408 1.27 0.2045
## Parity4 0.77803 0.50606 1.54 0.1242
## DIM -0.00392 0.00140 -2.79 0.0052 **
## Tx 0.76994 0.30523 2.52 0.0117 *
## cmhx -0.08608 0.38246 -0.23 0.8219
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) pdrt_1 Party2 Party3 Party4 DIM Tx
## pdirty_1 -0.665
## Parity2 -0.278 0.002
## Parity3 -0.257 0.014 0.522
## Parity4 -0.201 0.012 0.534 0.558
## DIM -0.292 0.009 0.060 0.092 0.002
## Tx -0.325 0.021 -0.002 -0.053 -0.030 -0.006
## cmhx -0.041 0.000 -0.353 -0.434 -0.548 -0.222 0.051
## convergence code: 0
## Model failed to converge with max|grad| = 0.686311 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
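The warning above suggests rescaling. One common remedy, shown here as a sketch only (glmm_b is a hypothetical object, not the model reported in this analysis), is to standardize DIM and switch to the bobyqa optimizer:
#Sketch of a possible convergence remedy: standardize DIM and use the bobyqa optimizer
#(illustrative only; not the model reported above)
glmm_b <- glmer(CMD ~ pdirty_1 + Parity + scale(DIM) + Tx + cmhx + (1 | cow) + (1 | Date),
                data = subdays %>% filter(!is.na(pdirty_1)), family=poisson,
                control = glmerControl(optimizer = "bobyqa"))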
#Using Logistic - also has convergence issues. Similar results to GEE.
glmm <- glmer(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | Date),
data = subdays %>% filter(!is.na(pdirty_1)), family=binomial)
summary(glmm)
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula: CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | Date)
## Data: subdays %>% filter(!is.na(pdirty_1))
##
## AIC BIC logLik deviance df.resid
## 1508 1583 -745 1490 30993
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.164 -0.070 -0.055 -0.044 26.860
##
## Random effects:
## Groups Name Variance Std.Dev.
## Date (Intercept) 9.86e-13 9.93e-07
## Number of obs: 31002, groups: Date, 88
##
## Fixed effects:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.110734 0.417720 -17.02 < 2e-16 ***
## pdirty_1 0.342027 0.075765 4.51 6.4e-06 ***
## Parity2 0.286560 0.269629 1.06 0.28788
## Parity3 0.489284 0.285193 1.72 0.08623 .
## Parity4 0.656937 0.309260 2.12 0.03365 *
## DIM -0.003406 0.000971 -3.51 0.00045 ***
## Tx 0.519173 0.191286 2.71 0.00665 **
## cmhx -0.017962 0.228627 -0.08 0.93738
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) pdrt_1 Party2 Party3 Party4 DIM Tx
## pdirty_1 -0.795
## Parity2 -0.289 -0.011
## Parity3 -0.261 -0.004 0.547
## Parity4 -0.204 -0.005 0.537 0.580
## DIM -0.280 -0.002 0.131 0.154 0.054
## Tx -0.280 -0.008 -0.027 -0.055 -0.015 -0.005
## cmhx -0.070 -0.007 -0.270 -0.395 -0.520 -0.214 0.069
## convergence code: 0
## boundary (singular) fit: see ?isSingular
Setup of dataset
set.seed(99)
#Create dataset of UHS to test necessary sample size
library(tidyr)
hyg <- uhscount
hyg$count <- seq(1,406)
hyg <- hyg[, c(36,1:35)]
hyg <- pivot_longer(hyg,-count, names_to = "date", values_to = "score",values_drop_na = TRUE)
library(janitor)
hyg$score %>% tabyl
#Reformat scores to 0 (scores 1/2) and 1 (scores 3 and 4 = dirty)
hyg$score[hyg$score<3] <- 0
hyg$score[hyg$score>2] <- 1
maxperdate <- hyg %>% group_by(date) %>% summarise(max=max(count))
hyg <- merge(hyg,maxperdate,by="date")
hyg$quant <- hyg$count/hyg$max
#Create function to estimate pdirty when selecting a subset of the herd.
pdirt1 <- function(dat,samp.size){
select.date <- hyg %>% filter(hyg$date==dat) %>% select(score)
counts.today <- sum(!is.na(select.date$score))
sample.size <- min(counts.today,samp.size)
sample.scores <- sample(select.date$score,size = sample.size,replace = FALSE)
mean(sample.scores, na.rm=T)
}
#Test the function: estimate pdirty from a random sample on a single date
pdirt1(dat=42389,samp.size=1)
hyg <- hyg[order(hyg$count),]
#Flag the middle 80 cows of each milking (within 40 positions either side of the median)
hyg$middle80.criteria.min <- 0.5 - (1/hyg$max*40)
hyg$middle80.criteria.max <- 0.5 + (1/hyg$max*40)
hyg$middle80 <- 0
hyg$middle80[hyg$quant > hyg$middle80.criteria.min & hyg$quant < hyg$middle80.criteria.max] <- 1
hyg %>% filter(middle80==1) %>%
group_by(date) %>%
summarise(
middle80 = n())
last80 <- hyg %>%
group_by(date) %>%
slice(tail(row_number(), 80))
last80$last80 <- 1
first80 <- hyg %>%
group_by(date) %>%
slice(head(row_number(), 80))
first80$first80 <- 1
hyg <- merge(hyg,first80,all=T)
hyg$first80[is.na(hyg$first80)] <- 0
hyg <- merge(hyg,last80,all=T)
hyg$last80[is.na(hyg$last80)] <- 0
pdirt.first80 <- function(dat){
hyg %>% filter(hyg$date==dat & first80==1) %>% select(score) %>% summarise(mean=mean(score))}
pdirt.middle80 <- function(dat){
hyg %>% filter(hyg$date==dat & middle80==1) %>% select(score) %>% summarise(mean=mean(score))}
pdirt.last80 <- function(dat){
hyg %>% filter(hyg$date==dat & last80==1) %>% select(score) %>% summarise(mean=mean(score))}
#Create function to estimate pdirty when selecting a subset of the herd.
pdirt2 <- function(dat1){
pfull <- pdirt1(dat=dat1,samp.size=1000)
p350 <- pdirt1(dat=dat1,samp.size=350)
p300 <- pdirt1(dat=dat1,samp.size=300)
p250 <- pdirt1(dat=dat1,samp.size=250)
p200 <- pdirt1(dat=dat1,samp.size=200)
p150<- pdirt1(dat=dat1,samp.size=150)
p100 <- pdirt1(dat=dat1,samp.size=100)
p80 <- pdirt1(dat=dat1,samp.size=80)
p60 <- pdirt1(dat=dat1,samp.size=60)
p50 <- pdirt1(dat=dat1,samp.size=50)
p40 <- pdirt1(dat=dat1,samp.size=40)
p30 <- pdirt1(dat=dat1,samp.size=30)
p20 <- pdirt1(dat=dat1,samp.size=20)
p10 <- pdirt1(dat=dat1,samp.size=10)
pfirst80 <- pdirt.first80(dat=dat1)
pmiddle80 <- pdirt.middle80(dat=dat1)
plast80 <- pdirt.last80(dat=dat1)
#Differences between full dataset and sample
pd350 <- p350 - pfull
pd300 <- p300 - pfull
pd250 <- p250 - pfull
pd200 <- p200 - pfull
pd150 <- p150 - pfull
pd100 <- p100 - pfull
pd80 <- p80 - pfull
pd60 <- p60 - pfull
pd50 <- p50 - pfull
pd40 <- p40 - pfull
pd30 <- p30 - pfull
pd20 <- p20 - pfull
pd10 <- p10 - pfull
pdfirst80 <- pfirst80 - pfull
pdmiddle80 <- pmiddle80 - pfull
pdlast80 <- plast80 - pfull
tab <- as.data.frame(rbind(pfull,p350,p300,p250,p200,p150,p100,p80,p60,p50,p40,p30,p20,p10,
pfirst80,pmiddle80,plast80,
pd350,pd300,pd250,pd200,pd150,pd100,pd80,pd60,pd50,pd40,pd30,pd20,pd10,
pdfirst80,pdmiddle80,pdlast80))
tab$measure <- c("all cows",350,300,250,200,150,100,80,60,50,40,30,20,10,
"first80","middle80","last80",
"350diff","300diff","250diff","200diff","150diff","100diff","80diff","60diff",
"50diff","40diff","30diff","20diff","10diff",
"First80diff","Middle80diff","Last80diff")
tab$date <- dat1
tab
}
#pdirt2(42387)
#print(pdirt2(42387))
#Create list of dates
uhsdates <- unique(hyg$date)
#Combine into single dataframe
hygsum <- data.frame()
hygsum <- do.call(rbind.data.frame,
lapply(unique(uhsdates),
function(i){
pdirt2(i)}))
hygsum <- hygsum[, c(3,2,1)]
hygsum$prop <- hygsum$mean
hygsum$mean <- NULL
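The do.call/lapply pattern above could equivalently be written with purrr; a sketch, assuming purrr is installed (hygsum2 is a hypothetical object, and only the row-binding step is shown, without the subsequent reordering and renaming):
#Sketch: equivalent row-binding of per-date results with purrr
library(purrr)
hygsum2 <- map_dfr(uhsdates, pdirt2)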
library(blandr)
library(DescTools)
library(stringr)
library(ggplot2)
#Analysis 1 - report differences and sd of differences ----
tabdiff <- hygsum %>% group_by(measure) %>% summarise(mean=mean(prop), sd=sd(prop),hilow=Range(prop),
q5=quantile(probs=0.05,prop),q95=quantile(probs=0.95,prop))
tabdiff
tabdiffplot1 <- hygsum %>% filter(str_detect(measure, "diff")
& measure != "First80diff" & measure != "Middle80diff" & measure != "Last80diff")
tabdiffplot1$measure <- recode_factor(tabdiffplot1$measure,
"350diff" = "350",
"300diff" = "300",
"250diff" = "250",
"200diff" = "200",
"150diff" = "150",
"100diff" = "100",
"80diff" = "80",
"60diff" = "60",
"50diff" = "50",
"40diff" = "40",
"30diff" = "30",
"20diff" = "20",
"10diff" = "10",
.ordered = TRUE)
plot1 <- ggplot(tabdiffplot1, aes(x=measure, y=prop)) +
coord_flip(ylim = c(-.25,.25)) +
geom_boxplot(aes(fill=measure)) +
geom_jitter(width = 0.1) +
theme(legend.position="none") +
labs(y = "Deviation from TRUE herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", x="Number of cows scored") +
theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
geom_hline(yintercept=0, linetype="solid", color = "red") +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))
plot1
tiff("newplot1.tiff", units="in", width=7, height=7, res=300)
plot1
dev.off()
## quartz_off_screen
## 2
tabdiff1 <-hygsum %>% filter(measure == "First80diff" | measure == "Middle80diff" | measure == "Last80diff")
tabdiff1$measure <- recode_factor(tabdiff1$measure,
Last80diff = "Last 80",
Middle80diff = "Middle 80",
First80diff = "First 80",
.ordered = TRUE)
plot2 <- ggplot(tabdiff1, aes(x=measure, y=prop)) +
geom_boxplot(aes(fill=measure)) +
geom_jitter(width = 0.1) +
coord_flip(ylim = c(-.25,.25)) +
theme(legend.position="none") +
labs(y = "Deviation from TRUE herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", x="Stage of milking") +
theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
geom_hline(yintercept=0, linetype="solid", color = "red") +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))
plot2
tiff("newplot2.tiff", units="in", width=7, height=4, res=300)
plot2
dev.off()
## quartz_off_screen
## 2
#Analysis 2 - limits of agreement -----
#CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==10])
#?CCC
difffun <- function(group){
a <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group])[5] %>% as.data.frame %>% select(blalt.delta) %>%
summarise(mean = -mean(blalt.delta),
conf.low = -mean(blalt.delta) - 2.58*sd(blalt.delta),
conf.high = -mean(blalt.delta) + 2.58*sd(blalt.delta))
b <- paste0(sprintf("%.2f",round(a$mean, 2)), sep=""," (",
sprintf("%.2f",round(a$conf.low, 2)),", ",
sprintf("%.2f",round(a$conf.high, 2)),")")
as.data.frame(cbind(b,group))
}
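For reference, the limits computed inside difffun follow the Bland-Altman form: mean difference ± 2.58 × SD of the paired differences, where 2.58 is the 0.995 normal quantile, giving approximate 99% limits of agreement. A self-contained illustration with simulated differences (all values hypothetical):
#Sketch: Bland-Altman 99% limits of agreement on simulated paired differences
set.seed(123)
d <- rnorm(35, mean = 0.01, sd = 0.03) #hypothetical differences (sample minus full herd)
c(lower = mean(d) - 2.58*sd(d), upper = mean(d) + 2.58*sd(d))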
#difffun(350)
#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
"first80","middle80","last80")
#for (i in proplist) {
# print(corfun(i))
#}
#Combine into single dataframe
mdifftab <- data.frame()
mdifftab <- do.call(rbind.data.frame,
lapply(proplist,
function(i){
difffun(i)}))
colnames(mdifftab) <- c("mdiff","per")
mdifftab
#Analysis 2 - CCC-----
cccfun <- function(group){
tab2 <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group])[1] %>% as.data.frame %>% select('rho.c.est')
tab2$per <- group
tab2
}
cccfun(300)
#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
"first80","middle80","last80")
#Combine into single dataframe
cortab <- data.frame()
cortab <- do.call(rbind.data.frame,
lapply(proplist,
function(i){
cccfun(i)}))
cortab$R <- cortab$rho.c.est
cortab$rho.c.est <- NULL
cortab
More detailed reporting
func4limitsofagreement <- function(group){
a <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group],conf.level = 0.99)[[5]]
meandiff.value <- -mean(a$delta)
meandiff.sd <- sd(a$delta)
meandiff.se <- meandiff.sd / sqrt(nrow(a))
meandiff.99 <- 2.58*meandiff.se
lolim99 <- -meandiff.value - 2.58*meandiff.sd
uplim99 <- -meandiff.value + 2.58*meandiff.sd
lim99.se <- meandiff.se * 1.71
lim99.99ci <- lim99.se * 2.58
c3 <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group],conf.level = 0.99)[[1]]
c3$ccc <- c3$est
c3$ccc.lwr.ci <- c3$lwr.ci
c3$ccc.upr.ci <- c3$upr.ci
cbind(group,c3 %>% select(ccc,ccc.lwr.ci,ccc.upr.ci),meandiff.value,meandiff.se,meandiff.99,lolim99,uplim99,lim99.se,lim99.99ci)
}
#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
"first80","middle80","last80")
cortab2 <- data.frame()
cortab2 <- do.call(rbind.data.frame,
lapply(proplist,
function(i){
func4limitsofagreement(i)}))
cortab2
cortab2$ccc.sum <- paste0(sprintf("%.2f",round(cortab2$ccc, 2)), sep=""," (",
sprintf("%.2f",round(cortab2$ccc.lwr.ci, 2)),", ",
sprintf("%.2f",round(cortab2$ccc.upr.ci, 2)),")")
cortab2
The variable ‘quant’ was used for this analysis: for each milking, it records the exact quantile of the milking order at which the cow came in. Results will be reported as an RR per decile (i.e., the relative change in risk for every 10% increase in position).
hyg$quant <- hyg$quant * 10
gee <- geeglm(score ~ quant,data=hyg,id=date,family=binomial(link="log"))
summary(gee)
##
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"),
## data = hyg, id = date)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -1.03196 0.07433 192.8 < 2e-16 ***
## quant 0.02158 0.00622 12.1 0.00052 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 1 0.0215
## Number of clusters: 35 Maximum cluster size: 406
hyg$quant2 <- hyg$quant^2
gee1 <- geeglm(score ~ quant + quant2,data=hyg,id=date,family=binomial(link="log"))
summary(gee1)
##
## Call:
## geeglm(formula = score ~ quant + quant2, family = binomial(link = "log"),
## data = hyg, id = date)
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -1.06994 0.07223 219.43 <2e-16 ***
## quant 0.04312 0.01491 8.36 0.0038 **
## quant2 -0.00210 0.00141 2.20 0.1379
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = independence
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 1 0.0214
## Number of clusters: 35 Maximum cluster size: 406
tab <- tidy(gee, conf.int=T)
tab$estimate <- (tab$estimate) %>% exp
tab$conf.low <- (tab$conf.low) %>% exp
tab$conf.high <- (tab$conf.high) %>% exp
tab$RR <- paste0(round(tab$estimate,2)," (",round(tab$conf.low,2),", ",round(tab$conf.high,2),")")
tab %>% select(term,RR)
RR = 1.02. Therefore, a cow that comes in 10% later in the milking order is expected to have 1.02 times the risk of being dirty (scores 3 or 4).
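As a rough worked check on this interpretation, using the quant coefficient of 0.0216 reported above (quant is scaled so the full milking order spans 10 units):
#Worked check using the reported quant coefficient (0.0216 per decile of milking order)
exp(0.0216) #RR per 10% later in the order, ~1.02
exp(10 * 0.0216) #implied RR comparing the last cow with the first, ~1.24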
Sensitivity analysis using different covariance structures.
gee2 <- geeglm(score ~ quant,data=hyg,id=date,corstr = "ar1",
family=binomial(link="log"))
summary(gee2)
##
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"),
## data = hyg, id = date, corstr = "ar1")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -1.03231 0.07433 192.9 < 2e-16 ***
## quant 0.02163 0.00621 12.1 0.00049 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = ar1
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 1 0.0215
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.119 0.0241
## Number of clusters: 35 Maximum cluster size: 406
gee3 <- geeglm(score ~ quant,data=hyg,id=date,corstr = "exchangeable",
family=binomial(link="log"))
summary(gee3)
##
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"),
## data = hyg, id = date, corstr = "exchangeable")
##
## Coefficients:
## Estimate Std.err Wald Pr(>|W|)
## (Intercept) -1.03955 0.07482 193.1 < 2e-16 ***
## quant 0.02171 0.00625 12.1 0.00051 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation structure = exchangeable
## Estimated Scale Parameters:
##
## Estimate Std.err
## (Intercept) 1 0.0236
## Link = identity
##
## Estimated Correlation Parameters:
## Estimate Std.err
## alpha 0.0774 0.0157
## Number of clusters: 35 Maximum cluster size: 406
Check for predicted probabilities outside the range of 0 to 1
hyg$predict <- predict(gee,hyg,type="response")
hist(hyg$predict)
range(hyg$predict, na.rm=T)
## [1] 0.356 0.442
All predicted probabilities were between 0 and 1 for each row in the current dataset.
#OLD ANALYSIS BELOW (superseded by the analysis above; retained for reference)
set.seed(1)
#Create dataset of UHS to test necessary sample size
library(tidyr)
hyg <- uhscount
hyg$count <- seq(1,406)
hyg <- hyg[, c(36,1:35)]
hyg <- pivot_longer(hyg,-count, names_to = "date", values_to = "score",values_drop_na = TRUE)
library(janitor)
hyg$score %>% tabyl
#Reformat scores to 0 (scores 1/2) and 1 (scores 3 and 4 = dirty)
hyg$score[hyg$score<3] <- 0
hyg$score[hyg$score>2] <- 1
#Sample first, middle and last 10% of the herd -----
maxperdate <- hyg %>% group_by(date) %>% summarise(max=max(count))
hyg <- merge(hyg,maxperdate,by="date")
hyg$quant <- hyg$count/hyg$max
#First, middle and last 10% groups
hyg$group <- NA
hyg$group[hyg$quant < 0.10000001] <- "First 10%"
hyg$group[hyg$quant > 0.45 & hyg$quant < 0.55] <- "Middle 10%"
hyg$group[hyg$quant > 0.900000] <- "Last 10%"
#Split into 10% groups
#hyg$groupq <- NA
#hyg$groupq[hyg$quant < 0.10000001] <- 0.1
#hyg$groupq[hyg$quant >= 0.1 & hyg$quant < 0.2] <- .2
#hyg$groupq[hyg$quant >= 0.2 & hyg$quant < 0.3] <- .3
#hyg$groupq[hyg$quant >= 0.3 & hyg$quant < 0.4] <- .4
#hyg$groupq[hyg$quant >= 0.4 & hyg$quant < 0.5] <- .5
#hyg$groupq[hyg$quant >= 0.5 & hyg$quant < 0.6] <- .6
#hyg$groupq[hyg$quant >= 0.6 & hyg$quant < 0.7] <- .7
#hyg$groupq[hyg$quant >= 0.7 & hyg$quant < 0.8] <- .8
#hyg$groupq[hyg$quant >= 0.8 & hyg$quant < 0.9] <- .9
#hyg$groupq[hyg$quant >= 0.9 & hyg$quant] <- 1
#Split into 20% groups (quintiles)
hyg$groupq <- NA
hyg$groupq[hyg$quant >= 0 & hyg$quant < 0.2] <- .2
hyg$groupq[hyg$quant >= 0.2 & hyg$quant < 0.4] <- .4
hyg$groupq[hyg$quant >= 0.4 & hyg$quant < 0.6] <- .6
hyg$groupq[hyg$quant >= 0.6 & hyg$quant < 0.8] <- .8
hyg$groupq[hyg$quant >= 0.8 & hyg$quant <= 1] <- 1
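The quintile assignment above could be written more compactly with cut(); a sketch, where groupq2 is a hypothetical alternative column (note that cut() returns a factor rather than a numeric):
#Sketch: equivalent quintile grouping via cut() (returns a factor)
hyg$groupq2 <- cut(hyg$quant, breaks = c(0, 0.2, 0.4, 0.6, 0.8, 1),
                   labels = c(0.2, 0.4, 0.6, 0.8, 1),
                   right = FALSE, include.lowest = TRUE)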
#Create function to estimate pdirty using 100, 90, 80 --> 10% of data
hyg %>% filter(hyg$date==42389) %>% select(score) %>% sample_frac(size = 0.1) %>% summarise(mean=mean(score))
pdirt1 <- function(dat,prop){
hyg %>% filter(hyg$date==dat) %>% select(score) %>% sample_frac(size = prop,replace = FALSE) %>% summarise(mean=mean(score))
}
hyg %>% filter(hyg$date==42387 & group =="First 10%") %>% select(score) %>% summarise(mean=mean(score))
#Function for random selection of a given proportion of cows
pdirt1(dat=42389,prop=1)
#Function for first, middle and last 10%
pdirt3 <- function(dat,groupnam){
hyg %>% filter(hyg$date==dat & group ==groupnam) %>% select(score) %>% summarise(mean=mean(score))}
pdirt3(42387,groupnam ="First 10%")
#Function for milking-order quantile groups (here quintiles: .2, .4, .6, .8, 1)
pdirt4 <- function(dat,groupnam){
hyg %>% filter(hyg$date==dat & groupq ==groupnam) %>% select(score) %>% summarise(mean=mean(score))}
pdirt4(42387,groupnam=.2)
pdirt2 <- function(dat1){
p100 <- pdirt1(dat=dat1,prop=1)
p90 <- pdirt1(dat=dat1,prop=0.9)
p80 <- pdirt1(dat=dat1,prop=.8)
p70 <- pdirt1(dat=dat1,prop=.7)
p60 <- pdirt1(dat=dat1,prop=.6)
p50 <- pdirt1(dat=dat1,prop=.5)
p40 <- pdirt1(dat=dat1,prop=.4)
p30 <- pdirt1(dat=dat1,prop=.3)
p20 <- pdirt1(dat=dat1,prop=.2)
p10 <- pdirt1(dat=dat1,prop=.1)
p5 <- pdirt1(dat=dat1,prop=.05)
pfirst10 <- pdirt3(dat=dat1,groupnam="First 10%")
pmiddle10 <- pdirt3(dat=dat1,groupnam="Middle 10%")
plast10 <- pdirt3(dat=dat1,groupnam="Last 10%")
#pq10 <- pdirt4(dat=dat1,groupnam=0.1)
pq20 <- pdirt4(dat=dat1,groupnam=0.2)
#pq30 <- pdirt4(dat=dat1,groupnam=0.3)
pq40 <- pdirt4(dat=dat1,groupnam=0.4)
#pq50 <- pdirt4(dat=dat1,groupnam=0.5)
pq60 <- pdirt4(dat=dat1,groupnam=0.6)
#pq70 <- pdirt4(dat=dat1,groupnam=0.7)
pq80 <- pdirt4(dat=dat1,groupnam=0.8)
#pq90 <- pdirt4(dat=dat1,groupnam=0.9)
pq100 <- pdirt4(dat=dat1,groupnam=1)
#Differences between full dataset and sample
pd90 <- p90 - p100
pd80 <- p80 - p100
pd70 <- p70 - p100
pd60 <- p60 - p100
pd50 <- p50 - p100
pd40 <- p40 - p100
pd30 <- p30 - p100
pd20 <- p20 - p100
pd10 <- p10 - p100
pd5 <- p5 - p100
pqd100 <- pq100 - p100
#pqd90 <- pq90 - p100
pqd80 <- pq80 - p100
#pqd70 <- pq70 - p100
pqd60 <- pq60 - p100
#pqd50 <- pq50 - p100
pqd40 <- pq40 - p100
#pqd30 <- pq30 - p100
pqd20 <- pq20 - p100
#pqd10 <- pq10 - p100
pdfirst10 <- pfirst10 - p100
pdmiddle10 <- pmiddle10 - p100
pdlast10 <- plast10 - p100
tab <- rbind(p100,p90,p80,p70,p60,p50,p40,p30,p20,p10,p5,
pfirst10,pmiddle10,plast10,
pq20,pq40,pq60,pq80,pq100,
pd90,pd80,pd70,pd60,pd50,pd40,pd30,pd20,pd10,pd5,
pdfirst10,pdmiddle10,pdlast10,
pqd20,pqd40,pqd60,pqd80,pqd100)
tab$per <- c(1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.05,
"pfirst10","pmiddle10","plast10",
"pq20","pq40","pq60","pq80","pq100",
"90diff","80diff","70diff","60diff","50diff","40diff","30diff","20diff","10diff","5diff",
"first10diff","middle10diff","last10diff",
"pqd20","pqd40","pqd60","pqd80","pqd100")
tab$date <- dat1
tab
}
print(pdirt2(42387))
#Create list of dates
uhsdates <- unique(hyg$date)
#Loop through dates
#for (i in uhsdates) {
# print(pdirt2(i))
#}
#Combine into single dataframe
hygsum <- data.frame()
hygsum <- do.call(rbind.data.frame,
lapply(unique(uhsdates),
function(i){
pdirt2(i)}))
hygsum <- hygsum[, c(3,2,1)]
hygsum$prop <- hygsum$mean
hygsum$mean <- NULL
library(blandr)
library(DescTools)
library(stringr)
library(ggplot2)
#Analysis 1 - report differences and sd of differences ----
tabdiff <- hygsum %>% group_by(per) %>% summarise(mean=mean(prop), sd=sd(prop),hilow=Range(prop),
q5=quantile(probs=0.05,prop),q95=quantile(probs=0.95,prop))
tabdiff
tabdiffplot1 <- hygsum %>% filter(str_detect(per, "diff") & per != "first10diff" & per != "middle10diff" & per != "last10diff")
tabdiffplot1$per <- recode_factor(tabdiffplot1$per,
"90diff" = "90%",
"80diff" = "80%",
"70diff" = "70%",
"60diff" = "60%",
"50diff" = "50%",
"40diff" = "40%",
"30diff" = "30%",
"20diff" = "20%",
"10diff" = "10%",
"5diff" = "5%",
.ordered = TRUE)
plot1 <- ggplot(tabdiffplot1, aes(x=per, y=prop)) +
coord_flip(ylim = c(-.25,.25)) +
geom_boxplot(aes(fill=per)) +
geom_jitter(width = 0.1) +
theme(legend.position="none") +
labs(y = "Deviation from TRUE prevalence of dirty udders", x="Proportion of scores used") +
theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
geom_hline(yintercept=0, linetype="solid", color = "red") +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))
plot1
tiff("plot1.tiff", units="in", width=7, height=7, res=300)
plot1
dev.off()
tabdiff1 <-hygsum %>% filter(str_detect(per, "pqd"))
tabdiff1$per <- recode_factor(tabdiff1$per,
pqd100 = "5th quintile",
pqd80 = "4th quintile",
pqd60 = "3rd quintile",
pqd40 = "2nd quintile",
pqd20 = "1st quintile",
.ordered = TRUE)
plot2 <- ggplot(tabdiff1, aes(x=per, y=prop)) +
geom_boxplot(aes(fill=per)) +
geom_jitter(width = 0.1) +
coord_flip(ylim = c(-.25,.25)) +
theme(legend.position="none") +
labs(y = "Deviation from TRUE prevalence of dirty udders", x="Stage of milking") +
theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
geom_hline(yintercept=0, linetype="solid", color = "red") +
theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))
plot2
tiff("plot2.tiff", units="in", width=7, height=4, res=300)
plot2
dev.off()
#Analysis 2 - limits of agreement -----
difffun <- function(group){
a <- CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per==group])[5] %>% as.data.frame %>% select(blalt.delta) %>%
summarise(mean = mean(blalt.delta),
conf.low = mean(blalt.delta) - 1.96*sd(blalt.delta),
conf.high = mean(blalt.delta) + 1.96*sd(blalt.delta))
a$mean <- a$mean * -1
#Swap and negate the limits using temporaries (assigning sequentially would overwrite one bound mid-swap)
lo <- a$conf.low; hi <- a$conf.high
a$conf.low <- -hi
a$conf.high <- -lo
b <- paste0(sprintf("%.2f",round(a$mean, 2)), sep=""," (",
sprintf("%.2f",round(a$conf.low, 2)),", ",
sprintf("%.2f",round(a$conf.high, 2)),")")
b$per <- group
b
}
#Loop through proportions
proplist <- c(0.9,.8,.7,.6,.5,.4,.3,.2,.1,.05,
"pq20","pq40","pq60","pq80","pq100")
#for (i in proplist) {
# print(corfun(i))
#}
#Combine into single dataframe
mdifftab <- data.frame()
mdifftab <- do.call(rbind.data.frame,
lapply(proplist,
function(i){
difffun(i)}))
colnames(mdifftab) <- c("mdiff","per")
mdifftab
#Analysis 2 - CCC-----
CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per=="plast10"])
cccfun <- function(group){
tab2 <- CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per==group])[1] %>% as.data.frame %>% select('rho.c.est')
tab2$per <- group
tab2
}
cccfun(0.05)
#Loop through proportions
proplist <- c(0.9,.8,.7,.6,.5,.4,.3,.2,.1,.05,
"pq20","pq40","pq60","pq80","pq100")
#for (i in proplist) {
# print(corfun(i))
#}
#Combine into single dataframe
cortab <- data.frame()
cortab <- do.call(rbind.data.frame,
lapply(proplist,
function(i){
cccfun(i)}))
cortab$R <- cortab$rho.c.est
cortab$rho.c.est <- NULL
cortab