Acknowledgements

Co-authors

Bill Tranter Richard Laven

Others

Richard Maclehose Dennis Byrnes and the staff at Lakeside dairy Anonymous peer reviewers

Download all datasets HERE




library(knitr)
## Warning: package 'knitr' was built under R version 4.0.2
library(readr)



knitr::opts_chunk$set(echo=TRUE, warning=FALSE, message=FALSE)

load(file="UHS-CM.Rdata")
load(file="uhsdata.Rdata")
load(file="UHS-CM-Cow.Rdata")

Demonstrate udder hygiene data collection

The video below shows an example of the video footage that was used (viewed at 16x) to conduct udder hygiene scores

Evaluate data

Show data

subdays <- subdays[order(subdays$cow),]
head(subdays,20)

Summary of dataset

print(summarytools::dfSummary(subdays,valid.col=FALSE, graph.col=F, style="grid"))
## Data Frame Summary  
## subdays  
## Dimensions: 31383 x 26  
## Duplicates: 28  
## 
## +----+---------------------+----------------------------+---------------------+----------+
## | No | Variable            | Stats / Values             | Freqs (% of Valid)  | Missing  |
## +====+=====================+============================+=====================+==========+
## | 1  | StudyDay            | Mean (sd) : 43.1 (25.5)    | 89 distinct values  | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 1 < 42 < 89                |                     |          |
## |    |                     | IQR (CV) : 44 (0.6)        |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 2  | Cow Number          | 1. 25                      |    92 ( 0.3%)       | 0        |
## |    | [character]         | 2. 110                     |    89 ( 0.3%)       | (0%)     |
## |    |                     | 3. 1104                    |    89 ( 0.3%)       |          |
## |    |                     | 4. 1111                    |    89 ( 0.3%)       |          |
## |    |                     | 5. 1116                    |    89 ( 0.3%)       |          |
## |    |                     | 6. 1128                    |    89 ( 0.3%)       |          |
## |    |                     | 7. 1129                    |    89 ( 0.3%)       |          |
## |    |                     | 8. 113                     |    89 ( 0.3%)       |          |
## |    |                     | 9. 1148                    |    89 ( 0.3%)       |          |
## |    |                     | 10. 115                    |    89 ( 0.3%)       |          |
## |    |                     | [ 494 others ]             | 30490 (97.2%)       |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 3  | Tx                  | Min  : 0                   | 0 : 16266 (51.8%)   | 0        |
## |    | [numeric]           | Mean : 0.5                 | 1 : 15117 (48.2%)   | (0%)     |
## |    |                     | Max  : 1                   |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 4  | CM                  | Min  : 0                   | 0 : 27993 (89.2%)   | 0        |
## |    | [numeric]           | Mean : 0.1                 | 1 :  3390 (10.8%)   | (0%)     |
## |    |                     | Max  : 1                   |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 5  | Daysatrisk          | Mean (sd) : 79.3 (17.6)    | 56 distinct values  | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 4 < 89 < 89                |                     |          |
## |    |                     | IQR (CV) : 13 (0.2)        |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 6  | Age                 | Mean (sd) : 50.6 (20.4)    | 412 distinct values | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 11.3 < 45 < 117.4          |                     |          |
## |    |                     | IQR (CV) : 28.5 (0.4)      |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 7  | Parity              | 1. 1                       | 11226 (35.8%)       | 0        |
## |    | [factor]            | 2. 2                       |  7916 (25.2%)       | (0%)     |
## |    |                     | 3. 3                       |  6215 (19.8%)       |          |
## |    |                     | 4. 4                       |  6026 (19.2%)       |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 8  | DIM at enrolment    | Mean (sd) : 124.7 (102)    | 209 distinct values | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 0 < 120 < 421              |                     |          |
## |    |                     | IQR (CV) : 171 (0.8)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 9  | Enrolment date      | min : 2016-01-16           | 49 distinct values  | 0        |
## |    | [Date]              | med : 2016-01-16           |                     | (0%)     |
## |    |                     | max : 2016-04-10           |                     |          |
## |    |                     | range : 2m 25d             |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 10 | Exit date           | min : 2016-01-17           | 62 distinct values  | 0        |
## |    | [Date]              | med : 2016-04-14           |                     | (0%)     |
## |    |                     | max : 2016-04-14           |                     |          |
## |    |                     | range : 2m 28d             |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 11 | BirthDate           | min : 2006-04-08           | 390 distinct values | 0        |
## |    | [POSIXct, POSIXt]   | med : 2012-04-24           |                     | (0%)     |
## |    |                     | max : 2015-02-05           |                     |          |
## |    |                     | range : 8y 9m 28d          |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 12 | CalveDate           | min : 2014-11-21           | 257 distinct values | 0        |
## |    | [Date]              | med : 2015-09-18           |                     | (0%)     |
## |    |                     | max : 2016-04-10           |                     |          |
## |    |                     | range : 1y 4m 20d          |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 13 | Clinical case date  | min : 2016-01-17           | 59 distinct values  | 27993    |
## |    | [Date]              | med : 2016-03-08           |                     | (89.2%)  |
## |    |                     | max : 2016-04-14           |                     |          |
## |    |                     | range : 2m 28d             |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 14 | CMDIM               | Mean (sd) : 161.4 (95.1)   | 89 distinct values  | 28007    |
## |    | [numeric]           | min < med < max:           |                     | (89.24%) |
## |    |                     | 0 < 148 < 348              |                     |          |
## |    |                     | IQR (CV) : 169 (0.6)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 15 | pdirty              | Mean (sd) : 3.9 (1.2)      | 89 distinct values  | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 1.3 < 3.8 < 6.8            |                     |          |
## |    |                     | IQR (CV) : 1.9 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 16 | Date                | min : 2016-01-16           | 89 distinct values  | 0        |
## |    | [Date]              | med : 2016-02-26           |                     | (0%)     |
## |    |                     | max : 2016-04-13           |                     |          |
## |    |                     | range : 2m 28d             |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 17 | pdirty_1            | Mean (sd) : 3.9 (1.2)      | 88 distinct values  | 381      |
## |    | [numeric]           | min < med < max:           |                     | (1.21%)  |
## |    |                     | 1.3 < 3.8 < 6.8            |                     |          |
## |    |                     | IQR (CV) : 1.9 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 18 | pdirty_2            | Mean (sd) : 3.9 (1.2)      | 87 distinct values  | 762      |
## |    | [numeric]           | min < med < max:           |                     | (2.43%)  |
## |    |                     | 1.3 < 3.8 < 6.8            |                     |          |
## |    |                     | IQR (CV) : 1.9 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 19 | pdirty_3            | Mean (sd) : 3.9 (1.2)      | 86 distinct values  | 1143     |
## |    | [numeric]           | min < med < max:           |                     | (3.64%)  |
## |    |                     | 1.3 < 3.9 < 6.8            |                     |          |
## |    |                     | IQR (CV) : 2.1 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 20 | pdirty_4            | Mean (sd) : 3.9 (1.2)      | 85 distinct values  | 1524     |
## |    | [numeric]           | min < med < max:           |                     | (4.86%)  |
## |    |                     | 1.3 < 3.9 < 6.8            |                     |          |
## |    |                     | IQR (CV) : 2.1 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 21 | DIM                 | Mean (sd) : 162 (105.4)    | 459 distinct values | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 0 < 157 < 458              |                     |          |
## |    |                     | IQR (CV) : 169 (0.7)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 22 | cow                 | Mean (sd) : 1324.3 (1237)  | 504 distinct values | 0        |
## |    | [numeric]           | min < med < max:           |                     | (0%)     |
## |    |                     | 1 < 1316 < 9688            |                     |          |
## |    |                     | IQR (CV) : 471 (0.9)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 23 | CMD                 | Min  : 0                   | 0 : 31266 (99.6%)   | 0        |
## |    | [numeric]           | Mean : 0                   | 1 :   117 ( 0.4%)   | (0%)     |
## |    |                     | Max  : 1                   |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 24 | cowday              | 1 distinct value           | 1 : 31383 (100.0%)  | 0        |
## |    | [numeric]           |                            |                     | (0%)     |
## +----+---------------------+----------------------------+---------------------+----------+
## | 25 | pdirty3_1           | Mean (sd) : 3.9 (1.2)      | 86 distinct values  | 1143     |
## |    | [numeric]           | min < med < max:           |                     | (3.64%)  |
## |    |                     | 1.7 < 3.9 < 6.2            |                     |          |
## |    |                     | IQR (CV) : 1.9 (0.3)       |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
## | 26 | cmhx                | Min  : 0                   | 0 : 13510 (43.0%)   | 0        |
## |    | [numeric]           | Mean : 0.6                 | 1 : 17873 (57.0%)   | (0%)     |
## |    |                     | Max  : 1                   |                     |          |
## +----+---------------------+----------------------------+---------------------+----------+
#print(summarytools::dfSummary(subdays,valid.col=FALSE, graph.magnif=0.8, style="grid"), method = "render")



#Describe subjects in study

library(dplyr)

colnames(Enrollcow)
##  [1] "Cow Number"         "Mutli lact"         "Tx"                
##  [4] "CM"                 "CMCode"             "Daysatrisk"        
##  [7] "Age"                "Parity"             "DIM at enrolment"  
## [10] "Enrolment date"     "Exit date"          "BirthDate"         
## [13] "CalveDate"          "Clinical case date" "Pathogen"          
## [16] "CMDIM"              "calveposten"        "dayin"             
## [19] "dayout"             "Delete"
library(table1)
table1(~ Age + factor(Parity) + `DIM at enrolment` + factor(Tx) + factor(CM), data=Enrollcow)
Overall
(N=504)
Age
Mean (SD) 52.1 (21.4)
Median [Min, Max] 46.8 [11.3, 117]
factor(Parity)
1 170 (33.7%)
2 121 (24.0%)
3 106 (21.0%)
4 52 (10.3%)
5 29 (5.8%)
6 18 (3.6%)
7 7 (1.4%)
8 1 (0.2%)
DIM at enrolment
Mean (SD) 124 (112)
Median [Min, Max] 107 [2.00, 421]
factor(Tx)
0 251 (49.8%)
1 253 (50.2%)
factor(CM)
0 388 (77.0%)
1 116 (23.0%)

More descriptive statistics

hist(subdays$pdirty_1)

Analysis

Section 1: Relationship between udder hygiene score and clinical mastitis

library(geepack)
library(emmeans)
library(ggplot2)
subdays <- subdays[order(subdays$cow),]

#Using GEE
#GEE with clustering within Study Day. I have no reason to expect that there is a pattern of covariance of cows within study days. Therefore, I will choose independence. 
subdays <- subdays[order(subdays$StudyDay),]

#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay)
## 
##  Coefficients:
##              Estimate   Std.err    Wald Pr(>|W|)    
## (Intercept) -7.107055  0.396936 320.581  < 2e-16 ***
## pdirty_1     0.340176  0.064194  28.082 1.16e-07 ***
## Parity2      0.285218  0.252675   1.274  0.25899    
## Parity3      0.486700  0.306613   2.520  0.11243    
## Parity4      0.653415  0.283786   5.301  0.02131 *  
## DIM         -0.003389  0.001078   9.882  0.00167 ** 
## Tx           0.516520  0.202302   6.519  0.01067 *  
## cmhx        -0.017775  0.208664   0.007  0.93211    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)   0.9521   1.744
## Number of clusters:   88  Maximum cluster size: 391
#Model 2 (using UHS from 2 days prior to day at risk)
m2 <- geeglm(CMD ~ pdirty_2 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_2)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m2)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_2 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_2)), id = StudyDay)
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -7.13529  0.41523 295.29  < 2e-16 ***
## pdirty_2     0.35822  0.06179  33.61  6.7e-09 ***
## Parity2      0.25610  0.25483   1.01   0.3149    
## Parity3      0.46238  0.31177   2.20   0.1380    
## Parity4      0.67708  0.28510   5.64   0.0176 *  
## DIM         -0.00352  0.00110  10.28   0.0013 ** 
## Tx           0.48942  0.20281   5.82   0.0158 *  
## cmhx        -0.04752  0.20977   0.05   0.8208    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.958    1.93
## Number of clusters:   87  Maximum cluster size: 391
#Model 3 (using UHS from 3 days prior to day at risk)
m3 <- geeglm(CMD ~ pdirty_3 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_3)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m3)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_3 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_3)), id = StudyDay)
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -7.01383  0.43631 258.42  < 2e-16 ***
## pdirty_3     0.33268  0.06686  24.76  6.5e-07 ***
## Parity2      0.25549  0.25479   1.01   0.3160    
## Parity3      0.45997  0.31182   2.18   0.1402    
## Parity4      0.67599  0.28530   5.61   0.0178 *  
## DIM         -0.00352  0.00110  10.24   0.0014 ** 
## Tx           0.49189  0.20255   5.90   0.0152 *  
## cmhx        -0.04222  0.20992   0.04   0.8406    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)     0.97    1.96
## Number of clusters:   86  Maximum cluster size: 391
tab <- rbind(tidy(m1,conf.int=T)[2,],
      tidy(m2,conf.int=T)[2,],
      tidy(m3,conf.int=T)[2,]) %>% select(term,estimate,std.error,conf.low, conf.high)

tab1 <- rbind(tidy(m1,conf.int=T,exp=T)[2,],
      tidy(m2,conf.int=T,exp=T)[2,],
      tidy(m3,conf.int=T,exp=T)[2,]) %>% select(term,estimate,std.error,conf.low, conf.high)

tab1$RR <- paste0(round(tab1$estimate,1)," (",round(tab1$conf.low,1),", ",round(tab1$conf.high,1),")")
tab1 <- tab1 %>% select(term,RR)
tab <- merge(tab,tab1,by="term")
tab %>% select(term,estimate,std.error,RR)

Given that estimates are very similar when using UHS from 1-3 days prior to day at risk, I will only create a plot for UHS 1 day before day at risk.

emx <- emmeans(m1,~pdirty_1,type="response",at = list(pdirty_1=c(0:6))) %>% tidy
emx$pdirty <- emx$pdirty_1
emx$rate <- emx$rate * 3000
emx$asymp.LCL <- emx$asymp.LCL * 3000
emx$asymp.UCL <- emx$asymp.UCL * 3000
emx$pdirty <- emx$pdirty / 10
emx$std.error <- NULL
emx$pdirty_1 <- NULL
emx$df <- NULL
emx[, c(4,1,2,3)]
curve1 <- ggplot(emx) + coord_cartesian(xlim = c(0, 0.6),ylim = (c(0,25))) + aes(pdirty,rate) + geom_smooth(size=1.2, se=F) + geom_ribbon(aes(ymin=asymp.LCL, ymax=asymp.UCL),linetype=0,alpha=0.2) + labs(x = "Herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", y = "Mastitis cases per 100 cow-months at risk") + theme(panel.border = element_rect(colour = "black", fill=NA, size=1),axis.text=element_text(size=12,family="Times"),axis.title=element_text(size=14,face="bold",family="Times"),panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),panel.grid.major = element_line(size = 0.02, linetype = 'solid', colour = "grey"), panel.grid.minor = element_line(size = 0.02, linetype = 'solid',colour = "grey"),legend.position="bottom",legend.text = element_text(colour="black", size=12,face="bold",family="Times")) + scale_color_brewer(palette="Set1") 

curve1

#tiff("curve1.tiff", units="in", width=7, height=7, res=300)
#curve1
#dev.off()

Sensitivity analysis - use of quadratic term

#Quadratic term
mm <- geeglm(CMD ~ pdirty_1 + I(pdirty_1^2) + Parity + DIM + Tx + cmhx, 
             data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay) 
summary(mm)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + I(pdirty_1^2) + Parity + DIM + 
##     Tx + cmhx, family = poisson, data = subdays %>% filter(!is.na(pdirty_1)), 
##     id = StudyDay)
## 
##  Coefficients:
##               Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept)   -8.84288  0.92082 92.22   <2e-16 ***
## pdirty_1       1.19281  0.41937  8.09   0.0045 ** 
## I(pdirty_1^2) -0.09726  0.04729  4.23   0.0397 *  
## Parity2        0.28219  0.25305  1.24   0.2648    
## Parity3        0.48389  0.30675  2.49   0.1147    
## Parity4        0.65406  0.28388  5.31   0.0212 *  
## DIM           -0.00338  0.00108  9.88   0.0017 ** 
## Tx             0.51540  0.20236  6.49   0.0109 *  
## cmhx          -0.01512  0.20890  0.01   0.9423    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.941    1.77
## Number of clusters:   88  Maximum cluster size: 391
#Cubic term
m2 <- geeglm(CMD ~ pdirty_1 + I(pdirty_1^2) + I(pdirty_1^3) +
               Parity + DIM + Tx + cmhx, 
             data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m2)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + I(pdirty_1^2) + I(pdirty_1^3) + 
##     Parity + DIM + Tx + cmhx, family = poisson, data = subdays %>% 
##     filter(!is.na(pdirty_1)), id = StudyDay)
## 
##  Coefficients:
##                Estimate   Std.err  Wald Pr(>|W|)    
## (Intercept)   -12.41900   2.64145 22.10  2.6e-06 ***
## pdirty_1        3.96368   1.92315  4.25   0.0393 *  
## I(pdirty_1^2)  -0.76862   0.45042  2.91   0.0879 .  
## I(pdirty_1^3)   0.05131   0.03343  2.36   0.1248    
## Parity2         0.28142   0.25276  1.24   0.2655    
## Parity3         0.48312   0.30670  2.48   0.1152    
## Parity4         0.65365   0.28401  5.30   0.0214 *  
## DIM            -0.00338   0.00108  9.88   0.0017 ** 
## Tx              0.51552   0.20241  6.49   0.0109 *  
## cmhx           -0.01294   0.20898  0.00   0.9506    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.931    1.71
## Number of clusters:   88  Maximum cluster size: 391
atx <- unique(subdays$pdirty_1)

emx <- emmeans(m1,~pdirty_1,type="response",at = list(pdirty_1=atx)) %>% tidy
emx$pdirty <- emx$pdirty_1
emx$rate <- emx$rate
emx$asymp.LCL <- emx$asymp.LCL
emx$asymp.UCL <- emx$asymp.UCL
emx$pdirty <- emx$pdirty
emx$std.error <- NULL
emx$pdirty_1 <- NULL
emx$df <- NULL
emx[, c(4,1,2,3)]
emx$asymp.LCL <- NULL
emx$asymp.UCL <- NULL
emx$curve <- "Main Model"

emx4 <- emmeans(mm,~pdirty_1,type="response",at = list(pdirty_1=atx)) %>% tidy
emx4$pdirty <- emx4$pdirty_1
emx4$rate <- emx4$rate
emx4$asymp.LCL <- emx4$asymp.LCL
emx4$asymp.UCL <- emx4$asymp.UCL
emx4$pdirty <- emx4$pdirty
emx4$std.error <- NULL
emx4$pdirty_1 <- NULL
emx4$df <- NULL
emx4[, c(4,1,2,3)]
emx4$asymp.LCL <- NULL
emx4$asymp.UCL <- NULL
emx4$curve <- "Model with quadratic term"

emx <- rbind(emx,emx4)

smooth_vals = predict(loess(as.integer(CMD)~pdirty_1, data=subdays, span=0.4), subdays$pdirty_1)
subdays$rate <- smooth_vals
emx2 <- subdays %>% select(pdirty_1,rate)
emx2$pdirty <- emx2$pdirty_1
emx2$pdirty_1 <- NULL
emx2$curve <- "Raw data with loess smoother"

emx3 <- rbind(emx,emx2)

curve1 <- ggplot(emx3) + aes(pdirty,rate,color=curve) + geom_line(size=1.2) + 
  coord_cartesian(xlim = c(1, 6)) +
  labs(x = "Herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", y = "Mastitis cases per day-day at risk") + 
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),axis.text=element_text(size=12,family="Times"),axis.title=element_text(size=14,face="bold",family="Times"),panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),panel.grid.major = element_line(size = 0.02, linetype = 'solid', colour = "grey"), panel.grid.minor = element_line(size = 0.02, linetype = 'solid',colour = "grey"),legend.position="bottom",legend.text = element_text(colour="black", size=12,face="bold",family="Times")) + scale_color_brewer(palette="Set1") 


curve1

Check for predicted probabilities outside the range of 0 to 1

subdays$predict <- predict(m1,subdays,type="response")
hist(subdays$predict)

range(subdays$predict, na.rm=T)
## [1] 0.000272 0.026380

All predicted probabilities were between 0 and 1 for each row in the current dataset.

Sensitivity analysis - use compound symmetry (exchangeable) covariance structure

subdays <- subdays[order(subdays$cow),]
subdays <- subdays[order(subdays$StudyDay),]

#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_1)), 
             corstr = "exchangeable",
             family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay, 
##     corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -7.10692  0.39636 321.50   <2e-16 ***
## pdirty_1     0.34000  0.06384  28.37    1e-07 ***
## Parity2      0.28676  0.25266   1.29   0.2564    
## Parity3      0.48705  0.30655   2.52   0.1121    
## Parity4      0.65377  0.28360   5.31   0.0212 *  
## DIM         -0.00339  0.00108   9.90   0.0017 ** 
## Tx           0.51667  0.20216   6.53   0.0106 *  
## cmhx        -0.01625  0.20854   0.01   0.9379    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.951    1.74
##   Link = identity 
## 
## Estimated Correlation Parameters:
##        Estimate Std.err
## alpha -0.000236 0.00061
## Number of clusters:   88  Maximum cluster size: 391
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_1)), 
             corstr = "ar1",
             family=poisson, id=StudyDay) # %>% tidy(exp=T,conf.int=T)
summary(m1)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_1)), id = StudyDay, 
##     corstr = "ar1")
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -7.10594  0.39701 320.36  < 2e-16 ***
## pdirty_1     0.34009  0.06412  28.13  1.1e-07 ***
## Parity2      0.28399  0.25305   1.26   0.2617    
## Parity3      0.48930  0.30653   2.55   0.1104    
## Parity4      0.65690  0.28328   5.38   0.0204 *  
## DIM         -0.00340  0.00108   9.94   0.0016 ** 
## Tx           0.51832  0.20228   6.57   0.0104 *  
## cmhx        -0.01946  0.20859   0.01   0.9257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = ar1 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.953    1.75
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha -0.00415 0.00793
## Number of clusters:   88  Maximum cluster size: 391

Sensitivity analysis 2 - control for clustering of cow-days within cows.

subdays <- subdays[order(subdays$cow),]

#Model 1 (using UHS from 1 day prior to day at risk)
m1 <- geeglm(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, 
              data = subdays %>% filter(!is.na(pdirty_1)), family=poisson, id=cow) # %>% tidy(exp=T,conf.int=T)
summary(m1)
## 
## Call:
## geeglm(formula = CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx, family = poisson, 
##     data = subdays %>% filter(!is.na(pdirty_1)), id = cow)
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -7.10705  0.40598 306.45   <2e-16 ***
## pdirty_1     0.34018  0.06856  24.62    7e-07 ***
## Parity2      0.28522  0.26973   1.12   0.2903    
## Parity3      0.48670  0.28823   2.85   0.0913 .  
## Parity4      0.65341  0.30631   4.55   0.0329 *  
## DIM         -0.00339  0.00110   9.46   0.0021 ** 
## Tx           0.51652  0.19722   6.86   0.0088 ** 
## cmhx        -0.01778  0.22383   0.01   0.9367    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)    0.952    1.91
## Number of clusters:   504  Maximum cluster size: 91

Results are very similar.

Sensitivity analysis 3. Will replicate analysis for model 1 using a generalized linear mixed model.

library(lme4)

#Using Poisson - convergence issues, but estimates are very similar to GEE
glmm <- glmer(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | cow) + (1 | Date), 
              data = subdays %>% filter(!is.na(pdirty_1)), family=poisson)
summary(glmm)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | cow) + (1 |  
##     Date)
##    Data: subdays %>% filter(!is.na(pdirty_1))
## 
##      AIC      BIC   logLik deviance df.resid 
##     1466     1550     -723     1446    30992 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -0.409 -0.037 -0.027 -0.021 16.511 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  cow    (Intercept) 4.8393   2.200   
##  Date   (Intercept) 0.0618   0.249   
## Number of obs: 31002, groups:  cow, 504; Date, 88
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.51181    0.63873  -13.33  < 2e-16 ***
## pdirty_1     0.37564    0.08403    4.47  7.8e-06 ***
## Parity2      0.34863    0.43613    0.80   0.4241    
## Parity3      0.58890    0.46408    1.27   0.2045    
## Parity4      0.77803    0.50606    1.54   0.1242    
## DIM         -0.00392    0.00140   -2.79   0.0052 ** 
## Tx           0.76994    0.30523    2.52   0.0117 *  
## cmhx        -0.08608    0.38246   -0.23   0.8219    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) pdrt_1 Party2 Party3 Party4 DIM    Tx    
## pdirty_1 -0.665                                          
## Parity2  -0.278  0.002                                   
## Parity3  -0.257  0.014  0.522                            
## Parity4  -0.201  0.012  0.534  0.558                     
## DIM      -0.292  0.009  0.060  0.092  0.002              
## Tx       -0.325  0.021 -0.002 -0.053 -0.030 -0.006       
## cmhx     -0.041  0.000 -0.353 -0.434 -0.548 -0.222  0.051
## convergence code: 0
## Model failed to converge with max|grad| = 0.686311 (tol = 0.002, component 1)
## Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
#Using Logistic - also has convergence issues. Similar results to GEE.
glmm <- glmer(CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | Date), 
              data = subdays %>% filter(!is.na(pdirty_1)), family=binomial)
summary(glmm)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: CMD ~ pdirty_1 + Parity + DIM + Tx + cmhx + (1 | Date)
##    Data: subdays %>% filter(!is.na(pdirty_1))
## 
##      AIC      BIC   logLik deviance df.resid 
##     1508     1583     -745     1490    30993 
## 
## Scaled residuals: 
##    Min     1Q Median     3Q    Max 
## -0.164 -0.070 -0.055 -0.044 26.860 
## 
## Random effects:
##  Groups Name        Variance Std.Dev.
##  Date   (Intercept) 9.86e-13 9.93e-07
## Number of obs: 31002, groups:  Date, 88
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.110734   0.417720  -17.02  < 2e-16 ***
## pdirty_1     0.342027   0.075765    4.51  6.4e-06 ***
## Parity2      0.286560   0.269629    1.06  0.28788    
## Parity3      0.489284   0.285193    1.72  0.08623 .  
## Parity4      0.656937   0.309260    2.12  0.03365 *  
## DIM         -0.003406   0.000971   -3.51  0.00045 ***
## Tx           0.519173   0.191286    2.71  0.00665 ** 
## cmhx        -0.017962   0.228627   -0.08  0.93738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) pdrt_1 Party2 Party3 Party4 DIM    Tx    
## pdirty_1 -0.795                                          
## Parity2  -0.289 -0.011                                   
## Parity3  -0.261 -0.004  0.547                            
## Parity4  -0.204 -0.005  0.537  0.580                     
## DIM      -0.280 -0.002  0.131  0.154  0.054              
## Tx       -0.280 -0.008 -0.027 -0.055 -0.015 -0.005       
## cmhx     -0.070 -0.007 -0.270 -0.395 -0.520 -0.214  0.069
## convergence code: 0
## boundary (singular) fit: see ?isSingular

Section 2: Investigating the reliability of UHS assessment when using a subset of the herd

Setup of dataset

set.seed(99)
#Create dataset of UHS to test necessary sample size
library(tidyr)
hyg <- uhscount
hyg$count <- seq(1,406)
hyg <- hyg[, c(36,1:35)]
hyg <- pivot_longer(hyg,-count, names_to = "date", values_to = "score",values_drop_na = TRUE)

library(janitor)
hyg$score %>% tabyl

#Reformat scores to 0 (scores 1/2) and 1 (scores 3 and 4 = dirty)
hyg$score[hyg$score<3] <- 0 
hyg$score[hyg$score>2] <- 1

maxperdate <- hyg %>% group_by(date) %>% summarise(max=max(count))
hyg <- merge(hyg,maxperdate,by="date")
hyg$quant <- hyg$count/hyg$max

#Create function to estimate pdirty when selecting a subset of the herd.
pdirt1 <- function(dat,samp.size){
  select.date <- hyg %>% filter(hyg$date==dat) %>% select(score)
  counts.today <- sum(!is.na(select.date$score))
  sample.size <- min(counts.today,samp.size)
  sample.scores <- sample(select.date$score,size = sample.size,replace = FALSE)
  mean(sample.scores, na.rm=T)
}


#Function for random selection of a given proportion of cows
pdirt1(dat=42389,samp.size=1)


hyg <- hyg[order(hyg$count),]

hyg$middle80.criteria.min <- 0.5 - (1/hyg$max*40)
hyg$middle80.criteria.max <- 0.5 + (1/hyg$max*40)
hyg$middle80 <- 0
hyg$middle80[hyg$quant > hyg$middle80.criteria.min & hyg$quant < hyg$middle80.criteria.max] <- 1

hyg %>% filter(middle80==1) %>% 
  group_by(date) %>%
  summarise(
    middle80 = n())


last80 <- hyg %>% 
  group_by(date) %>% 
  slice(tail(row_number(), 80))

last80$last80 <- 1

first80 <- hyg %>% 
  group_by(date) %>% 
  slice(head(row_number(), 80))
first80$first80 <- 1

hyg <- merge(hyg,first80,all=T)
hyg$first80[is.na(hyg$first80)] <- 0

hyg <- merge(hyg,last80,all=T)
hyg$last80[is.na(hyg$last80)] <- 0

pdirt.first80 <- function(dat){
  hyg %>% filter(hyg$date==dat & first80==1) %>% select(score) %>% summarise(mean=mean(score))}

pdirt.middle80 <- function(dat){
  hyg %>% filter(hyg$date==dat & middle80==1) %>% select(score) %>% summarise(mean=mean(score))}

pdirt.last80 <- function(dat){
  hyg %>% filter(hyg$date==dat & last80==1) %>% select(score) %>% summarise(mean=mean(score))}

#Create function to estimate pdirty when selecting a subset of the herd.

pdirt2 <- function(dat1){
  pfull <- pdirt1(dat=dat1,samp.size=1000)
  p350 <- pdirt1(dat=dat1,samp.size=350)
  p300 <- pdirt1(dat=dat1,samp.size=300)
  p250 <- pdirt1(dat=dat1,samp.size=250)
  p200 <- pdirt1(dat=dat1,samp.size=200)
  p150<- pdirt1(dat=dat1,samp.size=150)
  p100 <- pdirt1(dat=dat1,samp.size=100)
  p80 <- pdirt1(dat=dat1,samp.size=80)
  p60 <- pdirt1(dat=dat1,samp.size=60)
  p50 <- pdirt1(dat=dat1,samp.size=50)
  p40 <- pdirt1(dat=dat1,samp.size=40)
  p30 <- pdirt1(dat=dat1,samp.size=30)
  p20 <- pdirt1(dat=dat1,samp.size=20)
  p10 <- pdirt1(dat=dat1,samp.size=10)
  
pfirst80 <- pdirt.first80(dat=dat1)
pmiddle80 <- pdirt.middle80(dat=dat1)
plast80 <- pdirt.last80(dat=dat1)

  #Differences between full dataset and sample
  pd350 <- p350 - pfull
  pd300 <- p300 - pfull
  pd250 <- p250 - pfull
  pd200 <- p200 - pfull
  pd150 <- p150 - pfull
  pd100 <- p100 - pfull
  pd80 <- p80 - pfull
  pd60 <- p60 - pfull
  pd50 <- p50 - pfull
  pd40 <- p40 - pfull
  pd30 <- p30 - pfull
  pd20 <- p20 - pfull
  pd10 <- p10 - pfull
  
  pdfirst80 <- pfirst80 - pfull
  pdmiddle80 <- pmiddle80 - pfull
  pdlast80 <- plast80 - pfull
  
  tab <- as.data.frame(rbind(pfull,p350,p300,p250,p200,p150,p100,p80,p60,p50,p40,p30,p20,p10,
                             pfirst80,pmiddle80,plast80,
               pd350,pd300,pd250,pd200,pd150,pd100,pd80,pd60,pd50,pd40,pd30,pd20,pd10,
               pdfirst80,pdmiddle80,pdlast80))
  tab$measure <- c("all cows",350,300,250,200,150,100,80,60,50,40,30,20,10,
                   "first80","middle80","last80",
                   "350diff","300diff","250diff","200diff","150diff","100diff","80diff","60diff",
                   "50diff","40diff","30diff","20diff","10diff",
                   "First80diff","Middle80diff","Last80diff")
  tab$date <- dat1
  tab
}

#pdirt2(42387)

#print(pdirt2(42387))

#Create list of dates 
uhsdates <- unique(hyg$date)

#Combine into single dataframe
hygsum <- data.frame()
hygsum <- do.call(rbind.data.frame,
                  lapply(unique(uhsdates),
                         function(i){
                           pdirt2(i)}))


hygsum <- hygsum[, c(3,2,1)]
hygsum$prop <- hygsum$mean
hygsum$mean <- NULL

Using differences

library(blandr)
library(DescTools)
library(stringr)
library(ggplot2)

?CCC
#Analysis 1 - report differences and sd of differences ----
tabdiff <- hygsum %>% group_by(measure) %>% summarise(mean=mean(prop), sd=sd(prop),hilow=Range(prop), 
                                                  q5=quantile(probs=0.05,prop),q95=quantile(probs=0.95,prop))
tabdiff
tabdiffplot1 <- hygsum %>% filter(str_detect(measure, "diff")
                                  & measure != "First80diff" & measure != "Middle80diff" & measure != "Last80diff")

tabdiffplot1$measure <- recode_factor(tabdiffplot1$measure,
                                      "350diff" = "350",
                                      "300diff" = "300",
                                      "250diff" = "250",
                                      "200diff" = "200",
                                      "150diff" = "150",
                                      "100diff" = "100",
                                      "80diff" = "80",
                                      "60diff" = "60",
                                      "50diff" = "50",
                                      "40diff" = "40",
                                      "30diff" = "30",
                                      "20diff" = "20",
                                      "10diff" = "10",
                   .ordered = TRUE)

plot1 <- ggplot(tabdiffplot1, aes(x=measure, y=prop)) +
  coord_flip(ylim = c(-.25,.25)) +
  geom_boxplot(aes(fill=measure)) +
  geom_jitter(width = 0.1) +
  theme(legend.position="none") +
  labs(y = "Deviation from TRUE herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", x="Number of cows scored") +
  theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
  theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
  theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) + 
  geom_hline(yintercept=0, linetype="solid", color = "red") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))

plot1

tiff("newplot1.tiff", units="in", width=7, height=7, res=300)
plot1
dev.off()
## quartz_off_screen 
##                 2
tabdiff1 <-hygsum %>% filter(measure == "First80diff" | measure == "Middle80diff" | measure == "Last80diff")

tabdiff1$measure <- recode_factor(tabdiff1$measure,
                   Last80diff = "Last 80",
                   Middle80diff = "Middle 80",
                   First80diff = "First 80",

                   .ordered = TRUE)

plot2 <- ggplot(tabdiff1, aes(x=measure, y=prop)) +
  geom_boxplot(aes(fill=measure)) + 
  geom_jitter(width = 0.1) + 
  coord_flip(ylim = c(-.25,.25)) +
  theme(legend.position="none") +
  labs(y = "Deviation from TRUE herd udder hygiene \n(Proportion of herd with udder hygiene scores ≥ 3/4)", x="Stage of milking") +
  theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
  theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
  theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
  geom_hline(yintercept=0, linetype="solid", color = "red") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))

plot2

tiff("newplot2.tiff", units="in", width=7, height=4, res=300)
plot2
dev.off()
## quartz_off_screen 
##                 2
#Analysis 2 - limits of agreement -----
#CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==10])

#?CCC
difffun <- function(group){
a <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group])[5] %>% as.data.frame %>% select(blalt.delta) %>%
  summarise(mean = -mean(blalt.delta),
            conf.low = -mean(blalt.delta) - 2.58*sd(blalt.delta),
            conf.high = -mean(blalt.delta) + 2.58*sd(blalt.delta))

b <- paste0(sprintf("%.2f",round(a$mean, 2)), sep=""," (",
       sprintf("%.2f",round(a$conf.low, 2)),", ",
       sprintf("%.2f",round(a$conf.high, 2)),")")

as.data.frame(cbind(b,group))

}

#difffun(350)
#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
              "first80","middle80","last80")

#for (i in proplist) {
#  print(corfun(i))
#}

#Combine into single dataframe
mdifftab <- data.frame()
mdifftab <- do.call(rbind.data.frame,
                  lapply(proplist,
                         function(i){
                           difffun(i)}))

colnames(mdifftab) <- c("mdiff","per")
mdifftab
#Analysis 2 - CCC-----
cccfun <- function(group){
  tab2 <-  CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group])[1] %>% as.data.frame %>% select('rho.c.est')
  tab2$per <- group
  tab2
}

cccfun(300)
#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
              "first80","middle80","last80")


#Combine into single dataframe
cortab <- data.frame()
cortab <- do.call(rbind.data.frame,
                  lapply(proplist,
                         function(i){
                           cccfun(i)}))

cortab$R <- cortab$.
cortab$. <- NULL
cortab

More detailed reporting

func4limitsofagreement <- function(group){
a <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group],conf.level = 0.99)[[5]]
meandiff.value <- -mean(a$delta)
meandiff.sd <- sd(a$delta)
meandiff.se <- meandiff.sd / sqrt(nrow(a))
meandiff.99 <- 2.58*meandiff.se
lolim99 <- -meandiff.value - 2.58*meandiff.sd
uplim99 <- -meandiff.value + 2.58*meandiff.sd
lim99.se <- meandiff.se * 1.71
lim99.99ci <- lim99.se * 2.58
c3 <- CCC(hygsum$prop[hygsum$measure=="all cows"], hygsum$prop[hygsum$measure==group],conf.level = 0.99)[[1]]
c3$ccc <- c3$est
c3$ccc.lwr.ci <- c3$lwr.ci
c3$ccc.upr.ci <- c3$upr.ci
cbind(group,c3 %>% select(ccc,ccc.lwr.ci,ccc.upr.ci),meandiff.value,meandiff.se,meandiff.99,lolim99,uplim99,lim99.se,lim99.99ci)

}

#Loop through proportions
proplist <- c(350,300,250,200,150,100,80,60,50,40,30,20,10,
              "first80","middle80","last80")


cortab2 <- data.frame()
cortab2 <- do.call(rbind.data.frame,
                  lapply(proplist,
                         function(i){
                           func4limitsofagreement(i)}))

cortab2
cortab2$ccc.sum <- paste0(sprintf("%.2f",round(cortab2$ccc, 2)), sep=""," (",
       sprintf("%.2f",round(cortab2$ccc.lwr.ci, 2)),", ",
       sprintf("%.2f",round(cortab2$ccc.upr.ci, 2)),")")

cortab2

Comparing risk of cows being dirty by order of milking

The variable ‘quant’ was used for this. For each milking the exact quantile that the cow came in. Results will be reported as a RR per quantile (i.e. relative change in risk for every 10% increase in position)

hyg$quant <- hyg$quant * 10
gee <- geeglm(score ~ quant,data=hyg,id=date,family=binomial(link="log"))
summary(gee)
## 
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"), 
##     data = hyg, id = date)
## 
##  Coefficients:
##             Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept) -1.03196  0.07433 192.8  < 2e-16 ***
## quant        0.02158  0.00622  12.1  0.00052 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1  0.0215
## Number of clusters:   35  Maximum cluster size: 406
hyg$quant2 <- hyg$quant^2
gee1 <- geeglm(score ~ quant + quant2,data=hyg,id=date,family=binomial(link="log"))
summary(gee1)
## 
## Call:
## geeglm(formula = score ~ quant + quant2, family = binomial(link = "log"), 
##     data = hyg, id = date)
## 
##  Coefficients:
##             Estimate  Std.err   Wald Pr(>|W|)    
## (Intercept) -1.06994  0.07223 219.43   <2e-16 ***
## quant        0.04312  0.01491   8.36   0.0038 ** 
## quant2      -0.00210  0.00141   2.20   0.1379    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = independence 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1  0.0214
## Number of clusters:   35  Maximum cluster size: 406
tab <- tidy(gee, conf.int=T)
tab$estimate <- (tab$estimate) %>% exp
tab$conf.low <- (tab$conf.low) %>% exp
tab$conf.high <- (tab$conf.high) %>% exp
tab$RR <- paste0(round(tab$estimate,2)," (",round(tab$conf.low,2),", ",round(tab$conf.high,2),")")
tab %>% select(term,RR)

RR = 1.02. Therefore, a cow that comes in 10% later is expected to have a 1.02 times the risk of being dirty (scores 3 or 4).

Sensitivity analysis using different covariance structures.

gee2 <- geeglm(score ~ quant,data=hyg,id=date,corstr = "ar1",
               family=binomial(link="log"))
summary(gee2)
## 
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"), 
##     data = hyg, id = date, corstr = "ar1")
## 
##  Coefficients:
##             Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept) -1.03231  0.07433 192.9  < 2e-16 ***
## quant        0.02163  0.00621  12.1  0.00049 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = ar1 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1  0.0215
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha    0.119  0.0241
## Number of clusters:   35  Maximum cluster size: 406
gee3 <- geeglm(score ~ quant,data=hyg,id=date,corstr = "exchangeable",
               family=binomial(link="log"))
summary(gee3)
## 
## Call:
## geeglm(formula = score ~ quant, family = binomial(link = "log"), 
##     data = hyg, id = date, corstr = "exchangeable")
## 
##  Coefficients:
##             Estimate  Std.err  Wald Pr(>|W|)    
## (Intercept) -1.03955  0.07482 193.1  < 2e-16 ***
## quant        0.02171  0.00625  12.1  0.00051 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation structure = exchangeable 
## Estimated Scale Parameters:
## 
##             Estimate Std.err
## (Intercept)        1  0.0236
##   Link = identity 
## 
## Estimated Correlation Parameters:
##       Estimate Std.err
## alpha   0.0774  0.0157
## Number of clusters:   35  Maximum cluster size: 406

Check for predicted probabilities outside the range of 0 to 1

hyg$predict <- predict(gee,hyg,type="response")
hist(hyg$predict)

range(hyg$predict, na.rm=T)
## [1] 0.356 0.442

All predicted probabilities were between 0 and 1 for each row in the current dataset.

#OLD ANALYSIS BELOW
set.seed(1)
#Create dataset of UHS to test necessary sample size
library(tidyr)
hyg <- uhscount
hyg$count <- seq(1,406)
hyg <- hyg[, c(36,1:35)]
hyg <- pivot_longer(hyg,-count, names_to = "date", values_to = "score",values_drop_na = TRUE)

library(janitor)
hyg$score %>% tabyl

#Reformat scores to 0 (scores 1/2) and 1 (scores 3 and 4 = dirty)
hyg$score[hyg$score<3] <- 0 
hyg$score[hyg$score>2] <- 1

#Sample first, middle and last 10% of the herd -----
maxperdate <- hyg %>% group_by(date) %>% summarise(max=max(count))
hyg <- merge(hyg,maxperdate,by="date")
hyg$quant <- hyg$count/hyg$max

#First, middle and last 10% groups
hyg$group <- NA
hyg$group[hyg$quant < 0.10000001] <- "First 10%"
hyg$group[hyg$quant > 0.45 & hyg$quant < 0.55] <- "Middle 10%"
hyg$group[hyg$quant > 0.900000] <- "Last 10%"

#Split into 10% groups
#hyg$groupq <- NA
#hyg$groupq[hyg$quant < 0.10000001] <- 0.1
#hyg$groupq[hyg$quant >= 0.1 & hyg$quant < 0.2] <- .2
#hyg$groupq[hyg$quant >= 0.2 & hyg$quant < 0.3] <- .3
#hyg$groupq[hyg$quant >= 0.3 & hyg$quant < 0.4] <- .4
#hyg$groupq[hyg$quant >= 0.4 & hyg$quant < 0.5] <- .5
#hyg$groupq[hyg$quant >= 0.5 & hyg$quant < 0.6] <- .6
#hyg$groupq[hyg$quant >= 0.6 & hyg$quant < 0.7] <- .7
#hyg$groupq[hyg$quant >= 0.7 & hyg$quant < 0.8] <- .8
#hyg$groupq[hyg$quant >= 0.8 & hyg$quant < 0.9] <- .9
#hyg$groupq[hyg$quant >= 0.9 & hyg$quant] <- 1

#Split into 20% groups (quintiles)
hyg$groupq <- NA
hyg$groupq[hyg$quant >= 0 & hyg$quant < 0.2] <- .2
hyg$groupq[hyg$quant >= 0.2 & hyg$quant < 0.4] <- .4
hyg$groupq[hyg$quant >= 0.4 & hyg$quant < 0.6] <- .6
hyg$groupq[hyg$quant >= 0.6 & hyg$quant < 0.8] <- .8
hyg$groupq[hyg$quant >= 0.8 & hyg$quant] <- 1


#Create function to estimate pdirty using 100, 90, 80 --> 10% of data
hyg %>% filter(hyg$date==42389) %>% select(score) %>% sample_frac(size = 0.1) %>% summarise(mean=mean(score))

pdirt1 <- function(dat,prop){
  hyg %>% filter(hyg$date==dat) %>% select(score) %>% sample_frac(size = prop,replace = FALSE) %>% summarise(mean=mean(score))
}

hyg %>% filter(hyg$date==42387 & group =="First 10%") %>% select(score) %>% summarise(mean=mean(score))



#Function for random selection of a given proportion of cows
pdirt1(dat=42389,prop=1)

#Function for first, middle and last 10%
pdirt3 <- function(dat,groupnam){
  hyg %>% filter(hyg$date==dat & group ==groupnam) %>% select(score) %>% summarise(mean=mean(score))}
pdirt3(42387,groupnam ="First 10%")

#Function for 10% groups (0-10, 11-20, etc)
pdirt4 <- function(dat,groupnam){
  hyg %>% filter(hyg$date==dat & groupq ==groupnam) %>% select(score) %>% summarise(mean=mean(score))}
pdirt4(42387,groupnam=.1)



pdirt2 <- function(dat1){
  p100 <- pdirt1(dat=dat1,prop=1)
  p90 <- pdirt1(dat=dat1,prop=0.9)
  p80 <- pdirt1(dat=dat1,prop=.8)
  p70 <- pdirt1(dat=dat1,prop=.7)
  p60 <- pdirt1(dat=dat1,prop=.6)
  p50 <- pdirt1(dat=dat1,prop=.5)
  p40 <- pdirt1(dat=dat1,prop=.4)
  p30 <- pdirt1(dat=dat1,prop=.3)
  p20 <- pdirt1(dat=dat1,prop=.2)
  p10 <- pdirt1(dat=dat1,prop=.1)
  p5 <- pdirt1(dat=dat1,prop=.05)
  
  pfirst10 <- pdirt3(dat=dat1,groupnam="First 10%")
  pmiddle10 <- pdirt3(dat=dat1,groupnam="Middle 10%")
  plast10 <- pdirt3(dat=dat1,groupnam="Last 10%")
  
  #pq10 <- pdirt4(dat=dat1,groupnam=0.1)
  pq20 <- pdirt4(dat=dat1,groupnam=0.2)
  #pq30 <- pdirt4(dat=dat1,groupnam=0.3)
  pq40 <- pdirt4(dat=dat1,groupnam=0.4)
  #pq50 <- pdirt4(dat=dat1,groupnam=0.5)
  pq60 <- pdirt4(dat=dat1,groupnam=0.6)
  #pq70 <- pdirt4(dat=dat1,groupnam=0.7)
  pq80 <- pdirt4(dat=dat1,groupnam=0.8)
  #pq90 <- pdirt4(dat=dat1,groupnam=0.9)
  pq100 <- pdirt4(dat=dat1,groupnam=1)
  
  #Differences between full dataset and sample
  pd90 <- p90 - p100
  pd80 <- p80 - p100
  pd70 <- p70 - p100
  pd60 <- p60 - p100
  pd50 <- p50 - p100
  pd40 <- p40 - p100
  pd30 <- p30 - p100
  pd20 <- p20 - p100
  pd10 <- p10 - p100
  pd5 <- p5 - p100

  pqd100 <- pq100 - p100  
  #pqd90 <- pq90 - p100
  pqd80 <- pq80 - p100
  #pqd70 <- pq70 - p100
  pqd60 <- pq60 - p100
  #pqd50 <- pq50 - p100
  pqd40 <- pq40 - p100
  #pqd30 <- pq30 - p100
  pqd20 <- pq20 - p100
  #pqd10 <- pq10 - p100
  
  pdfirst10 <- pfirst10 - p100
  pdmiddle10 <- pmiddle10 - p100
  pdlast10 <- plast10 - p100
  
  
  tab <- rbind(p100,p90,p80,p70,p60,p50,p40,p30,p20,p10,p5,
               pfirst10,pmiddle10,plast10,
               pq20,pq40,pq60,pq80,pq100,
               pd90,pd80,pd70,pd60,pd50,pd40,pd30,pd20,pd10,pd5,
               pdfirst10,pdmiddle10,pdlast10,
               pqd20,pqd40,pqd60,pqd80,pqd100)
  tab$per <- c(1,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.05,
               "pfirst10","pmiddle10","plast10",
               "pq20","pq40","pq60","pq80","pq100",
               "90diff","80diff","70diff","60diff","50diff","40diff","30diff","20diff","10diff","5diff",
               "first10diff","middle10diff","last10diff",
               "pqd20","pqd40","pqd60","pqd80","pqd100")
  tab$date <- dat1
  tab
}

print(pdirt2(42387))

#Create list of dates 
uhsdates <- unique(hyg$date)

#Loop through dates
#for (i in uhsdates) {
#  print(pdirt2(i))
#}

#Combine into single dataframe
hygsum <- data.frame()
hygsum <- do.call(rbind.data.frame,
                  lapply(unique(uhsdates),
                         function(i){
                           pdirt2(i)}))


hygsum <- hygsum[, c(3,2,1)]
hygsum$prop <- hygsum$mean
hygsum$mean <- NULL

Using differences

library(blandr)
library(DescTools)
library(stringr)
library(ggplot2)
#Analysis 1 - report differences and sd of differences ----
tabdiff <- hygsum %>% group_by(per) %>% summarise(mean=mean(prop), sd=sd(prop),hilow=Range(prop), 
                                                  q5=quantile(probs=0.05,prop),q95=quantile(probs=0.95,prop))
tabdiff
tabdiffplot1 <- hygsum %>% filter(str_detect(per, "diff") & per != "first10diff" & per != "middle10diff" & per != "last10diff")

tabdiffplot1$per <- recode_factor(tabdiffplot1$per,
                   "90diff" = "90%",
                   "80diff" = "80%",
                                      "70diff" = "70%",
                                      "60diff" = "60%",
                                      "50diff" = "50%",
                                      "40diff" = "40%",
                                      "30diff" = "30%",
                                      "20diff" = "20%",
                                      "10diff" = "10%",
                                      "5diff" = "5%",
                   .ordered = TRUE)

plot1 <- ggplot(tabdiffplot1, aes(x=per, y=prop)) +
  coord_flip(ylim = c(-.25,.25)) +
  geom_boxplot(aes(fill=per)) +
  geom_jitter(width = 0.1) +
  theme(legend.position="none") +
  labs(y = "Deviation from TRUE prevalence of dirty udders", x="Proportion of scores used") +
  theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
  theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
  theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) + 
  geom_hline(yintercept=0, linetype="solid", color = "red") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))

plot1

tiff("plot1.tiff", units="in", width=7, height=7, res=300)
plot1
dev.off()
tabdiff1 <-hygsum %>% filter(str_detect(per, "pqd"))

tabdiff1$per <- recode_factor(tabdiff1$per,
                   pqd100 = "5th quintile",
                   pqd80 = "4th quintile",
                   pqd60 = "3rd quintile",
                   pqd40 = "2nd quintile",
                   pqd20 = "1st quintile",
                   .ordered = TRUE)

plot2 <- ggplot(tabdiff1, aes(x=per, y=prop)) +
  geom_boxplot(aes(fill=per)) + 
  geom_jitter(width = 0.1) + 
  coord_flip(ylim = c(-.25,.25)) +
  theme(legend.position="none") +
  labs(y = "Deviation from TRUE prevalence of dirty udders", x="Stage of milking") +
  theme(axis.text.x = element_text(size=12, family="Times"),axis.title.x = element_text(size=15, family="Times",face="bold")) +
  theme(axis.text.y = element_text(size=15, family="Times"),axis.title.y = element_text(size=15, family="Times",face="bold")) +
  theme(plot.title = element_text(size=15, family="Times",hjust = 0.5)) +
  geom_hline(yintercept=0, linetype="solid", color = "red") +
  theme(panel.border = element_rect(colour = "black", fill=NA, size=1),
        panel.background = element_rect(fill = "white", colour = "white",size = 0.5, linetype = "solid"),
        panel.grid.major = element_line(size =0, linetype = 'solid', colour = "grey"),
        panel.grid.minor = element_line(size = 0, linetype = 'solid',colour = "grey"))

plot2

tiff("plot2.tiff", units="in", width=7, height=4, res=300)
plot2
dev.off()
#Analysis 2 - limits of agreement -----

difffun <- function(group){
a <- CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per==group])[5] %>% as.data.frame %>% select(blalt.delta) %>%
  summarise(mean = mean(blalt.delta),
            conf.low = mean(blalt.delta) - 1.96*sd(blalt.delta),
            conf.high = mean(blalt.delta) + 1.96*sd(blalt.delta))

a$mean <- a$mean * -1
a$conf.low <- a$conf.high * -1
a$conf.high <- a$conf.low * -1

b <- paste0(sprintf("%.2f",round(a$mean, 2)), sep=""," (",
       sprintf("%.2f",round(a$conf.low, 2)),", ",
       sprintf("%.2f",round(a$conf.high, 2)),")")
b$per <- group
b
}


#Loop through proportions
proplist <- c(0.9,.8,.7,.6,.5,.4,.3,.2,.1,.05,
              "pq20","pq40","pq60","pq80","pq100")

#for (i in proplist) {
#  print(corfun(i))
#}

#Combine into single dataframe
mdifftab <- data.frame()
mdifftab <- do.call(rbind.data.frame,
                  lapply(proplist,
                         function(i){
                           difffun(i)}))

colnames(mdifftab) <- c("mdiff","per")
mdifftab
#Analysis 2 - CCC-----
CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per=="plast10"])


cccfun <- function(group){
  tab2 <-  CCC(hygsum$prop[hygsum$per==1], hygsum$prop[hygsum$per==group])[1] %>% as.data.frame %>% select('rho.c.est')
  tab2$per <- group
  tab2
}

cccfun(0.05)

#Loop through proportions
proplist <- c(0.9,.8,.7,.6,.5,.4,.3,.2,.1,.05,
              "pq20","pq40","pq60","pq80","pq100")

#for (i in proplist) {
#  print(corfun(i))
#}

#Combine into single dataframe
cortab <- data.frame()
cortab <- do.call(rbind.data.frame,
                  lapply(proplist,
                         function(i){
                           cccfun(i)}))

cortab$R <- cortab$.
cortab$. <- NULL
cortab