Measles seroprevalence data

Reading and cleaning the data:
Plotting IgG concentration distributions by age class
Sensitivity of the seroprevalence respective to the cutoff value
Seroprevalence as a function of age

Reading and cleaning the data:

Reading the 3 excel data files. Note that, to make the function read_excel work on the Age16-34.xlsx file, we had to add a a letter before the first numerical value of the Concentration column.

> library(readxl) # for "read_excel"
> data1 <- read_excel("../data/Boy&Girl5.xlsx","Boy&Girl5")
> data2 <- read_excel("../data/6-10.xlsx","Result")
> data3 <- read_excel("../data/Age16-34.xlsx","Result",na="N/A")

Merging the 3 sources of data:

> #----- data1:
> names(data1) <- sub(" ","",names(data1)) # removing trailing spaces.
> #----- data3:
> names(data3) <- sub("Subject","",names(data3)) # "age" column of data3.
> data3$Gender <- "Female" # "gender" column in data3.
> data3$Concentration[1] <- sub("a","",data3$Concentration[1])
> #----- selecting, merging, cleaning:
> var <- c("Location","Gender","Age","Concentration") # the variables we want.
> data <- rbind(data1[,var],data2[,var],data3[,var]) # merging.
> rm(data1,data2,data3,var) # cleaning.

Putting the dataset in good shape:

> data <- na.exclude(data) # remove missing values.
> names(data) <- tolower(names(data)) # put column names in small letters.
> # The locations:
> sites <- c("14"="Dak Lak","18"="Hue","23"="Hanoi","24"="Ho Chi Minh")
> data$location <- as.factor(sites[as.character(data$location)])
> # The genders:
> data$gender <- as.factor(tolower(data$gender))
> # The age classes:
> tmp1 <- 1:10
> tmp2 <- 17:20
> tmp3 <- c(25,30,35)
> age_class <- cbind(rbind(tmp1-1,tmp1),rbind(tmp2-1,tmp2),rbind(tmp3-5,tmp3))
> age_class <- as.list(data.frame(age_class))
> names(age_class) <- c(1:10,17:23)
> rm(tmp1,tmp2,tmp3)
> # The age:
> data$age <- ordered(data$age)
> # The concentrations:
> sel <- which(data$concentration=="NA") # deal with missing values.
> data$concentration[sel] <- NA
> sel <- grep(">",data$concentration) # remove ">5000".
> data$concentration[sel] <- 5001
> data$concentration <- as.numeric(data$concentration) # convert to numeric.
> sel <- which(data$concentration>5000) # deal with the values above 5000.
> data$concentration[sel] <- 5001
> data <- na.exclude(data) # remove missing values.
> rownames(data) <- NULL # deal with row names.
> attr(data,"na.action") <- NULL # remove useless attributes.
> data <- as.data.frame(data) # remove useless class.
> # Cleaning:
> rm(sel)

The data now looks like this:

> str(data)
'data.frame':   3636 obs. of  4 variables:
 $ location     : Factor w/ 4 levels "Dak Lak","Hanoi",..: 3 3 2 2 2 2 4 4 1 3 ...
 $ gender       : Factor w/ 2 levels "female","male": 1 1 2 2 2 2 2 2 2 2 ...
 $ age          : Ord.factor w/ 17 levels "1"<"2"<"3"<"4"<..: 5 2 3 4 2 3 4 4 1 5 ...
 $ concentration: num  5001 5001 5001 5001 5001 ...
> head(data)
     location gender age concentration
1 Ho Chi Minh female   5          5001
2 Ho Chi Minh female   2          5001
3       Hanoi   male   3          5001
4       Hanoi   male   4          5001
5       Hanoi   male   2          5001
6       Hanoi   male   3          5001
> tail(data)
        location gender age concentration
3631 Ho Chi Minh female  18     1381.8191
3632 Ho Chi Minh female  19      109.2167
3633 Ho Chi Minh female  21      572.5138
3634 Ho Chi Minh female  22      519.4086
3635 Ho Chi Minh female  22     3212.3442
3636 Ho Chi Minh female  23      789.7214

Plotting IgG concentration distributions by age class

This function estimates the density of the distribution of the log concentration of the IgG for each age class. It also structures the output in a way that can be easily plotted in 3 dimensions.

> density3d <- function(data,f=log,from=0,to=10,nb=512) {
+ # "data" is a dataframe with one "age" column and one "concentration" column.
+ # "age" is coded by classes with are defined by the "age_class" vector.
+ # "f" is the transformation applied to the data.
+ # "from" and "to" are the range over which to estimate the densities.
+ # "nb" is the number of equally spaced points at which the density is to be
+ # estimated (see the "density" function).
+ # It returns a list of 3 matrices each with "nb" rows and the number of columns
+ # equal to the number of age classes. The 3 matrices contain the values of
+ # log(concentration), age, and density of the log(concentration) by age.
+   age <- as.character(sort(unique(data$age)))
+   n <- length(age) # the number of age classes.
+ # Estimates the densities of IgG concentrations for each age class:
+   densities <- lapply(age,function(x)
+     with(subset(data,age==x),density(f(concentration),n=nb,from=from,to=to)))
+ # Puts the result in good shape:
+   x <- sapply(densities,function(x)x$x) # log(concentration)
+   z <- sapply(densities,function(x)x$y) # density
+   y <- rep(sapply(age_class,mean)[age],each=nb) # calculates ages from age classes.
+   x <- matrix(x,ncol=n) # log(concentration)
+   y <- matrix(y,ncol=n) # age
+   z <- matrix(z,ncol=n) # density
+ # Returns the output:
+   return(list(logconc=x,age=y,density=z))
+ }
> # Let's try it:
> data2 <- density3d(data)

A first way to vizualize the distributions of IgG concentrations is to plot the densities in 3 dimensions thanks to the following function:

> plot_densities3d <- function(data,ylim=c(0,35),zlim=c(0,.36),phi=20,
+                            theta=-30,expand=15,eps=1.5) {
+ # "data" is an output of the "density3d" function. The other parameters are
+ # parameters of the "polygon3D" function of the "plot3D" package.
+   require(plot3D) # for "polygon3D"
+   with(data,{
+     n <- ncol(logconc) # this is the number of age classes.
+ # We first plot the IgG concentration distribution of the last age class:
+     polygon3D(eps*logconc[,n],age[,n],density[,n],border="black",ylim=ylim,
+               zlim=zlim,phi=phi,theta=theta,xlab="log(concentration)",
+               ylab="age",zlab="density",scale=F,expand=expand)
+ # We then plot the othe age classes, starting from the highest one to the
+ # smallest one.
+     for(i in (n-1):1)
+       polygon3D(eps*logconc[,i],age[,i],density[,i],border="black",add=T)
+   })
+ #  polygon3D(rep(log(275),4),c(-33,33,33,-33),c(0,0,.63,.63),add=T,col=rgb(0,0,1,.1))
+ #  polygon3D(rep(log(200),4),c(-33,33,33,-33),c(0,0,.63,.63),add=T,col=rgb(0,0,1,.1))
+ }
> # Let's try it:
> plot_densities3d(data2)

The number of samples per age class is not constant:

> table(data$age)

  1   2   3   4   5   6   7   8   9  10  17  18  19  20  21  22  23 
195 213 211 213 213 213 212 201 201 210 129 130 151 155 311 345 333

In order to see whether this could affect the estimated distribution, we re-sampled all the classes with the lowest number of samples (129). It appeared that it did not affect the shape of the estimated distributions (not shown). The pattern observed on the above figure thus seems fairly robuest. We can alternatively plot the distributions of the IgG concentrations in 2 dimensions. The following function re-arrange the data in order to use the image function:

> density2d <- function(data) {
+ # "data" is an output of the "density3d" function.
+   age <- min(data$age):max(data$age) # age values incremented from the minimal to the maximal value by step of 1.
+   l <- length(age) # the number of age values.
+ # The template of the density matrix made of NA:
+   density <- matrix(NA,nrow=nrow(data$density),ncol=l)
+ # We will fill the "density" matrix only for the age classes for which we have data:
+   crit <- age %in% data$age[1,]
+ # Let's fill the matrix:
+   ind <- 1
+   density0 <- data$density
+   for(i in 1:l) { # for each age class...
+     if(crit[i]) { # ... we fill the density matrix only if we have data.
+       density[,i] <- density0[,ind]
+       ind <- ind + 1
+     }
+   }
+ # Returns the output:
+   return(list(logconc=data$logconc[,1],age=age,density=density))
+ }
> # Let's try it:
> data3 <- density2d(data2)

The last 3 data classes are 5-year wide whereas the other ones are 1-year wide. The following function extend the values of the last 3 age classes to their 5 years width.

> density2d2 <- function(data) {
+ # "data" is an output of the "density2d2" function.
+   density <- data$density
+ # Adding the 2 missing years to age and density:
+   age <- c(data$age,33.5,34.5)
+   for(i in 1:2) density <- cbind(density,NA)
+   for(x in c(22.5,27.5,32.5)) { # for each of the last 3 age classes
+     y <- which(age==x)
+     density[,y+c(-2,-1,1,2)] <- density[,y] # extend the values to the whole width of the age class.
+   }
+ # Replacing the updated age and density:
+   data$age <- age
+   data$density <- density
+ # Returns the output:
+   return(data)
+ }
> # Let's try it:
> data3 <- density2d2(data3)

This function plots the densities of the log concentrations in 2 dimensions:

> plot_densities2d <- function(data,f=heat.colors,n=100) {with(data,{
+ # "data" is an output of the "density2d" or the "density2d2" functions.
+ # First we plot a background of missing values:
+   density2 <- density
+   density2[,] <- 1
+   image(logconc,age,density2,xlab="log(concentration)",ylab="age",col="lightgrey")
+ # Then we plot the non-missing values:
+   image(logconc,age,-density,add=T,col=f(n))
+ # Add the cutoff values:
+   abline(v=log(c(200,275)),col="lightgrey")
+ # And we add a bounding box:
+   box(bty="o")
+ })}
> # Let's try it:
> plot_densities2d(data3)

Sensitivity of the seroprevalence respective to the cutoff value

The concentration of IgG is used as correlate of immunity. However, even when the IgG concentration is expressed in international units (IU), the cutoff value to discriminate immunes from susceptibles varies quite a lot from publication to publication. Here we investigate how sensitive the seroprevalence is respective to the choice of the cutoff value. The following function calculates the seroprevalence with confidence interval for different values of cutoff.

> calc_seroprev <- function(cutoff,data) {
+ # "cutoff" is a vector of cutoff values.
+ # "data" is a dataframe with a "concentration" column.
+   n <- nrow(data) # number of trials
+   seroprev <- function(cutoff) {
+     tmp <- prop.test(sum(data$concentration>cutoff),n)
+     return(with(tmp,c(estimate,conf.int))) # returns estimate + confidence interval.
+   }
+ # Returns the output:
+   sero <- sapply(cutoff,seroprev)
+   return(list(cutoff=cutoff,sero=sero))
+ }
> # Let's try it:
> cutoff <- seq(200,275,le=512)
> seroprevalence <- calc_seroprev(cutoff,data)

This function plots the result of the previous function, performing a linear smoothing of the data:

> plot_seroprev <- function(data,col1="blue",col2=rgb(0,0,1,.2),add=F) {
+ # "data" is an output of the "calc_seroprev" function.
+ # the background of the graph:
+   with(data,{
+     if(!add) {
+       plot(cutoff,sero[1,],type="n",ylim=0:1,xaxs="i",yaxs="i",ylab="seroprevalence")
+       abline(v=seq(200,275,10),col="lightgrey")
+       abline(h=seq(0,1,.05),col="lightgrey")
+       abline(v=seq(200,275,50),col="grey")
+       abline(h=seq(0,1,.2),col="grey")
+     }
+ # The confidence interval:
+     polygon(c(cutoff,rev(cutoff)),c(sero[2,],rev(sero[3,])),col=col2,border=NA)
+     abline(lm(sero[1,]~cutoff),col=col1)
+     for(i in 2:3) abline(lm(sero[i,]~cutoff),col=col1,lty=2)
+   })
+ # Adding a bounding box:
+   box(bty="o")
+ }
> # Let's try it:
> plot_seroprev(seroprevalence)

It shows that the seroprevalence varies between 80 and 79 % depending on the cutoff value. Even 79 % is pretty low for measles. We can now explore whether this sensitivity depends on the age class:

> # This function calculates the seroprevalence for a given cutoff "c".
> # "data" is a dataframe with a "concentration" column.
> fct <- function(data,c) return(sum(data$concentration>c)/nrow(data))
> fct <- Vectorize(fct,"c") # vectorize the function for cutoff values.
> age <- data$age
> s <- by(data,age,function(x)fct(x,cutoff)) # seroprevalences by age and cutoff.
> a <- rep(names(s),each=length(cutoff)) # age vector.
> c <- rep(cutoff,length(s)) # cutoff vector.
> s2 <- unlist(s)
> m <- lm(s2~c*a)
> anova(m)
Analysis of Variance Table

Response: s2
            Df Sum Sq Mean Sq  F value    Pr(>F)    
c            1  2.816  2.8155 133745.2 < 2.2e-16 ***
a           16 98.015  6.1259 290999.3 < 2.2e-16 ***
c:a         16  0.418  0.0262   1242.3 < 2.2e-16 ***
Residuals 8670  0.183  0.0000                       
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This shows that the seroprevalence depends not only on the cutoff as shown above, but also on the age class considered. Moreover, there is a significant interaction between age and cutoff. We can illustrate this:

> # The background of the plot:
> plot(NULL,xlim=range(cutoff),ylim=0:1,xlab="cutoff",ylab="seroprevalence",xaxs="i")
> abline(v=seq(150,350,10),col="lightgrey")
> abline(h=seq(0,1,.05),col="lightgrey")
> abline(v=seq(150,350,50),col="grey")
> abline(h=seq(0,1,.2),col="grey")
> # One line per age class, except the first and last one:
> for(i in 2:16) abline(lm(s[[i]]~cutoff),lwd=1)
> # One line for the first and last age classes:
> for(i in c(1,17)) abline(lm(s[[i]]~cutoff),col="blue",lwd=1.5)
> # Adding bounding box:
> box(bty="o")
> # Adding the previous plot where all the age classes are pooled together:
> plot_seroprev(seroprevalence,add=T)

> # Cleaning:
> rm(fct,i,s,s2,a,c)

We can see that the seroprevalence is pretty low for the first age class: between 42 and 58 %, which is fairly low for measles, especially for Vietnam with such a high birth rate (Grenfell and collaborators’ results show that measles dynamics depends not only on the vaccine coverage but also on the birth rate).

Seroprevalence as a function of age

Let’s investigate now the seroprevalence as a function of age. The following function plots the seroprevalence with confidence interval as a function of age, for a given value of the cutoff.

> plot_sero_age <- function(data,c,col="black",add=F,xlim=NULL,v=5,h=.1) {
+ # "data" is a dataframe with an "age" and "concentration" column.
+ # "c" is the value of the cutoff.
+ # "col" is the color of the points and confidence intervals.
+ # This function returns the seroprevalence with confidence interval for a given
+ # value of the cutoff.
+ # "data" is a dataframe with a "concentration" column.
+ # "c" is the cutoff value.
+   fct <- function(data,c) {
+     tmp <- prop.test(sum(data$concentration>c),nrow(data))
+     return(c(tmp$estimate,tmp$conf.int))
+   }
+   data$age <- factor(data$age)
+   serop <- by(data,data$age,fct,c,simplify=T)
+ # Calculating the age values from age classes:
+   ageval <- sapply(age_class,mean)[names(serop)]
+ # The background of the plot:
+   if(!add) {
+     if(is.null(xlim)) xlim <- range(ageval)+c(-.5,.5)
+     plot(NULL,xlim=xlim,ylim=0:1,xlab="age",ylab="seroprevalence",xaxs="i",yaxs="i")
+     abline(v=seq(0,35,v),col="lightgrey")
+     abline(h=seq(0,1,h),col="lightgrey")
+     box(bty="o")
+   }
+ # The confidence intervals:
+   arrows(ageval,sapply(serop,function(x)x[2]),ageval,
+          sapply(serop,function(x)x[3]),.04,90,3,col=col)
+ # The point estimations:
+   points(ageval,sapply(serop,function(x)x[1]),pch=19,col=col)
+ }
> # Let's try it:
> orange <- rgb(222,141,8,max=255)
> blue <- rgb(72,165,226,max=255)
> plot_sero_age(data,200,orange)
> plot_sero_age(data,275,blue,T)
> legend("bottomright",paste("cutoff=",c(200,275)),col=c(blue,orange),pch=19,
+        bty="o",lty=1,box.lty=0)
> box(bty="o")

The effect of age on seroprevalence does not seem to depend much of the exact cutoff value. Let’s model that for the sample of age below 11 years with a logistic regression and see whether there is effect of the site and/or the gender.

> # Adding the age values to the dataframe data from the age class information:
> data$age_val <- sapply(age_class,mean)[data$age]
> # The model for a given value of the cutoff and for age below 11 years:
> mod <- glm((concentration>200)~location+gender+age_val+location:age_val+gender:age_val,binomial,subset(data,age_val<11))
> anova(mod,test="LRT")
Analysis of Deviance Table

Model: binomial, link: logit

Response: (concentration > 200)

Terms added sequentially (first to last)

                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                              2081     2474.8              
location          3  28.6154      2078     2446.2 2.697e-06 ***
gender            1   0.2231      2077     2445.9   0.63666    
age_val           1  25.2511      2076     2420.7 5.033e-07 ***
location:age_val  3   6.8221      2073     2413.9   0.07779 .  
gender:age_val    1   2.2053      2072     2411.7   0.13753    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This means that, for ages below 11 years, the seroprevalence depends not only on the age, but also on the location. However, the two effects (age and location) are additive and there is no interaction between them. Let’s try with other values of cutoff:

> mod <- glm((concentration>275)~location+gender+age_val+location:age_val+gender:age_val,binomial,subset(data,age_val<11))
> anova(mod,test="LRT")
Analysis of Deviance Table

Model: binomial, link: logit

Response: (concentration > 275)

Terms added sequentially (first to last)

                 Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                              2081     2683.8              
location          3  21.1763      2078     2662.6 9.677e-05 ***
gender            1   1.4909      2077     2661.1 0.2220766    
age_val           1  13.6936      2076     2647.4 0.0002152 ***
location:age_val  3   4.7793      2073     2642.6 0.1886915    
gender:age_val    1   0.7654      2072     2641.8 0.3816592    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s now investigate in a systematic way the effets of location and age as the cutoff value varies. The following function returns the p-values of the location and age effects for a given value of the cutoff:

> fct <- function(cutoff,dataset) {
+ # "cutoff" is the value of the cutoff.
+ # "dataset" is a dataframe with concentration, location, gender and age_val
+ # columns.
+   mod <- glm((concentration>cutoff)~location+gender+age_val+location:age_val+
+                gender:age_val,binomial,subset(dataset,age_val<11))
+   modtest <- anova(mod,test="LRT")
+   return(modtest["Pr(>Chi)"][-1,1])
+ }
> # Let's try it:
> fct(275,data)
[1] 0.0000967650 0.2220765785 0.0002151837 0.1886915454 0.3816591934
> # Let's plot it systematically:
> pval <- sapply(cutoff,fct,data)
> plot(NULL,xlab="cutoff",ylab="log10(p-value)",xlim=c(200,280),ylim=log10(c(min(pval[c(1,3),]),max(.05,max(pval[c(1,3),])))))
> points(cutoff,log10(pval[3,]),col=orange)
> points(cutoff,log10(pval[1,]),col=blue)
> abline(h=log10(.05))
> legend("bottomright",c("location effect","age effect"),
+        col=c(blue,orange),pch=1,bty="n")

> # And now for the gender effect and the interactions:
> plot(NULL,xlab="cutoff",ylab="log10(p-value)",xlim=c(200,280),ylim=log10(c(min(.05,min(pval[-c(1,3),])),max(pval[-c(1,3),]))))
> points(cutoff,log10(pval[2,]),col=orange)
> points(cutoff,log10(pval[4,]),col=blue)
> points(cutoff,log10(pval[5,]),col="grey")
> abline(h=log10(.05))
> legend("topright",c("gender","age:location","age:gender"),
+        col=c(orange,blue,"grey"),pch=1,bty="n")

Now, let’s see in more detail the location variable. Its default contrast reads like

> contrasts(data$location)
            Hanoi Ho Chi Minh Hue
Dak Lak         0           0   0
Hanoi           1           0   0
Ho Chi Minh     0           1   0
Hue             0           0   1

meaning that Dak Lak is taken as a point of reference to which the 3 other provinces are compared to. We can probably choose better contrasts for this variable. For that, let’s perform a hierarchical clustering on the seroprevalences by province:

> # First we define a function that calculate the seroprevalence for a given value of the cutoff:
> fct <- function(data,c) return(sum(data$concentration>c)/nrow(data))
> # "data" is a data frame with a "concentration" column and "c" is the cutoff value. Let's use it to calculate the prevalence for the 4 locations and 2 extreme values of cutoff
> data1 <- subset(data,age_val<11)
> a <- sapply(c(200,275),function(x)by(data1,data1$location,fct,x))
> # Let's see what kind of grouping it gives:
> tree200 <- hclust(dist(a[,1]))
> plot(tree200,main=NULL,ann=F)
> mtext("height",2,1.5)

> # and
> tree275 <- hclust(dist(a[,2]))
> plot(tree275,main=NULL,ann=F)
> mtext("height",2,1.5)

> tree200$merge
     [,1] [,2]
[1,]   -1   -3
[2,]   -2   -4
[3,]    1    2

Let’s now explore this more systematically:

> # This function calculates the seroprevalences by location, performs a
> # hierarchical clustering on it and returns the matrix of the tree.
> fct <- function(cutoff,dataset) {
+   fct0 <- function(data,c) return(sum(data>c)/length(data))
+   tmp <- tapply(dataset$concentration,dataset$location,fct0,cutoff)
+   return(as.vector(t(hclust(dist(tmp))$merge)))
+ }
> # Let's try it:
> fct(200,data1)
[1] -1 -3 -2 -4  1  2
> out <- t(sapply(cutoff,fct,data1))
> out2 <- apply(out,1,paste,collapse="/")
> (tab <- table(out2))
out2
-1/-3/-2/-4/1/2 
            512

All the trees are the same. Consequently, let’s define the location with other contrasts:

> levels(data$location)
[1] "Dak Lak"     "Hanoi"       "Ho Chi Minh" "Hue"        
> data$location2 <- data$location
> contrasts(data$location2) <- cbind(north_south=c(1,-1,1,-1),
+                                    north=c(0,1,0,-1),south=c(1,0,-1,0))
> contrasts(data$location2)
            north_south north south
Dak Lak               1     0     1
Hanoi                -1     1     0
Ho Chi Minh           1     0    -1
Hue                  -1    -1     0
> mod2 <- glm((concentration>200)~location2+age_val,binomial,subset(data,age_val<11))
> summary(mod2)

Call:
glm(formula = (concentration > 200) ~ location2 + age_val, family = binomial, 
    data = subset(data, age_val < 11))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.9114  -1.3135   0.7225   0.8407   1.0830  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)           0.53698    0.09748   5.509 3.61e-08 ***
location2north_south  0.23211    0.05052   4.595 4.34e-06 ***
location2north        0.12288    0.06471   1.899   0.0576 .  
location2south       -0.04331    0.07811  -0.554   0.5793    
age_val               0.08830    0.01764   5.006 5.55e-07 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2474.8  on 2081  degrees of freedom
Residual deviance: 2420.7  on 2077  degrees of freedom
AIC: 2430.7

Number of Fisher Scoring iterations: 4

Let’s see what it gives for a cutoff of 275:

> mod3 <- glm((concentration>275)~location2+age_val,binomial,subset(data,age_val<11))
> summary(mod3)

Call:
glm(formula = (concentration > 275) ~ location2 + age_val, family = binomial, 
    data = subset(data, age_val < 11))

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.6873  -1.3394   0.8261   0.9308   1.1285  

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)          0.351464   0.093445   3.761 0.000169 ***
location2north_south 0.199962   0.047169   4.239 2.24e-05 ***
location2north       0.066461   0.062006   1.072 0.283784    
location2south       0.007312   0.071604   0.102 0.918668    
age_val              0.062025   0.016533   3.752 0.000176 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2683.8  on 2081  degrees of freedom
Residual deviance: 2648.4  on 2077  degrees of freedom
AIC: 2658.4

Number of Fisher Scoring iterations: 4

So the difference between the two provinces of the north seems to diminish when the cutoff increases. Let’s investigate that in a systematic way:

> output <- sapply(cutoff,function(x){
+   mod3 <- glm((concentration>x)~location2+age_val,binomial,subset(data,age_val<11))
+   return(coef(summary(mod3))[2:4,4])
+ })
> plot(cutoff,output[1,],ylab="p-value",col=orange,ylim=0:1)
> points(cutoff,output[2,],col="grey")
> points(cutoff,output[3,],col=blue)
> abline(h=.05)
> legend("topleft",c("north vs south","Hanoi vs Hue","HCM vs Dak Lak"),col=c(orange,"grey",blue),pch=1,bty="n")

So, (i) the north is always signifcantly different from the south, whatever the cutoff value (orange dots), (ii) the provinces from the south are always very similar, whatever the cutoff value (blue dots), and (iii) the difference between the provinces of the north increases with the cutoff value and is marginally significant for all the cutoff values below 250 (grey dots).

> for(i in c(200,275)) {
+   par(mfrow=c(1,4))
+   for(j in sites[c(2,3,1,4)]) {
+     plot_sero_age(subset(data,location==j & age_val<11),i,orange,v=1)
+     mtext(j,cex=.85)
+   }
+ }