ST 590 HW 4

The Data

The ca20 dataset in the geoR package contains the calcium content measured in soil samples taken from the 0-20cm layer at 178 locations within a certain study area divided in three sub-areas. The elevation at each location was also recorded.

The first region is typically flooded during the rain season and not used as an experimental area. The calcium levels would represent the natural content in the region. The second region has received fertilisers a while ago and is typically occupied by rice fields. The third region has received fertilisers recently and is frequently used as an experimental area.

Load geoR package and the calcium content data

library(geoR)
data(ca20)

part (a)

To create a plot showing the relationship between the calcium content and the area covariate, I first convert the ca20 data into a dataframe in order to use ggplot2

Note: The widths of boxplots created for each area value are proportional to the square roots of the number of observations in the data that have that area value.

library(ggplot2)
calcium <- as.data.frame(ca20$coords)
calcium$values <- ca20$data
calcium <- cbind(calcium,ca20$covariate)
calcium <- calcium[,-4] # remove elevation

ggplot(calcium,aes(x=area,y=values,colour=area)) + 
   geom_boxplot(varwidth=TRUE,show.legend=FALSE) +
    labs(title="Calcium Content Values by Area") +
    xlab("Area") +
    ylab("Calcium Content Values (mmol / liter)")

part (b)

Here, variofit() is used to fit (estimate parameters of) an exponential covariance model with a nugget to the empirical variogram of the data, which is easily computed with variog() since it’s a geoR dataset.

# empirical variogram of the data
calcium_vg <- variog(ca20)

## variog: computing omnidirectional variogram

# fit exponential cov model with nugget
vfit <- variofit(calcium_vg, ini.cov.pars = c(140,300), cov.model = "exponential")

## variofit: covariance model used is exponential 
## variofit: weights used: npairs 
## variofit: minimisation function used: optim

I now use the fitted parameter values to construct a covariance matrix for the data using the varcov.spatial() function in geoR

# the covariance matrix is saved as calcium_cov_matrix$varcov
calcium_cov_matrix <- varcov.spatial(coords = as.matrix(ca20$coords), cov.model = "exponential", nugget = vfit$nugget, cov.pars = vfit$cov.pars)

Sigma <- calcium_cov_matrix$varcov

# could've also calculated the covariance matrix manually and got same results
#distmat = as.matrix(dist(ca20$coords,upper=TRUE))
#Sigma = vfit$cov.pars[1]*exp(-distmat/vfit$cov.pars[2])+ vfit$nugget*diag(dim(distmat)[1])

Now, I compute the generalized least squares estimates of the regression coefficients for area (treated as a factor) as well as the confidence intervals for the estimates. I use a significance level of alpha = 0.05

# design matrix A 
A <- model.matrix(~ area,calcium) 


# compute generalized least squares estimates of regression coefficients for area (i.e. compute beta hat)
betahat <- solve(t(A) %*% solve(Sigma,A) , t(A) %*% solve(Sigma,calcium$values))

# compute residuals
resids <- calcium$values - A %*% betahat

# compute sigma hat (estimate of population variance) 
sigmahat <- as.numeric(sqrt(t(resids) %*% solve(Sigma,resids) / 175))

# use sigmahat to compute std. error used in confidence intervals
stderror <- ((sigmahat)^2)*(solve(t(A) %*% solve(Sigma,A) ))

# compute confidence intervals
# se_beta_i = sqrt(stderror[i,i]) 
beta0ci <- betahat[1] + c(-1,1)*qnorm(0.975)*sqrt(stderror[1,1])
beta1ci <- betahat[2] + c(-1,1)*qnorm(0.975)*sqrt(stderror[2,2])
beta2ci <- betahat[3] + c(-1,1)*qnorm(0.975)*sqrt(stderror[3,3])

regression parameter estimates and confidence intervals:

Regression estimate for beta0 (intercept): 42.4293114

Regression estimate for beta1 (area2): 6.1212713

Regression estimate for beta2 (area3): 11.5403218

Confidence Interval for beta0: [27.9971948 , 56.861428]

Confidence Interval for beta1: [-5.4416992 , 17.6842417]

Confidence Interval for beta2: [0.0844414 , 22.9962023]

part (c)

Here’s the residual plot for the fitted model:

plot(as.numeric(calcium$area),resids,ylab = "Residuals", xlab = "Area",main = "Residual Plot")
abline(0,0)

The residuals are ideal for area 3 (random scatter around 0), and below 0 on average for area 1 and area 2(i.e. the model on average is predicting calcium content values too high for area 1 and area 2).

part (d)

Using the coordinates in ca20_missinglocs.txt file, I now form the full covariance matrix (data + missing values)

calcium2 <- calcium[,-3] # to match variables in missinglocs.txt

# load in missing data
ca20missing = read.table('http://www4.stat.ncsu.edu/~guinness/ST590_2016/ca20_missinglocs.txt',header=F)
names(ca20missing) <- names(calcium2)
ca20missing$east <- as.numeric(ca20missing$east)
ca20missing$north <- as.numeric(ca20missing$north)
ca20missing$area <- as.factor(ca20missing$area)

n1 = dim(calcium2)[1]      # number rows for non-missing locations
n2 = dim(ca20missing)[1]  # number rows of missing locations

inds1 = 1:n1
inds2 = (n1+1):(n1+n2)

# form full covariance matrix
distmat <- as.matrix( dist( rbind(calcium2[,1:2],ca20missing[,1:2]),upper=T) )
newSigma <- vfit$cov.pars[1]*exp(-distmat/vfit$cov.pars[2])+ vfit$nugget*diag(dim(distmat)[1]) 

Sigma11 <- newSigma[inds1,inds1]
Sigma12 <- newSigma[inds1,inds2]
Sigma21 <- newSigma[inds2,inds1]
Sigma22 <- newSigma[inds2,inds2]

# plot covariance matrix
# library(fields)
# image.plot(newSigma[,seq(n1+n2,1,by=-1)]) 


# computations necessary for conditional expectation of forecast
mean1 <- mean(A %*% betahat) # mean vector for non-missing locations
A2 <- model.matrix(~ area,ca20missing) # design matrix for missing locations
mean2 <- mean(A2 %*% betahat) # mean vector for missing locations

# compute conditional expectation
mu2 <- mean2 + Sigma21%*%solve(Sigma11,(calcium$values - mean1))

# Conditional Co-variance of forecast
sig2 <- Sigma22 - Sigma21%*%solve(Sigma11,Sigma12)

I now make two maps: (1) with the original data and expected values of the missing data, and (2) with the square roots of the diagonal of the conditional covariance matrix (the standard deviations of the conditional distribution).

# plot 1
ca20missing$values <- NA
both <- rbind(calcium, ca20missing) 
both$values[179:288] <- as.numeric(mu2) #fill in missing response values with expected


ggplot(both,aes(x=east,y=north,colour=area,size=values)) +
    geom_point() + 
    scale_colour_manual(values=c("firebrick3","navy","gold"),name="Sub-area") +
    scale_size(guide=guide_legend(title="Calcium Content \n (mmol / liter)")) +
    theme(legend.position="right") +
    labs(title="Location and Calcium Content Values of ca20 Data \n Original plus Missing Data Expected Values ") +
    xlab("Eastern Soil Sample Location Coordinates") +
    ylab("Northern Soil Sample Location Coordinates")

# plot 2
missing2 <- cbind(ca20missing,sqrt(diag(sig2)))
names(missing2)[5] <- c("std")


ggplot(missing2,aes(x=east,y=north,colour=factor(area),size=std)) +
    geom_point() + 
    scale_colour_manual(values=c("firebrick3","navy","gold"),name="Sub-area") +
    scale_size(guide=guide_legend(title="Std. error ")) +
    theme(legend.position="right") +
    labs(title="Standard Error of Missing Data Estimates") +
    xlab("Eastern Soil Sample Location Coordinates") +
    ylab("Northern Soil Sample Location Coordinates")