Gaussian Process for modeling ultrasonic waves

Modeling the data with Gaussian Process

if(!require(rstan)) install.packages("rstan");require(rstan)

if(!require(coda)) install.packages("coda");require(coda) 

if(!require(ggplot2)) install.packages("ggplot2");require(ggplot2) 

if(!require(RCurl)) install.packages("RCurl");require(RCurl) 

Data

For each frame and wall, 10 ultrassonic waves were emitted and received in a total of 240 waves. Each wave contains six points of measurements.

##dataset from github respository
ur <- getURL("https://raw.githubusercontent.com/larebufc/External-influence-on-ultrasonic-waves-propagation-time-in-masonry-walls/main/dados_com_e_sem_vazios.csv")
dados <- read.csv(text = ur,h=T,sep=",")

Selecting \(m=231\) observed curves. Nine waves were not captured by the receiver in any distance and they are not in the dataset. One sequence appears with only 4 measurements instead of 6 and it is also omitted.

tira<-which(dados[,4]==0 & dados[,5]==0)
dados<-dados[-tira,]
tira_2 <- which(dados[,2]=="223") #this wave does not contain six points and is left out
dados<-dados[-tira_2,]
indiv<-as.numeric(dados[,2])
#
#
m=length(table(indiv))    #total of individuals
repl=table(indiv) # total of observations for each individual 
n=length(indiv)   # total of measurements

Factors (inputs) for the model

Covariates for fixed effects

X<-as.matrix(dados[,c(5,1,6)])
X<-cbind(rep(1,n),X)
X<-as.matrix(X)
#
parede<-dados[,1]
quadrante<-dados[,3]
######################
##dummies construction
######################
####################
tijolo_sb<-which(quadrante %in% c(2,3) & parede==2 | (quadrante %in% c(5,6,7,8,10) & parede==1))
tijolo_cb<-which(quadrante %in% c(1,5,6,10) & parede==2)
argamassa<-which((quadrante %in% c(1,2,3,11) & parede==1) | (quadrante %in% c(7,8) & parede==2))
interface<-which(quadrante %in% c(4,9,12) | (quadrante %in% c(11) & parede==2))
####################
####################
##condition of the walls at each propagation time measurement location.
categoria<-NULL
categoria[argamassa]<-"Argamassa"  ### measurement made on grout as reference 
categoria[interface]<-"Interface"  ### dummy indicating if the measurement is made on the brick–grout interface or not
categoria[tijolo_cb]<-"Tij_com_buraco" ###dummy indicating if the measurement is made on the brick with void or not
categoria[tijolo_sb]<-"Tij_sem_buraco" ###dummy indicating if the measurement is made on the brick without void or not
#####################
#####################
elem<-model.matrix(dados$Valor ~ factor(categoria))[,-1]
elem<-matrix(as.numeric(elem),ncol=3)
####################
DummiesCategory = categoria
####################
print(table(DummiesCategory))
DummiesCategory
     Argamassa      Interface Tij_com_buraco Tij_sem_buraco 
           360            414            240            366 
####################
Vt<-c(10,15,20,25,30,35) ##six distances
aux<-matrix(round(poly(Vt, degree=3),3),ncol=3) ##orthogonal polynomials for each individual
#
aux<-cbind(1,aux)
Z<-NULL
for (i in 1:m) Z<-rbind(Z,aux[1:repl[i],])
#
Xfinal<-cbind(X,elem,Z[,-1])
#
Y<-as.matrix(dados[,4],ncol=1) ##column vector for response variable
####################
## removing measurements at 35cm for fitting the model and using then for validating it
####################
tira<-which(Xfinal[,2]==35)
Xmod<-Xfinal[-tira,-2]
Ymod<-matrix(Y[-tira,],ncol=1)
Zmod<-Z[-tira,]
Data<-dados[-tira,]
#
Xvalid<-Xfinal[tira,-2]
Yvalid<-matrix(Y[tira,],ncol=1)
Zvalid<-Z[tira,]
Datavalid<-dados[tira,]
#
Amostramod<-NULL
for (i in 1:length(table(indiv))){
    Amostramod<-c(Amostramod,rep(i,repl[i]-1))}
####################

Fitting the Model

Stan code for estimating the Gaussian process

stanmodelcode=" data {
    int<lower=1> R; //total number of observations inside each wave
    int<lower=1> Rp; //total number of observations for training and validating data
    int<lower=1> Q; //total number of frames
    int<lower=1> m; // total number of individuals - waves
    int<lower=1> N; //total number of observations
    int idxs[m,R]; // indicates the position of each observation in a column vector
    int<lower=1, upper=Q> quad[m]; //frame index for each curve
    int<lower=1> p; // number of factors considered for fixed effects
    real temp[R];   // distances where each measurement was made
    real tempp[Rp];   // distances where each measurement was made for fitting and validating data 
    matrix[N,p] X;  //the model matrix
    matrix[m,p] Xvalid;  //the model matrix for the validating data
    matrix[m,R] yn; //y values
    vector[R] u; 
}
 
transformed data {
    real delta = 1e-10;
}

parameters {
    real<lower=0> sigma[Q];
    real<lower=0> length_scale[Q];
    real <lower=0>  sigma_y;
    vector[p] beta; // coefficients of the fixed effect
    vector[R] eta[m];
}
transformed parameters{ 
  vector[R] f[m];
  matrix[R, R] L_K[Q];
  matrix[R, R] K[Q];
  for (i in 1:Q){
    K[i]=cov_exp_quad(temp, sigma[i], length_scale[i]);
    for (t in 1:R){
      K[i][t, t]=K[i][t, t]+ delta;}
    L_K[i]=cholesky_decompose(K[i]);}
  for (i in 1:m){
    f[i] =  L_K[quad[i]] * eta[i];}  // the GP value for each observation
}

model {
 int n=0;
 real pred;
 1/sigma_y ~ gamma(0.1,0.1);
 to_vector(beta) ~ normal(0,1000);
 to_vector(sigma) ~ normal(0, 100);
 to_vector(length_scale) ~ inv_gamma(3, 5);
 for (i in 1:m)
  to_vector(eta[i]) ~ std_normal();  // simulating values from a standard normal distribution and after apply the Cholesky factorization to obtain to correct the var-cov information
  //likelihood
 for (i in 1:m){
    for (t in 1:R){
    n = n+1; 
    pred=f[i][t]+X[n,]*beta; 
    yn[i,t] ~ normal(pred, sqrt(sigma_y));
     }
  }
} 

generated quantities{
  int n=0;
  real pred;
  real Log; //likelihood values for checking global convergence
  matrix[Rp, Rp] Kp[Q];  //covariance matrix with the new observation
  row_vector[R] Kdi[Q];  
  row_vector[R] Kdj[Q];  
  vector[R] xbeta[m];  
  real Yest[m];
  Log=0;

  //forecasting for distance 35cm
  for (i in 1:Q){
    Kp[i] = cov_exp_quad(tempp, sigma[i], length_scale[i]);
    Kdi[i] = Kp[i][Rp,1:R];
    Kdj[i] = Kdi[i]*inverse_spd(K[i]+sigma_y*diag_matrix(u));
  }
  for(i in 1:m){
     xbeta[i] = X[idxs[i,],]*beta;
     Yest[i] = Kdj[quad[i]]*(to_vector(yn[i,])-xbeta[i])+Xvalid[i,]*beta;
    //likelihood for convergence checking
    for (t in 1:R){
        n = n+1; 
        pred=f[i][t]+X[n,]*beta; 
        Log =+ normal_lpdf(yn[i,t]|pred, sqrt(sigma_y));
     }
  }
}
 "
#

Run Stan code for estimate the parameters of Gaussian Process

Configuration for the arguments of Stan code
Chain= 1
Iter = 20000
Warmup = 5000 
Thin = 10
Seed = 1000
Fitting the model with Stan
### excluding some  predictor variables
out = c(2)

##########################################
### Configuration for running stan code for GP
##########################################

quadrante<-Data[,3]
quadrante[Data$parede_n==2]=quadrante[Data$parede_n==2] + 12
Q = length(unique(quadrante))
R = ncol(matrix(Ymod,nrow=m))
quad = matrix(quadrante, m, R, T)[,1]
N = nrow(Data)
idxs = matrix(1:N, nrow= m, ncol=R, byrow=T)
X = Xmod[,-out]
xv=Xvalid[,-out]
p = ncol(X)
temp=Data$D_m[1:R]
tempp=dados$D_m[1:(R+1)]
yn = matrix(Ymod, m, R, T) 
u = c(1,1,1,1,1)



### run the code
dat_list <- list(u=u,idxs=idxs,R=R,Rp=R+1, Q=Q, m=m, N = N, quad=quad, p=p, temp=temp, tempp=tempp, X=X, Xvalid=xv, yn=yn)
draw1 <- stan(model_code =stanmodelcode, 
             data = dat_list, iter=Iter, chains = Chain, thin = Thin, warmup = Warmup)

Convergence checking

Checking the convergence of the chain using the log-likelihood
cadeia<-as.numeric(extract(draw1, pars = "Log")$Log)
gewek<-geweke.diag(mcmc(cadeia))
TestConvergency = numeric()
TestConvergency["Zscore"] = gewek[1]
TestConvergency["Pvalue"] = data.frame(pnorm(as.numeric(abs(gewek$z[1])),lower.tail=FALSE)*2)

TestConvergency
$Zscore
     var1 
0.7649886 

$Pvalue
[1] 0.4442783
plot(mcmc(exp(cadeia)),type='l')

Estimation

Regression coefficients estimates (point and interval estimates – 90% credibility interval, CI)
cadeia<-data.frame(extract(draw1, pars = "beta")[[1]])

names(cadeia) = c("X0",names(cadeia)[-ncol(cadeia)])


###########################
## HPD interval
##########################

hpd <- apply(cadeia, 2, function(x){HPDinterval(mcmc(x),0.9)})
t(hpd)
         [,1]        [,2]
X0 105.537928 114.5117395
X1   0.639424  10.0565437
X2   4.893965  13.4196980
X3  -7.103278  -0.4240229
X4   8.349969  16.6383005
X5 131.048091 141.4668071
X6   1.775049  10.6878588
X7  -1.662266   4.3319527
###########################
## CI based on quantil
##########################


Quantiles = c(0.05,0.5,0.95)
Qnames = c("5%","50%","95%")

ICQM <- data.frame(Mean=apply(cadeia, 2, function(x){mean(x)}))
ICQM[Qnames] <- t(apply(cadeia, 2, function(x){quantile(x,Quantiles)}))

ICQM
         Mean          5%        50%         95%
X0 110.056003 105.5915360 110.012603 114.5919756
X1   5.465182   0.8369299   5.424662  10.3237226
X2   9.086923   4.9492988   9.094767  13.5877033
X3  -3.579473  -7.0119460  -3.524019  -0.2573997
X4  12.248403   8.1822547  12.218231  16.5424118
X5 136.383161 131.3582956 136.303303 141.9310137
X6   6.005032   1.5027496   6.043174  10.4503726
X7   1.216164  -1.8192202   1.236443   4.2148052

Final Analysis

Model fit without the non relevant coefficient X7 (cubic distance)

### excluding some  predictor variables
out = c(2,ncol(Xmod))

##########################################
### Configuration for running stan code for GP
##########################################
quadrante<-Data[,3]
quadrante[Data$parede_n==2]=quadrante[Data$parede_n==2] + 12
Q = length(unique(quadrante))
R = ncol(matrix(Ymod,nrow=m))
quad = matrix(quadrante, m, R, T)[,1]
N = nrow(Data)
idxs = matrix(1:N, nrow= m, ncol=R, byrow=T)
X = Xmod[,-out]
xv=Xvalid[,- out]
p = ncol(X)
temp=Data$D_m[1:R]
tempp=dados$D_m[1:(R+1)]
yn = matrix(Ymod, m, R, T) 
u = c(1,1,1,1,1) ## to construct the identity matrix

### run the code
dat_list <- list(u=u,idxs=idxs,R=R,Rp=R+1, Q=Q, m=m, N = N, quad=quad, p=p, temp=temp, tempp=tempp, X=X, Xvalid=xv, yn=yn)
draw2 <- stan(model_code =stanmodelcode, 
             data = dat_list, iter=Iter, chains = Chain, thin = Thin, warmup = Warmup)

Convergence checking

Regression coefficients estimates (point and interval estimates – 90% credibility inter-val)

cadeia<-data.frame(extract(draw2, pars = "beta")[[1]])
names(cadeia) = c("X0",names(cadeia)[-ncol(cadeia)])


###########################
## HPD interval
##########################
hpd <- apply(cadeia, 2, function(x){HPDinterval(mcmc(x),0.9)})
t(hpd)
         [,1]         [,2]
X0 104.764299 113.80006905
X1   0.911690  10.72446252
X2   4.841046  13.44606073
X3  -6.455084   0.05529099
X4   8.422481  16.86625079
X5 130.842926 139.86132521
X6   1.615908   7.14251547
###########################
## CI based on quantil
#########################
Quantiles = c(0.05,0.5,0.95)
Qnames = c("5%","50%","95%")

CI <- data.frame(Mean=apply(cadeia, 2, function(x){mean(x)}))
CI[Qnames] <- t(apply(cadeia, 2, function(x){quantile(x,Quantiles)}))

CI
         Mean          5%        50%         95%
X0 109.380269 104.8330214 109.397763 113.9658100
X1   5.715265   0.9676061   5.657403  10.8045372
X2   9.193683   4.8003479   9.254361  13.4236740
X3  -3.413327  -6.8191525  -3.328706  -0.1675722
X4  12.360297   8.1270395  12.404472  16.6070370
X5 135.242918 130.8443460 135.229079 139.8627699
X6   4.534878   1.7554744   4.560892   7.3610276
Barplot for coefficients of the fixed-effects
coeffs = c(3,4,5)
R=data.frame(Coefficients = rownames(CI)[coeffs],Value=CI[coeffs,1])


ggplot(R, aes(x = Coefficients)) +
  geom_col(aes(y = Value,), fill = "gray") +
  theme_minimal()  +  xlab('Coefficients in Gaussian Process') +
theme( plot.title=element_text(size=16),
         axis.text=element_text(size=18),
         axis.title=element_text(size=20))

Quality of prediction in the wall without void (Panel 0), and with void (Panel 1)

cadeia=data.frame(extract(draw2, pars = "Yest")[[1]])

dif =  numeric()
for(j in 1:m){
dif = cbind(dif, cadeia[,j] - Yvalid[j])}

dif=data.frame(dif)


EQM1=data.frame(MSE=sqrt(colMeans((dif)^2)))

EQM_P1 = EQM1[Xvalid[,2]==1,]
EQM_P1 = data.frame(Index = 1:length(EQM_P1),Panel=as.factor(rep(0,length(EQM_P1))),MSE=EQM_P1)
EQM_P2 = EQM1[Xvalid[,2]==2,]
EQM_P2 = data.frame(Index = 1:length(EQM_P2),Panel=as.factor(rep(1,length(EQM_P2))),MSE=EQM_P2)


EQM = rbind(EQM_P1,EQM_P2)



# Visualization
ggplot(EQM, aes(x = Index, y = MSE)) + 
  geom_line(aes(color = Panel, linetype = Panel)) + 
ylab("RMSE (Gaussian process)")+
 xlab("Individual Index")+
  scale_color_manual(values = c("gray", "black"))+ theme_minimal()+
theme( plot.title=element_text(size=16),
         axis.text=element_text(size=18),
         axis.title=element_text(size=20))

EQMtotal = mean(EQM[,3])

paste("EQM total:",round(EQMtotal,2))
[1] "EQM total: 82.87"
Plot for predicted and observed values
f <- extract(draw2, pars = "f")$f

Fcurv = as.data.frame(f)

seq = rep(1:230,5)

Fcur = data.frame(ID=seq,Value = rowMeans(t(Fcurv)))

 
or = order(Fcur[,1])
ypred = Fcur[or,2] + X%*%CI[,1]




ind=Data$Amostra

plt_df = data.frame(x = Data$D_m, ypred =ypred, Y=c(t(yn)), ind=ind, Panel = Data$parede_n   )

p <- ggplot(data = plt_df, aes(x = x, y = Y, group = ind,colour = 'Observed data'))

p + geom_line(aes(group=ind ),size=1)+
  labs(x = "Time", y = " Response (s) ")+
  geom_point(aes(colour = 'Observed data'), size=1) +  
  geom_line(data = plt_df, aes(x = x, y = ypred, colour = 'Predicted value'), size=1, alpha=0.6) +
  theme_bw() + theme(legend.position="bottom") +
  #scale_color_manual(name = '', values = c('Realized data'='black')) +
  xlab('Distance') +
  ylab('Propagation time') 
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Residual for Gaussian Process
Res = c(t(yn)) - ypred


datares = data.frame(Index = 1:N,Res = Res)

# Visualization
ggplot(datares, aes(x = Index, y = Res)) + 
geom_point(aes(), size=1) + 
ylab("Residual (GP)")+
 xlab("Index")+
  scale_color_manual(values = c("darkblue", "red"))+ theme_minimal()+
theme( plot.title=element_text(size=16),
         axis.text=element_text(size=18),
         axis.title=element_text(size=12))

Plot for predicted and observed values considering fitting and validating data
Panel_val = Xvalid[,2]


Forecasting<- data.frame(ind =1:m ,x=rep(35,m),ypred = colMeans(data.frame(extract(draw2, pars = "Yest")[[1]])),Y=Yvalid, Panel =Panel_val)

plt_df2 = rbind(plt_df,Forecasting) 


or = order(plt_df2$ind)
plt_df2 = plt_df2[or,]


p <- ggplot(data = plt_df2, aes(x = x, y = Y, group = ind,colour = 'Observed data'))


p + geom_line(aes(group=ind ),size=1)+ ylim(0, 1600)+
  geom_point(aes(colour = 'Observed data'), size=1) +  
  geom_line(data = plt_df2, aes(x = x, y = ypred, colour = 'Predicted value'), size=1, alpha=0.6) +
  theme_bw() + 
  #scale_color_manual(name = '', values = c('Realized data'='black')) +
  xlab('Distance') +
  ylab('Propagation time') +theme( plot.title=element_text(size=20),
                                   axis.text=element_text(size=10),
                                   axis.title=element_text(size=12),
                                   legend.position="bottom")

The same plot as before, this time considering just data from wall 0, without damage
plt_df3 = plt_df2[plt_df2$Panel==1,]

p <- ggplot(data = plt_df3, aes(x = x, y = Y, group = ind,colour = 'Observed data'))


p + geom_line(aes(group=ind ),size=1)+
  labs(x = "Time", y = " Response (s) ")+
  geom_point(aes(colour = 'Observed data'), size=1) +  
  geom_line(data = plt_df3, aes(x = x, y = ypred, colour = 'Predicted value'), size=1, alpha=0.6) +
  theme_bw() + theme(legend.position="bottom") +
  #scale_color_manual(name = '', values = c('Realized data'='black')) +
  xlab('Distance') +
  ylab('Propagation time') 

The same plot as before, this time considering just data from wall 1, with voids inside
plt_df3 = plt_df2[plt_df2$Panel==2,]

p <- ggplot(data = plt_df3, aes(x = x, y = Y, group = ind,colour = 'Observed data'))


p + geom_line(aes(group=ind ),size=1)+
  labs(x = "Time", y = " Response (s) ")+
  geom_point(aes(colour = 'Observed data'), size=1) +  
  geom_line(data = plt_df3, aes(x = x, y = ypred, colour = 'Predicted value'), size=1, alpha=0.6) +
  theme_bw() + theme(legend.position="bottom") +
  #scale_color_manual(name = '', values = c('Realized data'='black')) +
  xlab('Distance') +
  ylab('Propagation time') 

Checking the convergence of all chains using the log-likelihood
cadeia<-as.numeric(extract(draw2, pars = "Log")$Log)
gewek<-geweke.diag(mcmc(cadeia))
TestConvergency = numeric()
TestConvergency["Zscore"] = gewek[1]
TestConvergency["Pvalue"] = data.frame(pnorm(as.numeric(abs(gewek$z[1])),lower.tail=FALSE)*2)

TestConvergency
$Zscore
      var1 
-0.2700179 

$Pvalue
[1] 0.7871465
plot(mcmc(exp(cadeia)),type='l')