This notebook contains R code for the Monte Carlo (MC) simulation and for the real-data analysis of the YRBSS data set, used to evaluate the proposed hierarchical generalized linear modelling (HGLM) approach to small area estimation.
MC simulation to compare HGLM and GLMM results
## Packages needed by this chunk: Matrix (Diagonal), corpcor (pseudoinverse), lme4 (glmer)
library(Matrix); library(corpcor); library(lme4)
## Function to get the trace of a matrix
func_trace <- function(X){
n <- dim(X)[1]
tr <- 0 ### initialize trace
for (j in 1:n){
k <- X[j,j]
tr <- tr + k
}
return(tr[[1]])
}
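## Quick check (illustrative): the loop above reproduces base R's sum(diag(.))
## on any square matrix; A_demo below is a hypothetical example.
# A_demo <- matrix(1:9, nrow = 3)
# func_trace(A_demo)   # 15
# sum(diag(A_demo))    # 15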
# Function to compute inv(J)
func_Jinv <- function(X,Z,W,G_inv){
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
J_inv <- solve(J)
return(J_inv)
}
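## The Jacobian assembled above has the 2x2 block form
##   J = [ X'WX    X'WZ        ]
##       [ Z'WX    Z'WZ + G^-1 ],
## so J and J_inv are (p+m) x (p+m). A toy sanity check (hypothetical
## dimensions: p = 2 fixed effects, m = 3 areas, N = 6 observations):
# X_demo <- cbind(1, rnorm(6)); Z_demo <- kronecker(diag(3), rep(1, 2))
# W_demo <- diag(rep(0.25, 6)); Ginv_demo <- diag(3)/0.1
# dim(func_Jinv(X_demo, Z_demo, W_demo, Ginv_demo)) # 5 5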
## Simulate a discrete fixed-effects column with j levels (uses globals m and n)
sim_x <- function(j){
X <- c()
if(j==2){
for (i in 1:m){
Xsim <- as.matrix(sample(c(1,0),n,replace=TRUE))
X <- rbind(X, Xsim)
}
}else if(j==3){
for (i in 1:m){
Xsim <- as.matrix(sample(c("XC1","XC2","XC3"),n,replace=TRUE))
X <- rbind(X, Xsim)
}
}
return(X)
}
########### ---------------------------------------------------------#############
########### Simulate data - START #############
########### ---------------------------------------------------------#############
# areas <- c(5,10,20,30) # Select number of small areas
# sample_size <- c(30,50,100,500) # area wise sample size
areas <- c(5) # Select number of small areas
sample_size <- c(30,50,100,500) # area wise sample size
nsim <- 1000
# set.seed(123)
sim_seed <- seq(1000,1999,1)
theta_sim <- 0.1 # theta to create simulated data
beta_sim <- c(-1.5,1.3,1.5)
beta_sim <- as.matrix(beta_sim)
all_iters <- names_iter <- run_time_all <- u_sim_ls3 <- list()
final_ests <- list()
for(q in 1:length(areas)){
# q <- 4
m <- areas[q]
u_sim <- as.matrix(rnorm(m,0,sd=sqrt(theta_sim)))
conv_beta <- beta_sim; conv_u <- u_sim; conv_theta <- theta_sim; conv_iter <- 0;
mean_beta_hglm <- data.frame(True=beta_sim); mean_u_hglm <- data.frame(True=u_sim);
mean_theta_hglm <- data.frame(True=theta_sim); mean_beta_glmm <- data.frame(True=beta_sim);
mean_u_glmm <- data.frame(True=u_sim); mean_theta_glmm <- data.frame(True=theta_sim)
MSE_glmm <- list()
glmm_nameB <- hglm_nameB <- glmm_nameU <- hglm_nameU <- glmm_nameT <- hglm_nameT <- "true"
MSE_temp <- c(); name_MSE <- c();conv_name <- c()
conv_iter_all <- MSE_glmm0 <- u_sim_ls2 <- list()
run_time_s <- c()
for(s in 1:length(sample_size)){
# s <- 4
all_beta_sim <- data.frame(True=beta_sim); all_u_sim <- data.frame(True=u_sim);
all_theta_sim <- data.frame(True=theta_sim)
all_beta_glmm <- all_u_glmm <- all_theta_glmm <- u_sim_ls <- list()
err_all_sims <- namesErr <- conv_iter <- c()
SSE_beta_glmm <- rep(0,length(beta_sim)); SSE_u_glmm <- rep(0,m);
SSE_theta_glmm <- rep(0,length(theta_sim))
start_time <- Sys.time()
for(r in 1:nsim){
# r <- 1
set.seed(sim_seed[r])
n <- sample_size[s] # Areawise sample size
grps <- c(2,2) # two discrete covariates; each entry is passed to sim_x() as the number of levels
j <- 0 # Select the number of fixed effects (continuous vars)
theta0 <- 0.1
N <- n*m # Total sample size
# create county variable
county <- c()
for (i in 1:m){
cnty <- as.matrix(rep(c(i),n))
county <- rbind(county, cnty)
}
# create each discrete var and combine to a df
fixed_eff_ds <- data.frame(county)
for(k in 1:length(grps)){
tempX <- sim_x(grps[k]) # call func and create dis fixed effects data
colnames(tempX) <- paste0("X",k)
fixed_eff_ds <- cbind.data.frame(fixed_eff_ds,tempX)
}
fixed_eff_ct <- cbind.data.frame(XX1=log(rnorm(N,mean=60,sd=3)),
XX2=log(rnorm(N,mean=80000,sd=30)))
char_var <- colnames(fixed_eff_ds) # col names of fixed effects - discrete
vars_ds <- char_var[1:(length(grps)+1)] # No of fixed effects(discrete) to consider for modelling
cnt_var <- colnames(fixed_eff_ct) # col names of fixed effects - continuous
if(j!=0){ # No of fixed effects (continuous) for modelling
vars_ct <- cnt_var[1:j]
}else{
vars_ct <- NULL
}
df_sim <- cbind.data.frame(fixed_eff_ds[,colnames(fixed_eff_ds)%in%vars_ds],
fixed_eff_ct[,colnames(fixed_eff_ct)%in%vars_ct])
df_sim <- df_sim[order(df_sim$county), ]
colnames(df_sim) <- c(colnames(fixed_eff_ds)[which(colnames(fixed_eff_ds)%in%vars_ds)],
colnames(fixed_eff_ct)[which(colnames(fixed_eff_ct)%in%vars_ct)])
table(df_sim$county,df_sim$X1)
table(df_sim$county,df_sim$X2)
## Simulated random effects given theta
# theta_sim <- 1
# u_sim <- as.matrix(rnorm(m,0,sd=sqrt(theta_sim)))
# u_sim <- as.matrix(qnorm(runif(m, min=pnorm(-2, sd=sqrt(theta_sim)),
# max=pnorm(2, sd=sqrt(theta_sim))),
# sd=sqrt(theta_sim)))
u_sim0 <- cbind.data.frame(county=unique(df_sim$county),u_sim)
# Merge simulated fixed and random effects data
data_all <- merge(df_sim,u_sim0,by.x="county",by.y="county")
data_all <- data_all[order(data_all$county), ]
# # fixed effect - design matrix
Xvars_ds <- vars_ds[!vars_ds %in% "county"]
if(length(vars_ct) !=0){
dm_form <- as.formula(paste('~',paste('as.factor(',Xvars_ds,')', collapse = "+"),"+",paste(vars_ct, collapse = "+")))
}else if(length(vars_ct)==0){
dm_form <- as.formula(paste('~',paste('as.factor(',Xvars_ds,')', collapse = "+")))
}
X <- model.matrix(dm_form,data=data_all)
##### Assign initial betas
# beta_sim <- runif(ncol(X),min=0.5,max=2) # beta for data simulation
# beta_sim <- runif(ncol(X),min=-1,max=2)
# beta_sim <- c(1.5,1.8,-2,1.2,-1.5)
beta_sim <- as.matrix(beta_sim[1:ncol(X)])
u <- as.matrix(data_all$u_sim)
# Calculate binomial p
p0 <- exp(X%*%beta_sim+u)/(1+exp(X%*%beta_sim+u))
y <- rbinom(N,1,p0)
table(y)
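## Note (equivalent form): p0 above is the inverse logit of the linear predictor,
## so it can also be written with base R's plogis(), which is numerically stabler:
# p0 <- plogis(X%*%beta_sim + u)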
data <- cbind.data.frame(data_all,y)
table(data$y, data$county)
### Define matrices and values for hglm modelling
data <- data[order(data$county),]
X <- X
Z <- model.matrix(~0+as.factor(data$county))
y <- data[,"y"]
m <- m
N <- n*m
p <- ncol(X)
## Fit logit model to get initial fixed effects parameters
all_vars <- c(Xvars_ds,vars_ct)
glm_formula <- as.formula(paste("y",'~',paste(all_vars, collapse = "+")))
# glm_fit <- glm(y~as.factor(X3)+as.factor(X2)+as.factor(X1),family = binomial(link=logit),data=data)
glm_fit <- glm(formula = glm_formula,family = binomial(link=logit),data=data)
# Initial parameters
beta0 <- as.numeric(glm_fit$coefficients)
u0 <- as.matrix(runif(m,(u_sim-0.1),(u_sim+0.1)))
########### ---------------------------------------------------------#############
########### Simulate data - END #############
########### ---------------------------------------------------------#############
########### Parameter Estimation - START #############
########### ---------------------------------------------------------#############
beta_new <- beta0
u_new <- u0
theta_new <- theta0
beta_new_all <- data.frame(True=beta_sim) ;
theta_new_FS <- data.frame(True=theta_sim) ;
theta_new_all <- data.frame(True=theta_sim);
u_new_all <- data.frame(True=u_sim);
theta_all <- data.frame(True=theta_new)
delta_final <- data.frame(True="NA")
delta <- NULL ;
iter <- 0; iter_all <- "iter0"
repeat{
beta <- as.vector(beta_new)
u <- as.vector(u_new)
theta <- theta_new
BU_old <- rbind(as.matrix(beta),as.matrix(u))
theta_inv <- solve(theta)
G_inv <- kronecker(theta_inv, diag(m))
# G_inv
P <- 1/(1+exp(-(X%*%beta+Z%*%u)))
W <- diag(as.vector(P*(1-P)))
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
J_inv <- solve(J)
D1h_B <- t(X)%*%(y-P)
D1h_u <- t(Z)%*%(y-P)- G_inv%*%u
S <- as.vector(rbind(D1h_B, D1h_u)) # Gradient (score function) vector
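## The step below is a Newton-Raphson / Fisher-scoring update:
##   (beta,u)_new = (beta,u)_old + J^{-1} S,
## where S stacks the scores dh/dbeta = X'(y-P) and dh/du = Z'(y-P) - G^{-1}u.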
# Fisher scoring algorithm
BU_new <- BU_old + (J_inv%*%S)
beta_new <- as.vector(BU_new[1:length(beta)])
u_new <- as.vector(BU_new[(length(beta)+1):length(BU_new)])
# convergence_beta <- abs(beta_new[-1]-beta[-1])
convergence_beta <- abs(beta_new-beta)
convergence_u <- abs(u_new-u)
max(convergence_beta)
max(convergence_u)
beta_new_all <- cbind.data.frame(beta_new_all,value=round(beta_new,6))
u_new_all <- cbind.data.frame(u_new_all,value=round(u_new,6))
# iter <- c(iter0,iter)
rm(P); rm(W); rm(J);
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# #
# Bias Correction #
# #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# Bias Corr step 1: update J with estimated beta and u
P <- 1/(1+exp(-(X%*%beta_new+Z%*%u_new)))
pp <- as.vector(P*(1-P))
W <- Diagonal(x=pp) # sparse diagonal weight matrix (Matrix package)
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
Jhat_Inv <- pseudoinverse(J)
Tau <- Jhat_Inv[(p+1):(p+m),(p+1):(p+m)]
Taut <- diag(Tau)
zeta <- theta/(theta+Taut)
corr_u <- zeta*u_new
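## Shrinkage interpretation: zeta_i = theta/(theta + tau_ii), where tau_ii is
## the i-th diagonal of the random-effects block of J^{-1}; each u_i is pulled
## toward 0 before theta is re-estimated, which is the bias-correction step.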
rm(P);rm(W); rm(J);
# Use corrected u to estimate theta
########### ---------------------------------------------------------#############
########### Estimate theta=sigma^2 using Dh/Dtheta=0 #############
########### ---------------------------------------------------------#############
## Using SAE6 to estimate theta
theta_inv <- solve(theta)
G_inv <- kronecker(theta_inv, diag(m))
P <- 1/(1+exp(-(X%*%beta_new+Z%*%corr_u)))
W <- diag(as.vector(P*(1-P)))
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
J_inv <- solve(J)
###Estimate theta
J_inv22 <- J_inv[(p+1):(p+m),(p+1):(p+m)]
theta_new <- 1/m*(t(u_new)%*%u_new)+1/m*func_trace(J_inv22)
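## i.e. theta_new = (1/m) * ( u'u + tr(J^{-1}_22) ), the variance-component
## update obtained from setting dh/dtheta = 0.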
theta_new_all <- cbind.data.frame(theta_new_all,value=theta_new)
convergence_theta <- abs(theta_new-theta)
delta <- max(convergence_beta,convergence_theta)
delta_final <- cbind.data.frame(delta_final,value=delta)
est_out <- rbind.data.frame(beta_new_all,u_new_all,theta_new_all,value=delta_final)
rownames(est_out) <- c(colnames(X),paste0("u_",unique(df_sim$county)),"theta","delta")
## rbind only beta, u and theta estimated values
est_pars <- rbind.data.frame(beta_new_all,u_new_all,theta_new_all)
iter <- iter + 1
iter_all <- c(iter_all,paste0("iter",iter))
if(delta <= 1e-5)break
}
colnames(u_new_all) <- iter_all
u_sim_ls[[r]] <- u_new_all
conv_iter <- c(conv_iter,iter)
conv_iter_all[[s]] <- conv_iter
SqErr_beta <- (beta_sim-beta_new)^2
SqErr_u <- (u_sim-u_new)^2
SqErr_theta <- (theta_sim-theta_new)^2
err_all <- rbind.data.frame(SqErr_beta,SqErr_u,SqErr_theta)
err_all_sims <- c(err_all_sims,err_all)
err_all_sims <- data.frame(err_all_sims)
rownames(err_all_sims) <- c(colnames(X),paste0("u_",unique(df_sim$county)),"theta")
namesErr <- c(namesErr,paste0("Sq_Err_",r,"_sims"))
# print(paste0("Small area with sample size: ",n, " & No of sims: ",r))
beta_new <- data.frame(beta_new)
u_new <- data.frame(u_new)
theta_new <- data.frame(theta_new)
colnames(beta_new) <- paste0("sim_beta_",r,"_sims")
colnames(u_new) <- paste0("sim_u_",r,"_sims")
colnames(theta_new) <- paste0("sim_theta_",r,"_sims")
all_beta_sim <- cbind.data.frame(all_beta_sim,beta_new)
all_u_sim <- cbind.data.frame(all_u_sim, u_new)
all_theta_sim <- cbind.data.frame(all_theta_sim, theta_new)
## Fit the comparison GLMM (used later with the poststratification method)
glmm_sim <- glmer(y ~ as.factor(X1) + as.factor(X2) + (1|county),
data = data, family = binomial, weights=NULL,
control = glmerControl(optimizer = "bobyqa"), nAGQ = 10)
beta_glmm <- glmm_sim@beta
u_glmm <- glmm_sim@u
theta_glmm <- glmm_sim@theta
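## Note (assumption about lme4 internals): glmm_sim@u holds the random effects
## on lme4's spherical (unit-variance) scale, and glmm_sim@theta is the relative
## standard deviation, not a variance. A hedged alternative extraction for a
## like-for-like comparison with u_sim and theta_sim would be:
# u_glmm <- ranef(glmm_sim)$county[,1]   # conditional modes, natural scale
# theta_glmm <- glmm_sim@theta^2         # variance scale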
## Sum squared error of glmm estimates
SSE_beta_glmm <- SSE_beta_glmm + (beta_sim-beta_glmm)^2
SSE_u_glmm <- SSE_u_glmm + (u_sim-u_glmm)^2
SSE_theta_glmm <- SSE_theta_glmm + (theta_sim-theta_glmm)^2
SSE_glmm <- rbind(SSE_beta_glmm,SSE_u_glmm,SSE_theta_glmm)
row.names(SSE_glmm) <- c(colnames(X),paste0("glmm_u_",unique(df_sim$county)),"theta")
all_beta_glmm[[r]] <- beta_glmm
all_u_glmm[[r]] <- u_glmm
all_theta_glmm[[r]] <- theta_glmm
}
u_sim_ls2[[s]] <- u_sim_ls
colnames(err_all_sims) <- namesErr
MSE <- rowSums(err_all_sims)/nsim
MSE_temp <- cbind(MSE_temp,MSE)
name_MSE <- c(name_MSE,paste0("MSE_",areas[q],"_",sample_size[s]))
mean_beta_hglm <- cbind.data.frame(mean_beta_hglm,rowSums(all_beta_sim[,2:ncol(all_beta_sim)])/nsim)
mean_u_hglm <- cbind.data.frame(mean_u_hglm,rowSums(all_u_sim[,2:ncol(all_u_sim)])/nsim)
mean_theta_hglm <- cbind.data.frame(mean_theta_hglm,rowSums(all_theta_sim[,2:ncol(all_theta_sim)])/nsim)
glmm_nameB <- c(glmm_nameB,paste0("glmm_mean_beta_",areas[q],"_",sample_size[s]))
glmm_nameU <- c(glmm_nameU,paste0("glmm_mean_u_",areas[q],"_",sample_size[s]))
glmm_nameT <- c(glmm_nameT,paste0("glmm_mean_theta_",areas[q],"_",sample_size[s]))
hglm_nameB <- c(hglm_nameB,paste0("hglm_mean_beta_",areas[q],"_",sample_size[s]))
hglm_nameU <- c(hglm_nameU,paste0("hglm_mean_u_",areas[q],"_",sample_size[s]))
hglm_nameT <- c(hglm_nameT,paste0("hglm_mean_theta_",areas[q],"_",sample_size[s]))
### MSE and estimated parameters using glmm
MSE_beta_glmm0 <- SSE_beta_glmm/nsim
MSE_u_glmm0 <- SSE_u_glmm/nsim
# MSE_glmm0 <- SSE_glmm/nsim
MSE_glmm0[[s]] <- SSE_glmm/nsim
all_beta_glmm <- do.call(cbind,all_beta_glmm)
all_u_glmm <- do.call(cbind,all_u_glmm)
all_theta_glmm <- do.call(cbind,all_theta_glmm)
mean_beta_glmm <- cbind.data.frame(mean_beta_glmm, rowSums(all_beta_glmm)/nsim)
mean_u_glmm <- cbind.data.frame(mean_u_glmm, rowSums(all_u_glmm)/nsim)
mean_theta_glmm <- cbind.data.frame(mean_theta_glmm, rowSums(all_theta_glmm)/nsim)
End_time <- Sys.time()
run_time <- End_time-start_time
run_time_s <- cbind(run_time_s,run_time)
run_time_s <- data.frame(run_time_s)
# cat(paste0("\n ",s," areas Finish ! \n"))
conv_name <- c(conv_name,paste0("m_",areas[q],"n_",sample_size[s]))
}
u_sim_ls3[[q]] <- u_sim_ls2
run_time_all[[q]] <- run_time_s
conv_iter_df <- do.call(cbind.data.frame,conv_iter_all)
# colnames(conv_iter_df) <- conv_name
all_iters[[q]] <- conv_iter_df
names_iter[[q]] <- conv_name
MSE_all <- data.frame(MSE_temp)
colnames(MSE_all) <- name_MSE
colnames(mean_beta_hglm) <- hglm_nameB
colnames(mean_u_hglm) <- hglm_nameU
colnames(mean_theta_hglm) <- hglm_nameT
MSE_glmm <- do.call(cbind,MSE_glmm0)
colnames(MSE_glmm) <- name_MSE
colnames(mean_beta_glmm) <- glmm_nameB
colnames(mean_u_glmm) <- glmm_nameU
colnames(mean_theta_glmm) <- glmm_nameT
MSE_u_df <- MSE_all[(ncol(X)+1):((ncol(X)+1)+length(u_sim)-1),]
MSE_u <- colSums(MSE_u_df)/length(u_sim)
MSE_hglm <- rbind.data.frame(MSE_all,MSE_final=MSE_u)
MSE_hglm <- MSE_hglm[-nrow(MSE_hglm), ]
rownames(mean_beta_hglm) <- c(colnames(X))
rownames(mean_u_hglm) <- c(paste0("u_",unique(df_sim$county)))
rownames(mean_theta_hglm) <- c("theta")
beta_glmm_hglm <- data.frame(mean_beta_hglm,mean_beta_glmm)
u_glmm_hglm <- data.frame(mean_u_hglm,mean_u_glmm)
theta_glmm_hglm <- data.frame(mean_theta_hglm,mean_theta_glmm)
MSE_glmm_hglm <- data.frame(MSE_glmm, MSE_hglm)
# write.csv(beta_glmm_hglm,paste0(path,"/out/mean_beta_hglm"),row.names = FALSE)
print(beta_glmm_hglm)
print(mean_u_hglm)
print(mean_theta_hglm)
print(MSE_hglm)
print(mean_beta_glmm)
print(mean_u_glmm)
print(mean_theta_glmm)
print(MSE_glmm)
print(u_sim_ls3)
final_ests[[q]] <- list(beta_glmm_hglm,u_glmm_hglm,theta_glmm_hglm,MSE_glmm_hglm)
}
all_iters_df <- do.call(cbind.data.frame,all_iters)
iter_names <- unlist(names_iter)
colnames(all_iters_df) <- iter_names
run_time_final <- do.call(cbind.data.frame,run_time_all)
colnames(run_time_final) <- iter_names
save(final_ests,beta_glmm_hglm,mean_beta_glmm,mean_u_hglm,mean_theta_hglm,MSE_hglm,
mean_beta_glmm, mean_u_glmm, mean_theta_glmm, MSE_glmm,
u_sim_ls3,all_iters_df,run_time_final, file="/work/dai/niroshapr/HGLM/out_BiasC/est_all_sim.RData")
# colx <- df
f_lvls <- function(colx,x){
colx <- colx[,paste0(x)]
lvls=unique(colx)
return(lvls)
}
### Create groups combining levels
new_grp <- function(df,x){
# df <- df_sub; x <- "age"
df <- df[order(df$county), ]
varx <- df[, paste0(x)]
varx <- ifelse(varx %in% c(1:n1),"1",
ifelse(varx %in% c((n1+1):n2), "2","3"))
df <- df[,!colnames(df)%in%paste0(x)]
cnames <- colnames(df)
df <- data.frame(df, varx)
colnames(df) <- c(cnames,paste0(x))
return(df)
}
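## Example (hypothetical data frame): with the globals n1 = 3 and n2 = 5 set,
## ages 1:3 are recoded to "1", 4:5 to "2", and 6:7 to "3":
# demo <- data.frame(county = 1, age = 1:7)
# new_grp(demo, "age")$age  # "1" "1" "1" "2" "2" "3" "3"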
# yrbss_dat
## Grouping the response variable, 1=ecig_no, 2=ecig_yes
# df <- df_sub or df <- data, resp
df_y <- function(df,y){
temp <- df[, y]
temp <- ifelse(temp==1,0,ifelse(temp==2,1,temp))
temp <- as.integer(temp)
temp <- data.frame(temp)
colnames(temp) <- y
df <- data.frame(temp,df)
df <- df[,!colnames(df) %in% paste0(y,".1")]
return(df)
}
# This function creates the model formula by combining the
# discrete and continuous variables; the fixed-effects part
# is the same for the glmm and hglm algorithms
my.form <- function(dis_vars,cont_vars){
fac_terms <- paste0("as.factor(",dis_vars,")")
# cont_vars may be empty; paste(collapse = "+") avoids a trailing "+"
all_terms <- c(fac_terms,cont_vars)
cont_fomular <- paste0(resp,"~",paste(all_terms, collapse = "+"))
return(cont_fomular)
}
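## Example (illustrative; resp is a global set later in the script):
# resp <- "y"
# my.form(c("age","sex"), "povt_rate")
# # "y~as.factor(age)+as.factor(sex)+povt_rate"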
## Levels of each discrete var
ls_func <- function(dis_vars){
ls_disc <- list()
for(i in 1:length(dis_vars)){
# i <- 3
dis_vars <- dis_vars[order(dis_vars)]
v <- dis_vars[i]
lvls <- f_lvls(data,paste0(v))
lvls <- lvls[!is.na(lvls)]
lvls <- lvls[order(lvls)]
X <- c(noquote("Levels of"),v,as.vector(lvls))
ls_disc[[i]] <- X
}
return(ls_disc)
}
## Function to get the trace of a matrix
func_trace <- function(X){
n <- dim(X)[1]
tr <- 0 ### initialize trace
for (j in 1:n){
k <- X[j,j]
tr <- tr + k
}
return(tr[[1]])
}
plot_func <- function(df,us_state){
# us_state <- usmap::us_map(regions = "counties")
us_state$fips <- as.numeric(us_state$fips)
df$county <- as.numeric(df$county)
colnames(df) <- ifelse(colnames(df)%in%"county","fips",colnames(df))
data_all0 <- left_join(us_state,df,by="fips")
# get the center location for each state
if(plot_by_year=="YES"){
plot_data<- list()
for(j in 1:length(yr)){
# j <- 1
data_all <- data_all0[data_all0$year==yr[j],]
df_state <- data_all %>% group_by(abbr) %>% summarise(mlong=mean(long),mlat=mean(lat))
no_classes <- 7
labels <- c()
quantiles <- quantile(data_all$pred_cnty_prev,
probs = seq(0, 1, length.out = no_classes + 1),na.rm=TRUE)
# Custom labels, rounding
labels <- c()
for(idx in 1:length(quantiles)){
labels <- c(labels, paste0(round(quantiles[idx], 2),
"_",
round(quantiles[idx + 1], 2)))
}
# Drop the dangling last label (paired with quantiles[idx + 1] = NA above)
labels <- labels[-length(labels)]
# Create new variable for fill
data_all$prev_quant <- cut(data_all$pred_cnty_prev,
breaks = quantiles,
labels = labels,
include.lowest = T)
plot_data[[j]] <- data_all
}
}else if(plot_by_year=="NO"){
data_all <- data_all0 %>% dplyr::select(long,lat,fips,year,pred_cnty_prev) %>%
group_by(fips) %>% summarize(AvgPrev=mean(pred_cnty_prev))
data_all <- data.frame(data_all)
data_all1 <- data_all0[,!colnames(data_all0)%in% c("year","pred_cnty_prev","logit_prob","true_prevalence")]
data_all1 <- unique(data_all1)
data_all2 <- merge(data_all1,data_all, by="fips")
df_state <- data_all2 %>% group_by(abbr) %>% summarise(mlong=mean(long),mlat=mean(lat))
no_classes <- 7
labels <- c()
quantiles <- quantile(data_all2$AvgPrev,
probs = seq(0, 1, length.out = no_classes + 1),na.rm=TRUE)
# Custom labels, rounding
labels <- c()
for(idx in 1:length(quantiles)){
labels <- c(labels, paste0(round(quantiles[idx], 2),
"_",
round(quantiles[idx + 1], 2)))
}
# Drop the dangling last label (paired with quantiles[idx + 1] = NA above)
labels <- labels[-length(labels)]
# Create new variable for fill
data_all2$prev_quant <- cut(data_all2$AvgPrev,
breaks = quantiles,
labels = labels,
include.lowest = T)
plot_data <- data_all2
}
return(plot_data)
}
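## The quantile binning used above, on toy data (illustrative): cut values
## into 7 quantile classes with readable "lo_hi" labels.
# x_demo <- runif(100)
# q_demo <- quantile(x_demo, probs = seq(0, 1, length.out = 8), na.rm = TRUE)
# lab_demo <- paste0(round(q_demo[-8], 2), "_", round(q_demo[-1], 2))
# table(cut(x_demo, breaks = q_demo, labels = lab_demo, include.lowest = TRUE))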
resp_title <- function(resp){
if(resp=="ecig_ever"){
tit_y <- "Ever Ecig"
}else if(resp=="ecig_current"){
tit_y <- "Current Ecig"
}else if(resp=="smoke_current"){
tit_y <- "Current Smoke"
}else if(resp=="smoke_ever"){
tit_y <- "Ever Smoke"
}
return(tit_y)
}
# get the center location for each state
mean_log_lat <- function(df){
# df <- pred_prev_cnty
us_state <- usmap::us_map(regions = "counties")
colnames(us_state) <- c("long","lat","order","hole","piece","group","fips","abbr","full","county")
us_state$fips <- as.numeric(us_state$fips)
colnames(df) <- ifelse(colnames(df)%in%"county","fips",colnames(df))
data_all0 <- left_join(us_state,df,by="fips")
if(plot_by_year=="YES"){
df_state <- data_all %>% group_by(abbr) %>% summarise(mlong=mean(long),mlat=mean(lat))
}else{
df_state <- data_all0 %>% group_by(abbr) %>% summarise(mlong=mean(long),mlat=mean(lat))
}
# data_all <- data_all0[data_all0$year==yr[1],]
return(df_state)
}
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# #
# Call packages #
# #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
rm(list=ls()) # clear the workspace; helper functions are re-loaded below via source()
packages <- c("sae", "data.table","readxl", "dplyr", "gmodels",
"pdftools", "reshape2","broom","tidyr","BayesSAE","corpcor",
"fastDummies","maps","ggmap","geosphere","sqldf","lme4",
"usmap","Matrix") # usmap and Matrix are also used below
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if(length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
ipak(packages)
dir_path <- getwd()
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# #
# Read Data #
# #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
source(paste0(dir_path,"/C1_Census_DataPrep.R"))
source(paste0(dir_path,"/hglm_glmm_integrated_funcV2.R"))
data <- data.frame(data.table::fread(paste0(dir_path,"/data/YRBSSNew.csv")))
df <- data
df <- df[order(df$county), ]
resp <- "ecig_ever" # ecig_current, ecig_ever, smoke_current, smoke_ever
dis_vars <- c("age","sex","race4","year") ### consider year later
cont_vars <- c("povt_rate")
wght <- "wt1"
re <- "county"
model.type <- "hglm"
save.out <- "YES"
data_change <- "NO" ## If YES, need to calculate distances for adjacent counties
fit.model <- "YES"
plot_by_year <- "NO"
save.plot <- "YES"
## Age levels 1:3=1, 4:5=2, & 6:7=3
n1 <- 3; n2 <- 5
# after re-joining levels, enter how many levels in each dis var
DX0 <- expand.grid(age=c("1","2","3"),sex=c("1","2"), race4=c("1","2","3","4"), year=c("1","2"))
out.path <- paste0(dir_path,"/out/",resp,"/")
dis_vars <- dis_vars[order(dis_vars)]
dis_vars
if("year" %in% colnames(df)){
df$year <- as.numeric(df$year)
df$year <- ifelse(df$year=="2015",1,
ifelse(df$year=="2017",2,df$year))
}
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# #
# #
# Data Prep #
# #
# #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
if("FIPS" %in% colnames(df)){
colnames(df) <- "county"
}else{
colnames(df) <- colnames(df)
}
#** Filter variables from original data set
df_sub <- data.frame(df[, colnames(df) %in% c(resp,re,dis_vars,cont_vars,wght)])
wt <- data.frame(df[, colnames(df) %in% c(wght)])
colnames(wt) <- "wt"
df_sub <- df_sub[complete.cases(df_sub), ]
#** Levels of age variable
lvls <- f_lvls(df_sub,"age")
lvls
#** Create age grouping
df_sub <- new_grp(df_sub,"age")
#** group response variable using function
df_sub <- df_y(df=df_sub,y=resp) #
yrbss_df <- df_sub ## obs , this df will be used in area est modelling section
#** with new grouping for age
print("Freq table with new grouping: ")
table(yrbss_df$age)
table(yrbss_df$year)
table(yrbss_df$sex)
table(yrbss_df$race4)
# ******************************************************************************* #
# #
# Calculate true prevalence #
# #
# ******************************************************************************* #
# Calculate county wise prevalence for each year and combine dfs
yr <- unique(df_sub$year)
#** df_sub0 <- df_sub
true_prev1 <- list()
for(i in 1:length(yr)){
# i <- 2
df_sub0 <- df_sub[df_sub$year%in%yr[i], ]
myFormula <- sprintf("county~%s", resp)
true_prev0 <- sqldf::sqldf(paste0("select * from df_sub0 where ",resp," is not null")) %>%
dcast(as.formula(myFormula)) %>% # total # of records for each county
gather(variable, value, -county) %>% ##
group_by(county) %>%
mutate(percentage = value/sum(value)) %>%
# select(-value) %>%
spread(variable, percentage)
true_prev0 <- data.frame(true_prev0)
true_prev0 <- data.frame(true_prev0, logit_prob=log(true_prev0$X1/(1-true_prev0$X1)))
true_prev0 <- true_prev0[!is.na(true_prev0$logit_prob), ]
true_prev1[[i]] <- data.frame(true_prev0, year=yr[i])
rm(true_prev0)
}
true_prev <- do.call(rbind,true_prev1)
head(true_prev)
#** Remove -Inf logit values (counties with zero observed prevalence)
if(any(true_prev$logit_prob == -Inf)){
rem_cnty <- as.numeric(true_prev$county[which(true_prev$logit_prob == -Inf)])
true_prev <- true_prev[true_prev$logit_prob != -Inf,]
}else{
true_prev <- true_prev[!is.na(true_prev$logit_prob), ]
}
true_prev <- true_prev[, !colnames(true_prev) %in% c("value","X0","X1")]
head(true_prev)
#** Combine YRBSS data with pov data and create dummy colns for dis vars
# counties in YRBSS data for y
com_cnty <- unique(yrbss_df$county)
if("year" %in% colnames(pov_pop)){
pov_pop$year <- ifelse(pov_pop$year=="2015",1,
ifelse(pov_pop$year=="2017",2,pov_pop$year))
}
pov_pop_sub <- pov_pop[pov_pop$county %in% com_cnty, ]
dis_vars0 <- dis_vars[!dis_vars%in%"year"]
Xdata_dummy <- fastDummies::dummy_cols(yrbss_df,select_columns = dis_vars)%>%
# dplyr::select(-c(y)) %>%
left_join(pov_pop_sub, by=c("county","year")) %>% # Join popn and poverty data
left_join(true_prev, by=c("county","year")) %>% # Join prev data
dplyr::select(-all_of(dis_vars0)) ### Remove dis vars except year
yrbss_dummy <- Xdata_dummy
yrbss_dummy <- yrbss_dummy[,!colnames(yrbss_dummy) %like% "_NA"]
# yrbss_dummy <- data.frame(lapply(yrbss_dummy,function(x)(as.numeric(x))))
yrbss_dummy <- yrbss_dummy[order(yrbss_dummy$county),]
print("Combining YRBSS data with pov rate and creating dummy cols for dis vars")
head(yrbss_dummy)
#** remove rows that has na for logit prob
na_rows <- which(is.na(yrbss_dummy$logit_prob))
if(length(na_rows) != 0){
yrbss_dummy <- yrbss_dummy[-na_rows, ]
}else{
yrbss_dummy <- yrbss_dummy
}
colnames(yrbss_dummy) <- gsub("_","",colnames(yrbss_dummy))
head(yrbss_dummy)
#** merge with dummy county data
keep_vars <- colnames(census_SexAgeRace_DX)[!colnames(census_SexAgeRace_DX) %in% dis_vars] # remove original dis vars
keep_vars <- c(keep_vars, "year")
Census_dummy <- census_SexAgeRace_DX[census_SexAgeRace_DX$county%in%com_cnty,
colnames(census_SexAgeRace_DX)%in% keep_vars]
head(Census_dummy)
# ******************************************************************************* #
# #
# Calculate centroid of each county #
# #
# ******************************************************************************* #
#### Estimate centroid of each county using long and lat data
long_lat_cnty <- map_data('county')
data("county.fips")
cnty_fips0 <- tstrsplit(county.fips$polyname,",", names=TRUE)
cnty_fips <- data.frame(county=county.fips$fips,region=cnty_fips0$V1,subregion=cnty_fips0$V2)
cnty_fips$region <- as.character(cnty_fips$region)
cnty_fips$subregion <- as.character(cnty_fips$subregion)
long_lat_dat <- long_lat_cnty %>% left_join(cnty_fips,by=c("region","subregion"))
long_lat_dat <- long_lat_dat %>%dplyr::select(county,long,lat)
df_by_cnty <- split(long_lat_dat, long_lat_dat$county) ## Split data frame by county
cnty_centrd <- list() # CREATE list of Data frames
centrd_by_cnty <- list()
### Long and lat for all counties
for (i in 1:length(df_by_cnty)){
# i <- 1
cnty_centrd[[i]] <- as.data.frame((df_by_cnty[i]),row.names = NULL)
colnames(cnty_centrd[[i]]) <- colnames(long_lat_dat)
centrd_by_cnty0 <- centroid(cnty_centrd[[i]][,2:3])
centrd_by_cnty[[i]] <- cbind.data.frame(county=unique(cnty_centrd[[i]][,1]),centrd_by_cnty0)
}
cnty_centrd_df <- rbindlist(centrd_by_cnty) #Requires library "data.table"
# out_ls <- list(com_cnty,yrbss_dummy,true_prev0,yrbss_unit,Xmean_sub,cnty_pop,cnty_centrd_df,perc_tbl,pop_dat,true_prev)
data_prep <- list(df_sub,wt,com_cnty,yrbss_dummy,Census_dummy,cnty_centrd_df,pop_dat,true_prev)
# ******************************************************************************* #
# #
# Obtain initial Parameters #
# #
# ******************************************************************************* #
data_all <- merge(df_sub,pov_pop,by=c("county","year"))
data_all <- data_all[with(data_all, order(county, year)), ]
head(data_all)
## Filter variables to create design matrix for dis vars
# DX_0 <- data_all[, c(resp,cont_vars,dis_vars)]
DX_out <- data_all[, c(resp,cont_vars)]
for(i in 1:length(dis_vars)){
# i <- 1
var_temp <- dis_vars[i]
DX_temp <- data_all[,var_temp]
DX_temp <- as.factor(DX_temp)
DX_out <- cbind.data.frame(DX_out,DX_temp)
}
colnames(DX_out) <- c(resp,cont_vars,dis_vars)
DX11 <- model.matrix(as.formula(paste0(resp,"~.")), data=DX_out,
contrasts.arg =list(age=contrasts(DX_out$age, contrasts = F),
race4=contrasts(DX_out$race4, contrasts = F),
sex=contrasts(DX_out$sex, contrasts = F),
year=contrasts(DX_out$year, contrasts = F)))
DX11 <- cbind.data.frame(county=data_all$county,
resp=data_all[ , paste0(resp)], DX11)
ref_grp <- c("age1","race44","sex1","year1") ### Ref group
DX <- DX11[ ,!colnames(DX11)%in%ref_grp] ### Remove ref group
X <- DX[ ,3:ncol(DX)]
X <- X[ ,order(colnames(X))]
y <- DX[ ,"resp"]
#*** define parameters
# y <- data_all[data_all$county %in% cnty0, ] #paste0(resp)
m <- length(unique(data_all$county))
N <- nrow(data_all)
Z <- model.matrix(~0+as.factor(DX11$county))
p <- ncol(X)
X <- as.matrix(X)
cnty <- unique(data_all$county)
length(cnty)
## Fit logit model to get initial fixed effects parameters
my_form <- my.form(dis_vars,cont_vars)
glm_formula <- as.formula(my_form)
data_all$race4 <- as.factor(data_all$race4)
data_all$race4 <- relevel(data_all$race4, ref="4") # race44 is ref group
glm_fit <- glm(formula = glm_formula,family = binomial(link=logit),data=data_all)
theta0 <- 0.1
u0 <- as.matrix(rnorm(m,0,sd=sqrt(theta0)))
re_names <- c()
for(i in 1:length(cnty)){
re_names <- c(re_names,paste0(cnty[i]))
}
re_names <- as.numeric(re_names)
str(re_names)
u0 <- data.frame(county = re_names,est=u0)
## beta values for all parameters (including ref group)
beta0 <- data.frame(summary(glm_fit)$coefficients[,1])
beta0 <- data.frame(parameter=rownames(beta0),est=beta0[,1])
rownames(beta0) <- NULL
# ============================***********************=========================== #
beta0_cont <- beta0[!beta0$parameter %like% "as.factor",] ## beta for Cont vars
beta0_Int <- beta0_cont[!beta0_cont$parameter %in% cont_vars, ]
beta0_cont <- beta0_cont[beta0_cont$parameter %in% cont_vars, ]
beta0_cont$parameter <- gsub("_","",beta0_cont$parameter)
beta0_Int$parameter <- gsub("\\(Intercept\\)","Int",beta0_Int$parameter)
beta0_dis <- beta0[beta0$parameter %like% "as.factor",] # beta for disc vars
beta0_dis$parameter <- gsub("as.factor\\(","",beta0_dis$parameter)
beta0_dis$parameter <- gsub("\\)","_",beta0_dis$parameter)
beta0_dis$col <- gsub("_.*","",beta0_dis$parameter)
unq <- unique(beta0_dis$col)
beta0_dis$parameter <- gsub("_","",beta0_dis$parameter)
beta0_dis$col <- NULL
beta0 <- rbind(beta0_cont,beta0_dis)
if(!is.character(beta0$parameter)){beta0$parameter <- as.character(beta0$parameter)}
beta0 <- beta0[order(beta0$parameter), ]
beta0$est <- round(as.numeric(beta0$est),5)
beta0 <- rbind.data.frame(beta0_Int,beta0)
# ===============================*************************========================= #
# ******************************************************************************* #
# #
# hglm model fit #
# #
# ******************************************************************************* #
fit.hglm <- list(); namedList <- list() # output containers (match.call() is only valid inside a function)
X <- as.matrix(X)
beta_new <- as.matrix(beta0$est)
u_new <- u0$est
theta_new <- theta0
delta <- NULL
beta_new_all <- beta_new ;u_new_all <- u_new; theta_all <- theta_new
theta_new_all <- c(theta_new); delta_final <- 0 ; conv_iter <- 0; iter <- 0
col_names <- c("iter0") # the initial values occupy the first column of the *_all objects
dim(X)
length(beta_new)
length(u_new)
dim(Z)
length(y)
repeat{
beta <- as.vector(beta_new)
beta <- round(beta,10)
# beta <- beta_new
u <- as.vector(u_new)
theta <- theta_new
BU_old <- rbind(as.matrix(beta),as.matrix(u))
theta_inv <- solve(theta)
G_inv <- kronecker(theta_inv, diag(m))
P <- 1/(1+exp(-(X%*%beta+Z%*%u)))
pp <- as.vector(P*(1-P))
W <- Diagonal(x=pp)
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
# library(corpcor)
# J_inv <- solve(J, tol=10^-20)
J_inv <- pseudoinverse(J) # For non-singular matrices the pseudoinverse is equivalent to the standard inverse.
# J_inv <- func_Jinv(beta,u_new,G_inv)
# yp <- y - P
D1h_B <- t(X)%*%(y-P)
D1h_u <- t(Z)%*%(y-P)- G_inv%*%u
S <- as.vector(rbind(D1h_B, D1h_u)) # Gradient (score function) vector
BU_new <- BU_old + (J_inv%*%S) # Newton-Raphson
beta_new <- as.vector(BU_new[1:length(beta)])
u_new <- as.vector(BU_new[(length(beta)+1):length(BU_new)])
# convergence_beta <- abs(beta_new[-1]-beta[-1])
convergence_beta <- abs(beta_new-beta)
convergence_u <- abs(u_new-u)
max(convergence_beta)
max(convergence_u)
beta_new_all <- cbind.data.frame(beta_new_all,value=round(beta_new,6))
u_new_all <- cbind.data.frame(u_new_all,value=round(u_new,6))
rm(P); rm(W); rm(J);
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# #
# Bias Correction #
# #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# Bias Corr step 1: update J with estimated beta and u
P <- 1/(1+exp(-(X%*%beta_new+Z%*%u_new)))
pp <- as.vector(P*(1-P))
W <- Diagonal(x=pp)
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
Jhat_Inv <- pseudoinverse(J)
Tau <- Jhat_Inv[(p+1):(p+m),(p+1):(p+m)]
Taut <- diag(Tau)
zeta <- theta/(theta+Taut)
corr_u <- zeta*u_new # Corrected u_new
rm(P); rm(W); rm(J);
iter <- iter + 1
########### ---------------------------------------------------------#############
########### Estimate theta=sigma^2 using Dh/Dtheta=0 #############
########### ---------------------------------------------------------#############
## Using SAE6 to estimate theta
theta_inv <- solve(theta)
G_inv <- kronecker(theta_inv, diag(m))
P <- 1/(1+exp(-(X%*%beta_new+Z%*%corr_u)))
# W <- diag(as.vector(P*(1-P)))
W <- Diagonal(x=as.vector(P*(1-P)))
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
J_inv <- pseudoinverse(J)
# J_inv <- func_Jinv(beta=beta_new,u_new,G_inv)
dim(J_inv)
###Estimate theta
J_inv22 <- J_inv[(p+1):(p+m),(p+1):(p+m)]
theta_new <- 1/m*(t(u_new)%*%u_new)+1/m*func_trace(J_inv22)
theta_new_all <- cbind.data.frame(theta_new_all,value=theta_new)
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
### To get the variance of u based on J
theta_inv <- solve(theta_new)
G_inv <- kronecker(theta_inv, diag(m))
P <- 1/(1+exp(-(X%*%beta_new+Z%*%u_new)))
W <- Diagonal(x=as.vector(P*(1-P)))
D2h_BB <- t(X)%*%W%*%X # or crossprod(t(crossprod(X,W)),X)
D2h_BU <- t(X)%*%W%*%Z
D2h_UB <- t(Z)%*%W%*%X
D2h_UU <- t(Z)%*%W%*%Z + G_inv
J <- rbind(cbind(D2h_BB,D2h_BU),cbind(D2h_UB,D2h_UU)) # (p+m) by (p+m) Jacobian
J <- as.matrix(J)
J_inv <- pseudoinverse(J)
# J_inv <- func_Jinv(beta=beta_new,u_new,G_inv)
fe.cov <- J_inv[1:p,1:p] ## Var Cov matrix of fixed effects
re.cov <- J_inv[(p+1):(p+m),(p+1):(p+m)] ## Var Cov matrix of random effects
fe.var <- diag(fe.cov) ### Var of fixed effects
fe.var <- round(fe.var,10)
fe.var <- fe.var[fe.var != 0] ## Remove coefs of ref group(which has very small values)
fe.std.err <- round(sqrt(fe.var), 5)
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ #
convergence_theta <- abs(theta_new-theta)
delta <- max(convergence_beta,convergence_theta)
delta_final <- cbind.data.frame(delta_final,value=delta)
col_names <- c(col_names,paste0("iter",iter))
cat(paste0("delta: ",delta,"\n \n"))
if(delta <= 1e-10){break}
}
colnames(theta_new_all) <- col_names
colnames(beta_new_all) <- col_names
rownames(beta_new_all) <- beta0$parameter
colnames(u_new_all) <- col_names
rownames(u_new_all) <- u0$county
cat(paste0("Converged in ",iter," iterations with tol = ",delta,". \n \n"))
# *************************************************************************************** #
# #
# Get model estimates #
# #
# *************************************************************************************** #
est.beta <- beta_new_all[,ncol(beta_new_all)]
est.theta <- theta_new_all[,ncol(theta_new_all)]
est.re <- data.frame(county=rownames(u_new_all),
est=u_new_all[,ncol(u_new_all)])
beta_hat <- round(beta_new_all[,ncol(beta_new_all)],5)
beta_hat <- beta_hat[beta_hat!=0]
z1 <- beta_hat/fe.std.err
est.fe <- data.frame(est=beta_hat,
std.err=fe.std.err,
Z0 = round(z1,5),
P = round(2*pnorm(-abs(z1)),5))
rownames(est.fe) <- rownames(beta_new_all)[round(beta_new_all[,ncol(beta_new_all)],5) !=0] # rownames(beta_new_all)
colnames(est.fe) <- c("Estimate","Std.Error","Z Value","P(>|Z|)")
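## Wald inference: Z = Estimate/Std.Error, and the two-sided p-value is
## 2*pnorm(-|Z|) under the normal approximation.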
est.beta <- round(est.beta,5)
est.beta <- data.frame(Variable=rownames(beta_new_all),Estimate=est.beta)
fit.hglm$iter <- iter
fit.hglm$est.beta <- est.fe
fit.hglm$re <- est.re
fit.hglm$var.par <- est.theta
fit.hglm$fe.cov <- fe.cov
fit.hglm$re.cov <- re.cov
colnames(fe.cov) <- rownames(beta_new_all)
rownames(fe.cov) <- rownames(beta_new_all)
# *************************************************************************************** #
# **************************** model estimates *********************************** #
# *************************************************************************************** #
# **************************** Calculate AIC, BIC *********************************** #
est.beta <- as.vector(est.beta$Estimate)
P <- 1/(1+exp(-(X%*%est.beta+Z%*%(est.re$est))))
yt <- 1-y
loglikel <- y*log(P) +yt*log(1-P)
sumloglik <- sum(loglikel)
AIC <- -2*sum(loglikel) + 2*(length(est.beta)+1)
BIC <- -2*sum(loglikel) + (length(est.beta)+1)*log(N)
model.sel <- cbind.data.frame(AIC, BIC,LogLik=sumloglik)
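## i.e. AIC = -2*logLik + 2k and BIC = -2*logLik + k*log(N), with
## k = length(est.beta) + 1 counting the fixed effects plus the single
## variance component theta.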
# **************************** Calculate AIC *********************************** #
namedList <- list(`hglm model inference`= model.sel, iter=iter,`fixed effects estimates`=est.fe,
` `=paste0("Converged in ",iter," iterations with tol = ",delta,".")
)
class(namedList) <- "summary.hglm.fit"
fit.hglm$summary.hglm <- namedList
if(save.out=="YES"){
if(length(out.path)==0){
stop("output path is not defined !!")
}else if(length(out.path)!=0){
write.csv(theta_new_all, paste0(out.path,model.type,"/",model.type,"_theta_all.csv"),row.names = T)
write.csv(beta_new_all, paste0(out.path,model.type,"/",model.type,"_beta_new_all.csv"), row.names = T)
write.csv(u_new_all, paste0(out.path,model.type,"/",model.type,"_u_new_all.csv"), row.names = T)
write.csv(est.theta, paste0(out.path,model.type,"/",model.type,"_theta_est.csv"),row.names = F)
write.csv(est.fe, paste0(out.path,model.type,"/",model.type,"_fe_est.csv"), row.names = T)
write.csv(est.re, paste0(out.path,model.type,"/",model.type,"_re_est.csv"), row.names = F)
write.csv(fe.cov, paste0(out.path,model.type,"/",model.type,"_fe_VarCov_est.csv"), row.names = T)
write.csv(re.cov, paste0(out.path,model.type,"/",model.type,"_re_VarCov_est.csv"), row.names = T)
# save(paste0(out.path,"fit.hglm.RData"))
}
print(paste0("hglm model fit output is saved in ",out.path))
}
# *************************************************************************************** #
# #
# Model prediction #
# #
# *************************************************************************************** #
pred.out <- match.call()
DX1 <- data.frame(Int=rep(1,nrow(DX0)))
for(i in 1:length(dis_vars)){
# i <- 1
temp <- paste0(dis_vars[i],"+")
temp_mtx <- model.matrix(as.formula(paste0("~",temp,"0")),DX0)
DX1 <- cbind.data.frame(DX1, temp_mtx)
}
if(resp=="ecig_ever"){
tit_y <- "Ever"
}else if(resp=="ecig_current"){
tit_y <- "Current"
}else if(resp=="smoke_ever"){
tit_y <- "Ever"
}else if(resp=="smoke_current"){
tit_y <- "Current"
}
out_path <- paste0(dir_path,"/out/",resp,"/",model.type)
if(save.out=="YES"){
if(length(out.path)==0){
stop("Output path is not defined !!")
}else if(length(out.path)!=0){
beta_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_fe_est.csv")))
theta_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_theta_est.csv")))
u_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_re_est.csv")))
}
}else{
beta_est <- fit.hglm$est.beta
u_est <- fit.hglm$re
theta_est <- fit.hglm$var.par
}
## Center of each county
cnty_centrd_df <- data.frame(cnty_centrd_df)
### Counties with and without estimated random effects
est_rand_eff_cnty <- cnty_centrd_df[cnty_centrd_df$county %in% u_est$county,] # needed when data_change=="YES"
No_rand_effect_cnty <- cnty_centrd_df[!cnty_centrd_df$county %in% u_est$county,]
est_random_effect <- u_est
colnames(est_random_effect) <- c("county","rand_eff")
# est_random_effect$county <- substring(est_random_effect$county,2,nchar(est_random_effect$county))
# est_random_effect$county <- as.integer(est_random_effect$county)
# est_random_effect$rand_eff <- as.numeric(est_random_effect$rand_eff)
# est_random_effect <- est_random_effect[est_random_effect$county %in% est_rand_eff_cnty$county, ]
if(data_change=="YES"){
pred_adj_cnty_df <- pred_adjc_cnty(df1=est_rand_eff_cnty,df2=No_rand_effect_cnty)
write.csv(pred_adj_cnty_df,paste0(dir_path,"/out/",resp,"/",model.type,"/pred_adj_cnty_df.csv"),row.names = F)
}else if(data_change=="NO"){
# ## read adjacent county information
pred_adj_cnty_df <- read.csv(paste0(dir_path,"/out/",resp,"/",model.type,"/pred_adj_cnty_df.csv"))
}
dim(pred_adj_cnty_df)
## Left join with predicted adjacent county
pred_adj_cnty_final <- pred_adj_cnty_df %>%
left_join(est_random_effect, by=c("pred_adj_cnty"="county"))
## All random effects (combine predicted and estimated random effects)
all_random_effect <- rbind(data.frame(county=pred_adj_cnty_final$county,
rand_eff=pred_adj_cnty_final$rand_eff),
est_random_effect)
all_random_effect <- all_random_effect[order(all_random_effect$county), ]
dim(all_random_effect)
head(all_random_effect) ## including estimated random effects
# $$ $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
if("FIPS" %in% colnames(df)){
colnames(df) <- "county"
}else{
colnames(df) <- colnames(df)
}
df_sub <- data.frame(df[, colnames(df) %in% c(dis_vars)])
df_sub <- df_sub[complete.cases(df_sub), ]
### Create age grouping
df_sub$age <- ifelse(df_sub$age %in% c(1:n1),"1",
ifelse(df_sub$age %in% c((n1+1):n2), "2","3"))
table(df_sub$age)
# $$ $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
### Reg coefficients from hglm model
if(!is.character(beta_est[,1])){
beta_est[,1] <- as.character(beta_est[,1])
}
cont_vars0 <- gsub("_","",cont_vars)
beta_disc <- beta_est[!beta_est[,1] %in% c(cont_vars0),]
if(length(beta_disc[,1][!beta_disc[,1] %in% names(DX1)])==0){
est_beta0 <- rbind.data.frame(beta_disc[ ,1:2],
cbind(X=ref_grp,Estimate=rep(0,length(ref_grp))))
est_beta1 <- est_beta0[2:nrow(est_beta0), ]
est_beta2 <- est_beta0[1, ]
est_beta1 <- est_beta1[order(est_beta1$X), ]
estBeta <- rbind.data.frame(est_beta2,est_beta1)
estBeta$Estimate <- as.numeric(estBeta$Estimate)
est_beta <- as.matrix(estBeta$Estimate)
DX <- as.matrix(DX1)
}else{
stop("Check for the dimension of fixed effects and design matrix !")
}
X_beta0 <- DX %*% est_beta
X_beta <- X_beta0
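## X_beta holds the discrete-covariate part of the linear predictor for each of
## the nrow(DX0) cells of the prediction grid; the poverty-rate effect and the
## county random effect are added per county in the loop below.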
poverty_pop <- merge(pov_pop,all_random_effect, by="county")
in_cnty <- est_random_effect$county
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ##
## ---------- predict prev for missing counties for each combination of dis vars - START -------------- ##
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ##
## counties long and lat available but no prevalence data
oot_cnty <- all_random_effect$county[!all_random_effect$county%in%in_cnty]
random_effect_oot <- all_random_effect[all_random_effect$county%in%oot_cnty, ]
poverty_oot <- poverty_pop[poverty_pop$county%in%oot_cnty,]
poverty_oot <- poverty_oot[order(poverty_oot$county), ]
random_effect_oot <- random_effect_oot[order(random_effect_oot$county), ]
### predict county level prevalence using eqn (3) for all counties
# beta for cont vars
beta_cont <- beta_est[beta_est[,1] %in% cont_vars0,]
if(is.factor(beta_cont$X)){
beta_cont$X <- as.character(beta_cont$X)
}
beta_cont_var <- beta_cont[,"Estimate"] # coefs for cont vars
cnty_pred <- list(); cnty_pred_all<-list()
for(j in 1:length(unique(poverty_pop$year))){
# j <- 2
yr <- unique(poverty_pop$year)[j]
pov_rate_val <- poverty_pop[poverty_pop$year==yr,]
pov_rate_val <- pov_rate_val[order(pov_rate_val$county), ]
oot_com_cnt <- intersect(pov_rate_val$county,all_random_effect$county)
re_temp <- all_random_effect[all_random_effect$county%in%oot_com_cnt, ]
re_temp <- re_temp[order(re_temp$county), ]
if(j==1){
temp_X_beta <- X_beta[1:(length(X_beta)/2)]
temp_DX1 <- DX1[1:(length(X_beta)/2),!colnames(DX1)%in%"year2017"]
temp_DX1 <- temp_DX1[,-ncol(temp_DX1)]
}else if(j==2){
temp_X_beta <- X_beta[(length(X_beta)/2+1):length(X_beta)]
temp_DX1 <- DX1[(length(X_beta)/2+1):length(X_beta),!colnames(DX1)%in%"year2015"]
temp_DX1 <- temp_DX1[,-ncol(temp_DX1)]
}
for(i in 1:nrow(re_temp)){
# i <- 1
temp_pvt_rate <- pov_rate_val$povt_rate[i]
cnty_pred[[i]] <- data.frame(county=re_temp$county[i],temp_DX1,temp_X_beta,
pov_eff=beta_cont_var*temp_pvt_rate, # poverty rate enters through its coefficient
re_temp$rand_eff[i])
}
cnty_pred_yr <- do.call(rbind,cnty_pred)
cnty_pred_yr <- data.frame(year=yr,cnty_pred_yr,
logitP=cnty_pred_yr$temp_X_beta+
cnty_pred_yr$pov_eff+
cnty_pred_yr$re_temp.rand_eff.i.)
cnty_predicted_prevalence <- data.frame(cnty_pred_yr,
pred_prevalence=round(exp(cnty_pred_yr$logitP)/(1+exp(cnty_pred_yr$logitP))*100,2))
cnty_pred_all[[j]] <- cnty_predicted_prevalence
}
cnty_pred_all <- do.call(rbind,cnty_pred_all)
cnty_pred_all$year1 <- NULL
cnty_pred_all <- fastDummies::dummy_columns(cnty_pred_all,select_columns = "year")
colnames(cnty_pred_all) <- gsub("_","",colnames(cnty_pred_all))
cnty_pred_all <- cnty_pred_all[,!colnames(cnty_pred_all)%in%dis_vars]
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ##
## ---------- predict prev for missing counties for each combination of dis vars - END -------------- ##
## ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ##
merge_by_var <- c(colnames(DX1),"county")
merge_by_var <- merge_by_var[merge_by_var!="Int"]
final_3way_df <- merge(census_SexAgeRace_DX,cnty_pred_all,
by=merge_by_var,all.x=TRUE)
final_3way_df <- final_3way_df[order(final_3way_df$county), ]
head(final_3way_df)
final_3way_df <- data.frame(final_3way_df,
prob_cnty=(final_3way_df$predprevalence)*final_3way_df$popn)
final_3way_df <- final_3way_df[order(final_3way_df$county), ]
pred_prev_cnty <- sqldf("SELECT county,year ,round(SUM(prob_cnty)/SUM(popn),2) as pred_cnty_prev
FROM final_3way_df GROUP BY county, year")
head(pred_prev_cnty)
dim(pred_prev_cnty)
##### combine pred_prev_cnty with available YRBSS county prevalence
true_prev <- data.frame(true_prev,
true_prevalence=round(exp(true_prev$logit_prob)/(1+exp(true_prev$logit_prob))*100,2))
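## Back-transform note: true_prevalence is 100 * exp(x)/(1 + exp(x)) with
## x = logit_prob, i.e. 100 * plogis(true_prev$logit_prob).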
prev_pred_all <- merge(pred_prev_cnty,true_prev, by=c("county","year"),all.x=TRUE)
head(prev_pred_all)
######### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ############
######### +++++++++++++++++++ +++++++++++++++++ ############
######### +++++++++++++++++++ Create US map prevalence +++++++++++++++++ ############
######### +++++++++++++++++++ +++++++++++++++++ ############
######### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ############
us_state <- usmap::us_map(regions = "counties")
colnames(us_state) <- c("long","lat","order","hole","piece","group","fips","abbr","full","county")
plot_data <- list()
data_all <- plot_func(pred_prev_cnty,us_state) ### Call the plot function (returns a list when plot_by_year=="YES"); reassigned per branch below
# df_state <- data.frame(plot_func(pred_prev_cnty)[2])
pred.out$tit_y <- tit_y
pred.out$prev_all <- prev_pred_all
pred.out$pred_prev_cnty <- pred_prev_cnty
if(save.out=="YES"){
write.csv(pred_prev_cnty,
paste0(out_path,"/pred_prev_cnty.csv"), row.names = F)
write.csv(prev_pred_all,
paste0(out_path,"/prev_pred_all.csv"), row.names = F)
write.csv(all_random_effect,
paste0(out_path,"/all_random_effect.csv"), row.names = F)
write.csv(cnty_predicted_prevalence,
paste0(out_path,"/cnty_predicted_prevalence.csv"), row.names = F)
}
# *************************************************************************************** #
# #
# Prevalence map #
# #
# *************************************************************************************** #
if(fit.model=="YES"){
# hglm.fit <- hglm_3model_fit(X,Z,beta0,u0,theta0,save.out,out.path)
beta_est <- beta_est # hglm.fit$summary.hglm$`fixed effects estimates`
theta_est <- theta_est # hglm.fit$var.par
u_est <- u_est # hglm.fit$re
# pred.out <- hglm_4pred(DX0,resp,dis_vars,cnty_centrd_df,data_change,out.path)
pred_prev_cnty <- pred.out$pred_prev_cnty
tit_y <- pred.out$tit_y
prev_all <- pred.out$prev_all
}else{
beta_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_fe_est.csv")))
theta_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_theta_est.csv")))
u_est <- data.frame(read.csv(paste0(out.path,model.type,"/",model.type,"_re_est.csv")))
prev_all <- data.frame(read.csv(paste0(out.path,model.type,"/prev_pred_all.csv")))
pred_prev_cnty <- data.frame(read.csv(paste0(out.path,model.type,"/pred_prev_cnty.csv")))
tit_y <- resp_title(resp=resp)
}
beta_est
yr <- unique(df$year)
yr
obs_pred_prev <- prev_all[!is.na(prev_all$true_prevalence), ]
obs_pred_prev <- obs_pred_prev[!is.na(obs_pred_prev$pred_cnty_prev), ]
if(is.factor(pred_prev_cnty$year)){pred_prev_cnty$year <- as.numeric(as.character(pred_prev_cnty$year))}
resp0 <- gsub("_"," ",resp)
df_state <- data.frame(mean_log_lat(pred_prev_cnty))
if(plot_by_year=="YES"){
if(is.factor(pred_prev_cnty$year)){
pred_prev_cnty$year <- as.numeric(as.character(pred_prev_cnty$year))
}
plots <- list()
for(j in 1:length(yr)){
# j <- 1
obs_pred_cor <- obs_pred_prev[obs_pred_prev$year==yr[j] ,]
us_state <- usmap::us_map(regions = "counties")
colnames(us_state) <- c("long","lat","order","hole","piece","group","fips","abbr","full","county")
data_all <- plot_func(pred_prev_cnty,us_state)[[j]]
# Pearson and Spearman correlation for predicted and observed prev
pear_cor <- round(cor(obs_pred_cor$pred_cnty_prev,
obs_pred_cor$true_prevalence, method = "pearson"), 2)
spea_cor <- round(cor(obs_pred_cor$pred_cnty_prev,
obs_pred_cor$true_prevalence, method = "spearman"),2)
plot_prev <-
ggplot(data = data_all,aes(x=long,y=lat))+
geom_polygon(mapping = aes(x = long, y = lat,
group = fips,
fill = prev_quant),colour="gray", size = 0.4)+
scale_fill_brewer(palette ="Oranges")+
xlab("longitude")+ylab("latitude") +
guides(fill=guide_legend(title=paste0(resp0 ,": ")))+
theme(legend.position="bottom")+
annotate("text",x=df_state$mlong,y=df_state$mlat,label=df_state$abbr) +
borders("state") +
ggtitle(paste0("County prevalence of ",resp0," - ",yr[j]),
subtitle = substitute(paste("Correlation = (",rho^{P},", ",rho^{S},")"," = (",pear_cor,", ",spea_cor,")"),
list(pear_cor=pear_cor,spea_cor=spea_cor)))+
theme(plot.subtitle = element_text(color = "Brown", size=12, face="italic"))
plots[[j]] <- plot_prev
if(save.plot=="YES"){
ggsave(paste0(dir_path,"/out/",resp,"/",model.type,"/cnty_prev_",yr[j],".png"),plot=plots[[j]],width = 8,height = 6)
}
}
}else if(plot_by_year=="NO"){
obs_pred_cor0 <- obs_pred_prev
obs_pred_cor <- obs_pred_cor0 %>% dplyr::select(county,pred_cnty_prev,true_prevalence) %>%
group_by(county) %>%
summarise(ObsAvgPrev=mean(true_prevalence),PredAvgPrev=mean(pred_cnty_prev))
obs_pred_cor <- data.frame(obs_pred_cor)
us_state <- usmap::us_map(regions = "counties")
colnames(us_state) <- c("long","lat","order","hole","piece","group","fips","abbr","full","county")
data_all <- plot_func(pred_prev_cnty,us_state)
# Pearson and Spearman correlation for predicted and observed prev
pear_cor <- round(cor(obs_pred_cor$PredAvgPrev,
obs_pred_cor$ObsAvgPrev, method = "pearson"), 2)
spea_cor <- round(cor(obs_pred_cor$PredAvgPrev,
obs_pred_cor$ObsAvgPrev, method = "spearman"),2)
plot_prev <-
ggplot(data = data_all,aes(x=long,y=lat))+
geom_polygon(mapping = aes(x = long, y = lat,
group = fips,
fill = prev_quant),colour="gray", size = 0.4)+
scale_fill_brewer(palette ="Oranges")+
xlab("longitude")+ylab("latitude") +
guides(fill=guide_legend(title=paste0(resp0,": ")))+
theme(legend.position="bottom")+
annotate("text",x=df_state$mlong,y=df_state$mlat,label=df_state$abbr) +
borders("state") +
ggtitle(paste0("County prevalence of ",resp0,", 2015 and 2017 combined"),
subtitle = substitute(paste("Correlation = (",rho^{P},", ",rho^{S},")"," = (",pear_cor,", ",spea_cor,")"),
list(pear_cor=pear_cor,spea_cor=spea_cor)))+
theme(plot.subtitle = element_text(color = "Brown", size=12, face="italic"))
if(save.plot=="YES"){
ggsave(paste0(dir_path,"/out/",resp,"/",model.type,"/cnty_prev_2015_2017",".png"),plot=plot_prev,width = 8,height = 6)
}
}
Author: Nirosha Rathnayake, Department of Biostatistics, University of Nebraska Medical Center, Omaha, NE.
Prepared using R Markdown