library(rjags) #Uses JAGS to create bayesian models
## Loading required package: coda
## Linked to JAGS 4.3.0
## Loaded modules: basemod,bugs
library(coda)
library(tidyverse) #Utility functions
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
set.seed(123)
Examine <- function(vector){
  #Quick overview of a variable: first observations, summary, class, missing values, duplicates
  H <- head(vector)
  C <- class(vector)
  S <- summary(vector)
  M <- sum(is.na(vector))
  D <- sum(duplicated(vector))
  print("First 6 obs")
  print(H)
  print(S)
  print(paste("Class ", C))
  print(paste("Missing ", M))
  print(paste("Duplicates", D))
}
ScatterPlot <- function(data, x, y){
  #Scatter plot with a smoother; {{ }} lets the caller pass bare column names from data
  ggplot(data, mapping = aes({{ x }}, {{ y }})) +
    geom_point() +
    geom_smooth() +
    xlab(deparse(substitute(x))) +
    ylab(deparse(substitute(y)))
}
The following is a typical modeling situation: fitting a linear regression with continuous variables. The data set I will be using is the Auto dataset from the ISLR package.
df.A <- ISLR::Auto #A dataset that contains various variables related to cars
attach(df.A) #Attach the data frame so variables can be referenced directly, without the df.A$ prefix
## The following object is masked from package:ggplot2:
##
## mpg
Using the Auto dataset, we will try to model a car's miles per gallon (mpg).
pairs(df.A) #Get a glimpse of the behavior of the data
We can see that several variables seem to have a linear relationship with mpg, mainly displacement, horsepower, and weight. We will focus on these variables.
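To back this up numerically, here is a small sketch (not part of the original output) of the correlations between mpg and these candidate predictors:
cor(df.A[, c("mpg", "displacement", "horsepower", "weight")]) #All three predictors correlate strongly and negatively with mpg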
Miles per gallon (mpg)
Examine(mpg)
## [1] "First 6 obs"
## [1] 18 15 18 16 17 15
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9.00 17.00 22.75 23.45 29.00 46.60
## [1] "Class numeric"
## [1] "Missing 0"
## [1] "Duplicates 265"
We can see no missing data. The range is 9 - 46.60 with a mean of 23.45
hist(mpg)
In the histogram the data appear to be right-skewed.
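As a quick numeric sanity check of the skew (a small sketch I am adding, not in the original): for right-skewed data the mean tends to exceed the median.
mean(mpg) > median(mpg) #TRUE here (23.45 > 22.75), consistent with right skew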
Engine displacement (cu. inches)
Examine(displacement)
## [1] "First 6 obs"
## [1] 307 350 318 304 302 429
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.0 105.0 151.0 194.4 275.8 455.0
## [1] "Class numeric"
## [1] "Missing 0"
## [1] "Duplicates 311"
No missing data. The range is 68 - 455 with a mean of 194.4
hist(displacement)
It is heavily right-skewed.
ScatterPlot(data = df.A, x = displacement, y = mpg)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(displacement, mpg)
## [1] -0.8051269
The relationship between displacement and mpg is negative and not linear.
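As an informal check of the nonlinearity (a sketch I am adding, not part of the original analysis), one can compare a straight-line fit of mpg on displacement with a fit that adds a quadratic term:
fit.lin  <- lm(mpg ~ displacement, data = df.A)
fit.quad <- lm(mpg ~ poly(displacement, 2), data = df.A)
anova(fit.lin, fit.quad) #A small p-value indicates curvature beyond a straight line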
Engine horsepower
Examine(horsepower)
## [1] "First 6 obs"
## [1] 130 165 150 150 140 198
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 46.0 75.0 93.5 104.5 126.0 230.0
## [1] "Class numeric"
## [1] "Missing 0"
## [1] "Duplicates 299"
No missing values. The range is 46 to 230 with a mean of 104.5.
hist(horsepower)
Again, right-skewed.
ScatterPlot(df.A, horsepower, mpg)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(horsepower, mpg)
## [1] -0.7784268
The relationship between horsepower and mpg is negative and not linear.
Vehicle weight (lbs.)
Examine(weight)
## [1] "First 6 obs"
## [1] 3504 3693 3436 3433 3449 4341
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1613 2225 2804 2978 3615 5140
## [1] "Class numeric"
## [1] "Missing 0"
## [1] "Duplicates 46"
No missing values. The range is 1613 to 5140 with a mean of 2978.
ScatterPlot(df.A, weight, mpg)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
cor(weight, mpg)
## [1] -0.8322442
The relationship between weight and mpg is negative and looks pretty linear.
Displacement, horsepower, and weight all seem like good variables for explaining mpg.
This model uses vague (effectively uninformative) priors.
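Note that JAGS parameterizes dnorm() with a precision (1/sd^2), so dnorm(0, 200^(-2)) is a Normal prior with standard deviation 200. As a small sketch of how vague these priors are, one can draw from them directly in R (prior.draws is a name I am introducing here):
prior.draws <- data.frame(
  b0 = rnorm(1e4, 0, 200), #Normal(0, sd = 200), matching dnorm(0, 200^(-2)) in JAGS
  b1 = rnorm(1e4, 0, 200),
  s  = runif(1e4, 0, 200)  #Uniform(0, 200) for the residual standard deviation
)
summary(prior.draws) #The priors span a far wider range than mpg itself, so they are effectively flat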
df.A.model <- "model {
  #Likelihood: JAGS dnorm() takes a precision, so s^(-2) means s is the residual sd
  for (i in 1:length(Y)) {
    Y[i] ~ dnorm(mu[i], s^(-2))
    mu[i] <- b0 + b1*x1[i] + b2*x2[i] + b3*x3[i]
  }
  #Priors: Normal(0, sd = 200) on the coefficients, Uniform(0, 200) on the residual sd
  b0 ~ dnorm(0, 200^(-2))
  b1 ~ dnorm(0, 200^(-2))
  b2 ~ dnorm(0, 200^(-2))
  b3 ~ dnorm(0, 200^(-2))
  s ~ dunif(0, 200)
}"
A.data <- list(
  Y = mpg,
  x1 = displacement,
  x2 = horsepower,
  x3 = weight
)
Using the coefficients from an ordinary least squares fit to set the initial values.
A.n.chains <- 3
#Fit an OLS model and keep its coefficients as centers for the initial values
A.inital.coef <- coef(df.A.lm <- lm(mpg ~ displacement + horsepower + weight))
A.inits <- list()
for(i in 1:A.n.chains){
  A.inits[[i]] <- list(b0 = rnorm(1, A.inital.coef['(Intercept)'], 200),
                       b1 = rnorm(1, A.inital.coef["displacement"], 200),
                       b2 = rnorm(1, A.inital.coef["horsepower"], 200),
                       b3 = rnorm(1, A.inital.coef["weight"], 200),
                       .RNG.name = "base::Wichmann-Hill",
                       .RNG.seed = 10)
}
Is n.adapt the same as burn-in? Not quite: JAGS uses the adaptation phase to tune its samplers, while burn-in simply discards early iterations after sampling has started.
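For reference, here is a sketch of the more conventional rjags workflow in which adaptation and an explicit burn-in step are run before sampling (df.A.jags2 and df.A.sim2 are hypothetical names, the iteration counts are arbitrary, and this sketch is not run here). The model actually fitted in this analysis follows below with n.adapt = 0, and burn-in is instead removed afterwards with window().
df.A.jags2 <- jags.model(
  file = textConnection(df.A.model),
  data = A.data,
  inits = A.inits,
  n.chains = A.n.chains,
  n.adapt = 1000                    #Adaptation: JAGS tunes its samplers
)
update(df.A.jags2, n.iter = 1000)   #Burn-in: advance the chains without recording samples
df.A.sim2 <- coda.samples(
  model = df.A.jags2,
  variable.names = c("b0", "b1", "b2", "b3", "s"),
  n.iter = 10000
)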
df.A.jags <- jags.model(
  file = textConnection(df.A.model),
  data = A.data,
  inits = A.inits,
  n.chains = A.n.chains,
  n.adapt = 0
)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 392
## Unobserved stochastic nodes: 5
## Total graph size: 2486
##
## Initializing model
A.n.iter <- 100000
df.A.sim <- coda.samples(
  model = df.A.jags,
  variable.names = c("b0", "b1", "b2", "b3", "s"),
  n.iter = A.n.iter
)
## NOTE: Stopping adaptation
#The traces of the first 1000 iterations suggest 1000 is a reasonable burn-in period
plot(window(df.A.sim, 1, 1000), density = FALSE)
df.A.sim.nb <- window(df.A.sim, 1000) #Keep iterations from 1000 onward, discarding the burn-in
plot(df.A.sim.nb)
I am curious whether it only looks better because there are more points, so the detail cannot be seen.
plot(window(df.A.sim.nb, A.n.iter - 5000))
Looking at this smaller window, the chains still seem okay.
We see the coefficients are negative, which agrees with our preliminary exploration.
summary(df.A.sim.nb)
##
## Iterations = 1000:1e+05
## Thinning interval = 1
## Number of chains = 3
## Sample size per chain = 99001
##
## 1. Empirical mean and standard deviation for each variable,
## plus standard error of the mean:
##
## Mean SD Naive SE Time-series SE
## b0 44.810943 1.2073154 2.215e-03 2.659e-02
## b1 -0.006015 0.0066504 1.220e-05 1.422e-04
## b2 -0.041273 0.0129945 2.384e-05 2.171e-04
## b3 -0.005335 0.0007251 1.331e-06 1.791e-05
## s 4.255586 0.1538231 2.823e-04 3.211e-04
##
## 2. Quantiles for each variable:
##
## 2.5% 25% 50% 75% 97.5%
## b0 42.444434 43.998344 44.810561 45.618054 47.198248
## b1 -0.019068 -0.010490 -0.005988 -0.001585 0.007126
## b2 -0.066449 -0.050045 -0.041414 -0.032685 -0.015115
## b3 -0.006765 -0.005817 -0.005337 -0.004847 -0.003920
## s 3.967313 4.150135 4.250443 4.356207 4.570975
gelman.diag(df.A.sim.nb)
## Potential scale reduction factors:
##
## Point est. Upper C.I.
## b0 1 1
## b1 1 1
## b2 1 1
## b3 1 1
## s 1 1
##
## Multivariate psrf
##
## 1
gelman.plot(df.A.sim.nb)
The values are below 1.1, which suggests the chains have converged.
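As a complementary check (a sketch, output not shown), coda's effectiveSize() reports how many effectively independent draws the autocorrelated chains contain:
effectiveSize(df.A.sim.nb) #Values far below the 3 x 99001 raw draws indicate strong autocorrelation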
df.A.sim.post <- data.frame(df.A.sim.nb[[1]]) #Posterior draws from the first chain as a data frame
Density for each parameter
df.A.sim.post %>%
  gather() %>%
  ggplot(mapping = aes(value)) +
  facet_wrap(~key, scales = "free") +
  geom_density()
95% credible interval
df.A.sim.post %>%
  gather() %>%
  group_by(key) %>%
  summarise(mean = mean(value),
            q2.5 = quantile(value, .025),
            q97.5 = quantile(value, .975))
## # A tibble: 5 x 4
## key mean q2.5 q97.5
## <chr> <dbl> <dbl> <dbl>
## 1 b0 44.8 42.5 47.1
## 2 b1 -0.00619 -0.0190 0.00679
## 3 b2 -0.0411 -0.0666 -0.0151
## 4 b3 -0.00532 -0.00677 -0.00392
## 5 s 4.26 3.97 4.57
Interestingly, the credible interval for b1 runs from negative to positive values, so the sign of b1 is uncertain.
mean(df.A.sim.post$b1 > 0)
## [1] 0.1679074
There is roughly a 17% chance of b1 being positive.
summary(df.A.lm)
##
## Call:
## lm(formula = mpg ~ displacement + horsepower + weight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.3347 -2.8028 -0.3402 2.2037 16.2409
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.8559357 1.1959200 37.507 < 2e-16 ***
## displacement -0.0057688 0.0065819 -0.876 0.38132
## horsepower -0.0416741 0.0128139 -3.252 0.00125 **
## weight -0.0053516 0.0007124 -7.513 4.04e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.241 on 388 degrees of freedom
## Multiple R-squared: 0.707, Adjusted R-squared: 0.7047
## F-statistic: 312 on 3 and 388 DF, p-value: < 2.2e-16
When comparing with a standard linear regression, we can see that b1 (displacement) is not significant there either.
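To make the comparison concrete, here is a small side-by-side of the lm estimates and the posterior means (a sketch I am adding; the column names are my own):
data.frame(
  lm.estimate    = coef(df.A.lm),                                       #OLS point estimates
  posterior.mean = colMeans(df.A.sim.post[, c("b0", "b1", "b2", "b3")]) #Posterior means (chain 1)
)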
Correlation of parameters
pairs(df.A.sim.post)
cor(df.A.sim.post)
## b0 b1 b2 b3 s
## b0 1.000000000 0.733744806 -0.22888246 -0.8174942678 -0.0025109505
## b1 0.733744806 1.000000000 -0.47012734 -0.6932017830 -0.0057228788
## b2 -0.228882459 -0.470127343 1.00000000 -0.2289251712 0.0080312402
## b3 -0.817494268 -0.693201783 -0.22892517 1.0000000000 -0.0004849883
## s -0.002510951 -0.005722879 0.00803124 -0.0004849883 1.0000000000
There seems to be multicollinearity among the coefficients, which we already saw among the predictors themselves.
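The same pattern can be checked directly on the predictors (a sketch, not part of the original output):
cor(df.A[, c("displacement", "horsepower", "weight")]) #Strongly correlated predictors induce correlated coefficient posteriors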