This paper analyses five candidate nonlinear regression models and choosing the one that can best describe the relationship between four brain electroencephalogram (EEG) signals. Model selection is based on Residual Sum of Squares (RSS), Akaike Information Criterion (AIC), Bayesian Information Criterion (BIC), Shapiro-Wilk normality test and residuals distribution plot. After obtaining the parameter estimations, the data set is split and the model with the obtained parameters from the train set is tested. A residual plot with 95% confidence interval error bars is plotted. Approximate Bayesian Computation is used to approximate the two largest parameters from the selected best model as an alternative method to estimate model parameters.
time <- read.csv("/home/issam/Documents/7089/time.csv", header=F)
x <- read.csv("/home/issam/Documents/7089/x.csv", header=F)
y <- read.csv("/home/issam/Documents/7089/y.csv", header=F)
head(time)
## V1
## 1 0.000
## 2 0.002
## 3 0.004
## 4 0.006
## 5 0.008
## 6 0.010
head(x)
## V1 V2 V3 V4
## 1 -3.0615656 -1.8651967 -2.90981227 -0.3394280
## 2 0.2446936 2.0391609 0.99930151 0.6122276
## 3 -1.2009670 0.1774090 -0.79654755 -2.6770103
## 4 -0.7279816 -1.5566153 0.01039548 -1.6230083
## 5 2.2049284 0.7661962 1.86810725 1.4313685
## 6 3.1672849 1.8484216 1.73971053 2.4726877
head(y)
## V1
## 1 -1.11678535
## 2 1.33388908
## 3 0.21606254
## 4 0.17645157
## 5 0.01104331
## 6 1.03716149
time <- rename(time,
Time = V1)
x <- rename(x,
X1 = V1,
X2 = V2,
X3 = V3,
X4 = V4
)
y <-rename (y,
Output = V1
)
df <- cbind(time,x,y)
str(df)
## 'data.frame': 201 obs. of 6 variables:
## $ Time : num 0 0.002 0.004 0.006 0.008 0.01 0.012 0.014 0.016 0.018 ...
## $ X1 : num -3.062 0.245 -1.201 -0.728 2.205 ...
## $ X2 : num -1.865 2.039 0.177 -1.557 0.766 ...
## $ X3 : num -2.9098 0.9993 -0.7965 0.0104 1.8681 ...
## $ X4 : num -0.339 0.612 -2.677 -1.623 1.431 ...
## $ Output: num -1.117 1.334 0.216 0.176 0.011 ...
summary(df)
## Time X1 X2 X3
## Min. :0.0 Min. :-8.6660 Min. :-6.13928 Min. :-5.97673
## 1st Qu.:0.1 1st Qu.:-2.2482 1st Qu.:-1.40102 1st Qu.:-1.57921
## Median :0.2 Median : 0.2447 Median : 0.08675 Median :-0.09896
## Mean :0.2 Mean :-0.0154 Mean : 0.03334 Mean : 0.03360
## 3rd Qu.:0.3 3rd Qu.: 2.6706 3rd Qu.: 1.47238 3rd Qu.: 1.62626
## Max. :0.4 Max. : 8.1559 Max. : 5.76075 Max. : 6.12148
## X4 Output
## Min. :-8.79904 Min. :-8.7165
## 1st Qu.:-2.18124 1st Qu.:-0.2230
## Median : 0.08573 Median : 0.3281
## Mean : 0.01156 Mean : 0.1085
## 3rd Qu.: 2.44413 3rd Qu.: 0.8544
## Max. : 8.06003 Max. : 4.9507
head(df)
## Time X1 X2 X3 X4 Output
## 1 0.000 -3.0615656 -1.8651967 -2.90981227 -0.3394280 -1.11678535
## 2 0.002 0.2446936 2.0391609 0.99930151 0.6122276 1.33388908
## 3 0.004 -1.2009670 0.1774090 -0.79654755 -2.6770103 0.21606254
## 4 0.006 -0.7279816 -1.5566153 0.01039548 -1.6230083 0.17645157
## 5 0.008 2.2049284 0.7661962 1.86810725 1.4313685 0.01104331
## 6 0.010 3.1672849 1.8484216 1.73971053 2.4726877 1.03716149
rcm <- rcorr(as.matrix(select(df, -Time)))
col <-
colorRampPalette(c("#BB4444", "#EE9988", "#FFFFFF", "#77AADD", "#4477AA"))
corrplot(rcm$r, method="color", col=col(50),
type="upper", order="hclust",
addCoef.col = "black",
tl.col="black", tl.srt=45,
diag=FALSE
)
select(df,-Time) %>%
chart.Correlation(histogram=T, pch=15)
Examining time series plot of each input.
inputsPlot <- subset(df, select=-Output) %>%
melt(
id.vars = 'Time',
variable.name = 'Inputs'
) %>%
ggplot(aes(Time, value)) +
geom_line() +
facet_grid(Inputs ~ .)
inputsPlot
w <- c(0.5,0.3,0.2,0.3,0.5)
plots <- lapply(
2:ncol(df), function(col) ggplot(df) +
geom_freqpoly(mapping=aes(x=df[,col]), binwidth=w[col-1]) +
labs(x=names(df)[col])
)
do.call(gridExtra::grid.arrange, plots)
model1 <- lm(Output ~ cbind(X4,X1^2,X1^3,X3^4), data=df)
model2 <- lm(Output ~ cbind(X3^3,X3^4), data=df)
model3 <- lm(Output ~ cbind(X2,X1^3,X3^4), data=df)
model4 <- lm(Output ~ cbind(X4,X1^3,X3^4), data=df)
model5 <- lm(Output ~ cbind(X4,X1^2,X1^3,X3^4,X1^4), data=df)
coef(summary(model1))
## Estimate Std. Error t value
## (Intercept) 0.468551685 0.0494023872 9.4843936
## cbind(X4, X1^2, X1^3, X3^4)X4 -0.034101975 0.0171038896 -1.9938141
## cbind(X4, X1^2, X1^3, X3^4) -0.001849575 0.0027246892 -0.6788205
## cbind(X4, X1^2, X1^3, X3^4) 0.010381813 0.0004055467 25.5995519
## cbind(X4, X1^2, X1^3, X3^4) -0.001949154 0.0001780802 -10.9453703
## Pr(>|t|)
## (Intercept) 8.430471e-18
## cbind(X4, X1^2, X1^3, X3^4)X4 4.755883e-02
## cbind(X4, X1^2, X1^3, X3^4) 4.980523e-01
## cbind(X4, X1^2, X1^3, X3^4) 2.007091e-64
## cbind(X4, X1^2, X1^3, X3^4) 4.578999e-22
coef(summary(model2))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.303501144 0.1013986746 2.993147 3.112801e-03
## cbind(X3^3, X3^4)1 0.016334677 0.0020951161 7.796550 3.544085e-13
## cbind(X3^3, X3^4)2 -0.002713985 0.0004248354 -6.388322 1.172923e-09
coef(summary(model3))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.448299550 0.0413103291 10.851997 8.264351e-22
## cbind(X2, X1^3, X3^4)X2 0.038109255 0.0176230811 2.162463 3.178761e-02
## cbind(X2, X1^3, X3^4) 0.009827804 0.0002728205 36.022974 1.250731e-88
## cbind(X2, X1^3, X3^4) -0.002092558 0.0001691519 -12.370879 2.190506e-26
coef(summary(model4))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.450329209 0.0414158255 10.873361 7.139303e-22
## cbind(X4, X1^3, X3^4)X4 -0.034881772 0.0170418986 -2.046824 4.200324e-02
## cbind(X4, X1^3, X3^4) 0.010482178 0.0003771179 27.795494 4.255355e-70
## cbind(X4, X1^3, X3^4) -0.001990496 0.0001671131 -11.911071 5.450644e-25
coef(summary(model5))
## Estimate Std. Error t value
## (Intercept) 4.606560e-01 0.0580232330 7.9391648
## cbind(X4, X1^2, X1^3, X3^4, X1^4)X4 -3.395313e-02 0.0171541681 -1.9792931
## cbind(X4, X1^2, X1^3, X3^4, X1^4) -2.701644e-04 0.0066370020 -0.0407058
## cbind(X4, X1^2, X1^3, X3^4, X1^4) 1.034764e-02 0.0004270604 24.2299296
## cbind(X4, X1^2, X1^3, X3^4, X1^4) -1.943452e-03 0.0001798357 -10.8068216
## cbind(X4, X1^2, X1^3, X3^4, X1^4) -3.083194e-05 0.0001180836 -0.2611027
## Pr(>|t|)
## (Intercept) 1.571546e-13
## cbind(X4, X1^2, X1^3, X3^4, X1^4)X4 4.919018e-02
## cbind(X4, X1^2, X1^3, X3^4, X1^4) 9.675721e-01
## cbind(X4, X1^2, X1^3, X3^4, X1^4) 1.008750e-60
## cbind(X4, X1^2, X1^3, X3^4, X1^4) 1.239243e-21
## cbind(X4, X1^2, X1^3, X3^4, X1^4) 7.942888e-01
anova(model1, model2, model3, model4, model5)
## Analysis of Variance Table
##
## Model 1: Output ~ cbind(X4, X1^2, X1^3, X3^4)
## Model 2: Output ~ cbind(X3^3, X3^4)
## Model 3: Output ~ cbind(X2, X1^3, X3^4)
## Model 4: Output ~ cbind(X4, X1^3, X3^4)
## Model 5: Output ~ cbind(X4, X1^2, X1^3, X3^4, X1^4)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 196 57.33
## 2 198 351.41 -2 -294.085 500.3265 <2e-16 ***
## 3 197 57.33 1 294.089 1000.6662 <2e-16 ***
## 4 197 57.46 0 -0.139
## 5 195 57.31 2 0.155 0.2634 0.7687
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
N <- dim(df)[1]
RSS <- anova(model1, model2, model3, model4, model5)[,2]
sigma_2 <- RSS/(N-1)
log_likelihood <- -0.5*(N*log(2*pi*sigma_2) + N-1)
log_likelihood
## [1] -159.1313 -341.3534 -159.1244 -159.3673 -159.0962
# LONG WAY
k <- c(
dim(coef(summary(model1)))[1],
dim(coef(summary(model2)))[1],
dim(coef(summary(model3)))[1],
dim(coef(summary(model4)))[1],
dim(coef(summary(model5)))[1]
)
AIC <- data.frame(2*(k -log_likelihood))
BIC <- data.frame(k*log(N) - 2*log_likelihood)
# MORE CONVENIENT ALTERNATIVE:
AIC_BIC <- cbind(
data.frame(AIC(model1, model2, model3, model4, model5)),
data.frame(BIC(model1, model2, model3, model4, model5))) %>%
select(-df) %>%
tibble::rownames_to_column(var="MODEL NAME")
AIC_BIC
## MODEL NAME AIC BIC
## 1 model1 330.2601 350.0799
## 2 model2 690.7044 703.9176
## 3 model3 328.2464 344.7629
## 4 model4 328.7321 345.2486
## 5 model5 332.1898 355.3130
df$Predicted1 <- predict(model1)
df$Residuals1 <- residuals(model1)
df$Predicted2 <- predict(model2)
df$Residuals2 <- residuals(model2)
df$Predicted3 <- predict(model3)
df$Residuals3 <- residuals(model3)
df$Predicted4 <- predict(model4)
df$Residuals4 <- residuals(model4)
df$Predicted5 <- predict(model5)
df$Residuals5 <- residuals(model5)
plot(model1)
ols_test_normality(model1)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9946 0.6874
## Kolmogorov-Smirnov 0.0491 0.7177
## Cramer-von Mises 20.7473 0.0000
## Anderson-Darling 0.3359 0.5046
## -----------------------------------------------
ggplot(df, aes(Residuals1)) + geom_histogram(aes(y=..density..), color="black",
bins=15, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="model1 Residuals Distribution Plot",
y="Density", x="model1 Residuals")
plot(model2)
ols_test_normality(model2)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.8907 0.0000
## Kolmogorov-Smirnov 0.1425 6e-04
## Cramer-von Mises 12.2383 0.0000
## Anderson-Darling 5.6785 0.0000
## -----------------------------------------------
ggplot(df, aes(Residuals2)) + geom_histogram(aes(y=..density..), color="black",
bins=15, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="model2 Residuals Distribution Plot",
y="Density", x="model2 Residuals")
# EXTRA: CURVE PLOT VS OUTPUT
cx2 <- coef(summary(model2)) # cx2: the coefficients of model2 (parameters)
model2_curve = function(x) {cx2[2,1]*x^3+cx2[3,1]*x^4+cx2[1,1]}
plot(model2_curve,xlim=c(-6,7),ylim=c(-7,7), main="model2 Polynomial Curve and Output Plot",
ylab="Curve / Output", xlab="X2")
points(df$X3, df$Output)
plot(model3)
ols_test_normality(model3)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9927 0.4166
## Kolmogorov-Smirnov 0.0333 0.9790
## Cramer-von Mises 20.9389 0.0000
## Anderson-Darling 0.3779 0.4050
## -----------------------------------------------
ggplot(df, aes(Residuals3)) + geom_histogram(aes(y=..density..), color="black",
bins=15, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="model3 Residuals Distribution Plot",
y="Density", x="model3 Residuals")
plot(model4)
ols_test_normality(model4)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9945 0.6674
## Kolmogorov-Smirnov 0.0512 0.6667
## Cramer-von Mises 20.8712 0.0000
## Anderson-Darling 0.3256 0.5197
## -----------------------------------------------
ggplot(df, aes(Residuals4)) + geom_histogram(aes(y=..density..), color="black",
bins=15, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="model4 Residuals Distribution Plot",
y="Density", x="model4 Residuals")
plot(model5)
ols_test_normality(model5)
## -----------------------------------------------
## Test Statistic pvalue
## -----------------------------------------------
## Shapiro-Wilk 0.9945 0.6752
## Kolmogorov-Smirnov 0.0488 0.7242
## Cramer-von Mises 20.7947 0.0000
## Anderson-Darling 0.3318 0.5100
## -----------------------------------------------
ggplot(df, aes(Residuals5)) + geom_histogram(aes(y=..density..), color="black",
bins=15, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="model5 Residuals Distribution Plot",
y="Density", x="model5 Residuals")
After examining the correlation plot, the five candidate models and before performing any other type of analysis the following can be intuitively deduced:
MODEL1: Contains X4, X1 and X3 as inputs where the correlation between X1 and X4 is 0.81, X1 and X3 is 0.58, X3 and X4 is 0.71. All are highly correlated which indicate this model does not provide any meaningful information
MODEL2: Contains the only X3, and given that correlation between X3 and Output is only 0.41, this model most probably under fits the data.
MODEL3: The highest correlation between two of its variables is 0.58 and contains only three terms. The most promising model thus far
MODEL4: Similar to MODEL1, its inputs X4, X1 and X3 are highly correlated which indicate this model does not provide meaningful information
MODEL5: In contrast to model 2, this model is over fits the data. It contains all inputs except X2. This model might provide a very good result in the train set but not necessarily in the test set.
RSS is the Residual Sum of Squares. It is calculated by summing over all values the square of predicted from the actual output. Therefore, a low RSS indicates the predicted output is close to the actual. The RSS values indicate that Model2 has the worst with 351, all others are around 57.
AIC is used for model selection where a lower value indicates a better fit of the model. AIC introduces a penalty with increased number of parameters (since AIC=2k-2ln(L) where k is the number of parameters and L is the log likelihood). BIC is also used for model selection where a lower value indicates a better fit of the model. BIC introduces a larger penalty term than AIC with increased number of parameters. (since BIC=k.ln(n) - 2ln(L), where the number of parameters is multiplied by the natural log of number of data points as opposed to 2 in AIC). Since Both depend on the log-likelihood, where the log-likelihood of model2 is the smallest (in the negative direction), model 2 is the worst of the five. The other models still provide a relatively similar AIC and BIC values where model3 and model5 are the lowest.
Residuals vs fitted plot for model2 is far from a straight line at zero confirming model2 is the worst. Model3 has the “straightest” line at 0, but still competing with other models and this test is inconclusive.
QQplot shows that model3 is the best model where the points fit nicely to the line.
Residuals plot show that model3 is the best since the distribution follows closely the normal distribution out of all other models.
Finally, Shapiro-Wilk normality test is around 99% for models 1,3,4,5.
Combining the results from the above analysis, model3 is the selected best.
set.seed(19)
#splitting the data 75% training set
split <- select(df, Time:Output) %>% initial_split(prop=3/4)
train <- training(split)
test <- testing(split)
head(train)
## Time X1 X2 X3 X4 Output
## 1 0.000 -3.0615656 -1.8651967 -2.90981227 -0.3394280 -1.1167854
## 2 0.002 0.2446936 2.0391609 0.99930151 0.6122276 1.3338891
## 3 0.004 -1.2009670 0.1774090 -0.79654755 -2.6770103 0.2160625
## 4 0.006 -0.7279816 -1.5566153 0.01039548 -1.6230083 0.1764516
## 6 0.010 3.1672849 1.8484216 1.73971053 2.4726877 1.0371615
## 7 0.012 4.1845487 0.7879847 -0.35208411 2.6905576 1.4908308
head(test)
## Time X1 X2 X3 X4 Output
## 5 0.008 2.2049284 0.7661962 1.868107 1.431368 0.01104331
## 8 0.014 4.6492372 1.9528545 3.017156 5.093125 0.33569177
## 11 0.020 0.3266145 1.2761762 -1.516941 -1.261791 0.40687177
## 14 0.026 -3.3070320 -4.5427411 -0.911867 -4.749581 -0.60102205
## 15 0.028 -4.1011878 -6.0209836 -3.211420 -6.434943 -0.29157064
## 26 0.050 3.3008929 1.1795140 -1.113149 3.600715 0.47886999
# model3 is the selected best
modelBest <-lm(Output ~ cbind(X2,X1^3,X3^4), data=train)
coef(summary(modelBest))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.491008388 0.0480111092 10.226974 7.093669e-19
## cbind(X2, X1^3, X3^4)X2 0.039808766 0.0208583205 1.908532 5.827170e-02
## cbind(X2, X1^3, X3^4) 0.009764381 0.0003012000 32.418266 7.556224e-69
## cbind(X2, X1^3, X3^4) -0.002140227 0.0001782022 -12.010106 1.387663e-23
anova(modelBest)
## Analysis of Variance Table
##
## Response: Output
## Df Sum Sq Mean Sq F value Pr(>F)
## cbind(X2, X1^3, X3^4) 3 374.17 124.722 425.68 < 2.2e-16 ***
## Residuals 147 43.07 0.293
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
train$Predicted <- predict(modelBest)
train %>% head()
## Time X1 X2 X3 X4 Output Predicted
## 1 0.000 -3.0615656 -1.8651967 -2.90981227 -0.3394280 -1.1167854 -0.01688085
## 2 0.002 0.2446936 2.0391609 0.99930151 0.6122276 1.3338891 0.57019367
## 3 0.004 -1.2009670 0.1774090 -0.79654755 -2.6770103 0.2160625 0.48029555
## 4 0.006 -0.7279816 -1.5566153 0.01039548 -1.6230083 0.1764516 0.42527436
## 6 0.010 3.1672849 1.8484216 1.73971053 2.4726877 1.0371615 0.85523267
## 7 0.012 4.1845487 0.7879847 -0.35208411 2.6905576 1.4908308 1.23781283
modelBest <-lm(Output ~ cbind(X2,X1^3,X3^4), data=train)
test <- cbind(test, data.frame(predict(modelBest, test, interval="confidence"))) %>%
rename(
Predicted = fit,
Lower = lwr,
Upper = upr
)
test %>% head()
## Time X1 X2 X3 X4 Output Predicted
## 5 0.008 2.2049284 0.7661962 1.868107 1.431368 0.01104331 0.60011559
## 8 0.014 4.6492372 1.9528545 3.017156 5.093125 0.33569177 1.37266405
## 11 0.020 0.3266145 1.2761762 -1.516941 -1.261791 0.40687177 0.53081887
## 14 0.026 -3.3070320 -4.5427411 -0.911867 -4.749581 -0.60102205 -0.04446285
## 15 0.028 -4.1011878 -6.0209836 -3.211420 -6.434943 -0.29157064 -0.64987527
## 26 0.050 3.3008929 1.1795140 -1.113149 3.600715 0.47886999 0.88586483
## Lower Upper
## 5 0.5018061 0.6984250
## 8 1.2470767 1.4982514
## 11 0.4238554 0.6377823
## 14 -0.2506059 0.1616802
## 15 -0.9128611 -0.3868894
## 26 0.7780388 0.9936908
ggplot() +
geom_errorbar(data=test, mapping=aes(x=Time, ymin=Lower, ymax=Upper))+
geom_point(data=test, mapping=aes(x=Time, y=Predicted), color='red')+
geom_point(data=test, mapping=aes(x=Time, y=Output), color='green') +
ggtitle("Predicted (Red) vs Output (Green) with 95% Confidence Intervals Error Bars")
ggplot() +
geom_errorbar(data=test, mapping=aes(x=X1, ymin=Lower, ymax=Upper))+
geom_point(data=test, mapping=aes(x=X1, y=Predicted), color='red')+
geom_point(data=test, mapping=aes(x=X1, y=Output), color='green') +
ggtitle("Predicted (Red) vs Output (Green) with 95% Confidence Intervals Error Bars")
ggplot() +
geom_errorbar(data=test, mapping=aes(x=X2, ymin=Lower, ymax=Upper))+
geom_point(data=test, mapping=aes(x=X2, y=Predicted), color='red')+
geom_point(data=test, mapping=aes(x=X2, y=Output), color='green') +
ggtitle("Predicted (Red) vs Output (Green) with 95% Confidence Intervals Error Bars")
ggplot() +
geom_errorbar(data=test, mapping=aes(x=X3, ymin=Lower, ymax=Upper))+
geom_point(data=test, mapping=aes(x=X3, y=Predicted), color='red')+
geom_point(data=test, mapping=aes(x=X3, y=Output), color='green') +
ggtitle("Predicted (Red) vs Output (Green) with 95% Confidence Intervals Error Bars")
ggplot() +
geom_errorbar(data=test, mapping=aes(x=X4, ymin=Lower, ymax=Upper))+
geom_point(data=test, mapping=aes(x=X4, y=Predicted), color='red')+
geom_point(data=test, mapping=aes(x=X4, y=Output), color='green') +
ggtitle("Predicted (Red) vs Output (Green) with 95% Confidence Intervals Error Bars")
print(coefficients <- as.matrix(coef(summary(model3))))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.448299550 0.0413103291 10.851997 8.264351e-22
## cbind(X2, X1^3, X3^4)X2 0.038109255 0.0176230811 2.162463 3.178761e-02
## cbind(X2, X1^3, X3^4) 0.009827804 0.0002728205 36.022974 1.250731e-88
## cbind(X2, X1^3, X3^4) -0.002092558 0.0001691519 -12.370879 2.190506e-26
bias <- coefficients[1,1]
theta1 <- coefficients[2,1]
stdBias <- coefficients[1,2]
stdTheta <- coefficients[2,2]
theta2 <- coefficients[3,1]
theta3 <- coefficients[4,1]
iterations <- 10000
biasPrior <- data.frame(runif(iterations,bias-50*stdBias, bias+50*stdBias))
thetaPrior <- data.frame(runif(iterations,theta1-50*stdTheta, theta1+50*stdTheta))
prior <- cbind(biasPrior, thetaPrior)
y <- as.matrix(df$Output)
ABC_ACCEPT <- c()
ABC_REJECT <- c()
theta1_posterior <- c()
bias_posterior <- c()
theta1_reject <- c()
bias_reject <- c()
for(i in 1:iterations)
{
yApprox <- as.matrix(prior[i,1]*df$X2 + theta2*df$X1^3 + theta3*df$X3^4 + prior[i,2])
SSE <- sum((y-yApprox)^2)
if(SSE < 70) #within approx. 10 from result obtained earlier
{
ABC_ACCEPT[i] <- SSE
theta1_posterior[i] <- prior[i,1]
bias_posterior[i] <- prior[i,2]
}
else
{
ABC_REJECT[i] <- SSE
theta1_reject[i] <- prior[i,1]
bias_reject[i] <- prior[i,2]
}
}
ABC_ACCEPT<-data.frame(ABC_ACCEPT[!is.na(ABC_ACCEPT)])
ABC_REJECT <- data.frame(ABC_REJECT[!is.na(ABC_REJECT)])
theta1_posterior<-data.frame(theta1_posterior[!is.na(theta1_posterior)])
bias_posterior<-data.frame(bias_posterior[!is.na(bias_posterior)])
theta1_reject<- data.frame(theta1_reject[!is.na(theta1_reject)])
bias_reject<- data.frame(bias_reject[!is.na(bias_reject)])
posterior <- data.frame(cbind(ABC_ACCEPT,theta1_posterior,bias_posterior))
posterior <- rename(posterior, ABC = ABC_ACCEPT..is.na.ABC_ACCEPT..,
theta1 = theta1_posterior..is.na.theta1_posterior..,
thetaBias = bias_posterior..is.na.bias_posterior..
)
reject <- data.frame(cbind(ABC_REJECT, theta1_reject, bias_reject))
reject <- rename(reject, ABC = ABC_REJECT..is.na.ABC_REJECT..,
theta1 = theta1_reject..is.na.theta1_reject..,
thetaBias = bias_reject..is.na.bias_reject..
)
plt<- ggplot() +
geom_point(reject, mapping=aes(x=theta1, y=thetaBias), colour="red")+
geom_point(posterior, mapping=aes(x=theta1, y=thetaBias), colour="green") +
xlim(c(-0.5,0.5))
plt
ggMarginal(plt, aes(x=theta1, y=thetaBias),type='densigram', fill = "blue",
xparams = list(bins=20))
ggplot(posterior, aes(theta1)) + geom_histogram(aes(y=..density..), color="black",
bins=10, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="theta1",
y="Density", x="theta1")
ggplot(posterior, aes(thetaBias)) + geom_histogram(aes(y=..density..), color="black",
bins=10, alpha=0.2, fill="#69b3a2", size=.2) +
geom_density(col = "blue") +
labs(title="Theta Bias",
y="Density", x="thetaBias")
#Best estimated theta 1 and theta bias values
posterior[which.min(posterior$ABC),]
## ABC theta1 thetaBias
## 17 57.69842 0.05172484 0.4782768
Five candidate models are analyzed using various metrics and the best model is model 3 which has a normal distribution of residuals, lowest AIC and BIC values. The data set is split into train and test set. Model 3 parameters estimated from the train set are performed on the test set where the predicted values with 95% confidence intervals and actual output are plotted. ABC rejection method was used to estimate the biggest two parameters in our model namely theta bias and theta one. The approximation was successful in producing a low SSE of 57.6 which closely matches the regression estimate of 57.3
The source code is available on https://github.com/vxky19/7089/blob/main/8887140-code.Rmd