library("magrittr")
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("skimr")
library("olsrr")
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
library("lmtest")
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
This is an example of working with data resulting from Experimental Design. The dataset describes a blending experiment with four factors (particle size, mixer diameter, mixer rotation speed and blending time) and a single response, the blending efficiency.
In this exercise, we will focus on:
- finding the best regression fit for the data, so that we can predict the blending efficiency,
- finding interactions between the independent variables.
First, let’s load the data:
mix1 <- read.csv("https://openmv.net/file/blender-efficiency.csv")
We start with data exploration. As the dataset consists of only 18 observations, we can print it in full and then summarize it.
mix1
## ParticleSize MixerDiameter MixerRotation BlendingTime BlendingEfficiency
## 1 2 4 200 60 84.5
## 2 8 4 200 60 62.9
## 3 2 8 200 60 90.7
## 4 8 8 200 60 63.2
## 5 5 6 175 45 70.9
## 6 5 6 225 45 69.2
## 7 5 6 175 75 80.1
## 8 5 6 225 45 79.8
## 9 5 6 200 60 75.1
## 10 2 6 200 45 81.8
## 11 8 6 200 45 61.8
## 12 2 6 200 75 92.2
## 13 8 6 200 75 70.7
## 14 5 4 175 60 72.4
## 15 5 8 175 60 76.4
## 16 5 4 225 60 71.9
## 17 5 8 225 60 74.9
## 18 5 6 200 60 74.5
Below we summarize the dataset:
pacman::p_load("skimr")
skim(mix1)
| Name | mix1 |
| Number of rows | 18 |
| Number of columns | 5 |
| _______________________ | |
| Column type frequency: | |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ParticleSize | 0 | 1 | 5.00 | 2.06 | 2.0 | 5.00 | 5.0 | 5.00 | 8.0 | ▃▁▇▁▃ |
| MixerDiameter | 0 | 1 | 6.00 | 1.37 | 4.0 | 6.00 | 6.0 | 6.00 | 8.0 | ▃▁▇▁▃ |
| MixerRotation | 0 | 1 | 200.00 | 17.15 | 175.0 | 200.00 | 200.0 | 200.00 | 225.0 | ▃▁▇▁▃ |
| BlendingTime | 0 | 1 | 58.33 | 10.15 | 45.0 | 48.75 | 60.0 | 60.00 | 75.0 | ▃▁▇▁▂ |
| BlendingEfficiency | 0 | 1 | 75.17 | 8.66 | 61.8 | 70.75 | 74.7 | 80.02 | 92.2 | ▅▇▇▅▃ |
Let’s group the observations by the independent variables and calculate the average response for the replicated points:
pacman::p_load("dplyr")
mix2 <- mix1 %>% group_by(ParticleSize,MixerDiameter ,MixerRotation,BlendingTime) %>% summarize(BlendingEfficiency=mean(BlendingEfficiency))
## `summarise()` has grouped output by 'ParticleSize', 'MixerDiameter',
## 'MixerRotation'. You can override using the `.groups` argument.
(mix2)
## # A tibble: 16 × 5
## # Groups: ParticleSize, MixerDiameter, MixerRotation [13]
## ParticleSize MixerDiameter MixerRotation BlendingTime BlendingEfficiency
## <int> <int> <int> <int> <dbl>
## 1 2 4 200 60 84.5
## 2 2 6 200 45 81.8
## 3 2 6 200 75 92.2
## 4 2 8 200 60 90.7
## 5 5 4 175 60 72.4
## 6 5 4 225 60 71.9
## 7 5 6 175 45 70.9
## 8 5 6 175 75 80.1
## 9 5 6 200 60 74.8
## 10 5 6 225 45 74.5
## 11 5 8 175 60 76.4
## 12 5 8 225 60 74.9
## 13 8 4 200 60 62.9
## 14 8 6 200 45 61.8
## 15 8 6 200 75 70.7
## 16 8 8 200 60 63.2
We see there are 16 distinct points in the experimental space, with replicates for two of them.
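As a quick cross-check (a minimal sketch using the dplyr functions loaded above; output not shown here), we can list the replicated design points explicitly:
# combinations that occur more than once are the replicated design points
mix1 %>%
  count(ParticleSize, MixerDiameter, MixerRotation, BlendingTime) %>%
  filter(n > 1)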
Now we will focus on the regression: main effects and interactions. We limit each candidate model to at most four terms, and we want only fits where R²Adj>0.8. For clarity, we will replace the variables’ names with consecutive letters (A to D), and the dependent variable (BlendingEfficiency) with “y”.
library(olsrr)
options(warn = -1)
mix1a <- mix1
colnames(mix1a) <- c("A","B","C","D","y")
fit.all <- lm(y~A*B*C*D,data=mix1a)
mod1 <- ols_step_all_possible(fit.all)
nrow(mod1)
## [1] 32767
mod2 <- mod1[order(mod1$aic),]
mod3 <- mod2 %>% filter((n<5) & adjr>0.8)
(head(mod3,10))
## Index N Predictors R-Square Adj. R-Square Mallow's Cp
## 1 121 3 A A:B B:D 0.9186272 0.9011902 0.4926296
## 2 122 3 A A:B:C B:C:D 0.9141117 0.8957071 1.0748785
## 3 123 3 A:B A:D B:D 0.9136492 0.8951455 1.1345192
## 4 124 3 A D B:C 0.9133598 0.8947940 1.1718359
## 5 125 3 A:B A:C B:C:D 0.9125514 0.8938125 1.2760693
## 6 126 3 A B D 0.9125027 0.8937532 1.2823603
## 7 127 3 A B B:D 0.9125027 0.8937532 1.2823603
## 8 128 3 A D B:D 0.9125027 0.8937532 1.2823603
## 9 129 3 A D B:C:D 0.9119391 0.8930689 1.3550320
## 10 576 4 A D B:C A:B:C:D 0.9211318 0.8968647 2.1696713
We can see there were 32767 possible combinations, i.e. 2^15 - 1 subsets of the 15 candidate terms (4 main effects, 6 two-way, 4 three-way and 1 four-way interaction). Out of all these combinations we want to select only those where all of the terms (main effects or interactions) are statistically significant (i.e. Pr(>|t|) < 0.05).
list.a <- list()
vector.a <- vector()
for(i in 1:nrow(mod3)){
  # build a formula from the space-separated predictor string, e.g. "A A:B B:D" -> y ~ A+A:B+B:D
  myForm <- as.formula(paste("y ~ ", gsub(" ", "+", mod3$predictors[i]), sep=""))
  selectedMod <- lm(myForm, data=mix1a)   # re-fit the model with this set of terms
  summ <- summary(selectedMod)
  pvals <- summ$coefficients[, 4]         # p-values of all coefficients, incl. the intercept
  adj1 <- summ$adj.r.squared
  # keep the model only if every coefficient (intercept included) is significant at the 0.05 level
  if(length(pvals[pvals < 0.05]) == (mod3$n[i] + 1)){
    list.a[[i]] <- myForm
    vector.a[i] <- adj1
  }
}
list.a <- list.a[lapply(list.a, length) > 0]   # drop empty slots (sapply/rapply would also work)
vector.a <- round(vector.a[!is.na(vector.a)], 4)
results.a <- data.frame(matrix(ncol=2,nrow=length(list.a))); colnames(results.a) <- c("formula", "R2Adj")
for(j in 1:length(list.a)){
results.a[j,1] <- paste(deparse(list.a[[j]], width.cutoff = 500), collapse="")
results.a[j,2] <- vector.a[j]
}
results1 <- results.a[order(-results.a$R2Adj),]
print(nrow(results1))
## [1] 106
print(results1)
## formula R2Adj
## 1 y ~ A + A:B + B:D 0.9012
## 2 y ~ A + A:B:C + B:C:D 0.8957
## 3 y ~ A:B + A:C + B:C:D 0.8938
## 4 y ~ D + A:B + A:C + B:C 0.8937
## 6 y ~ D + A:B + B:C + A:C:D 0.8924
## 5 y ~ A + B:C + B:C:D 0.8885
## 8 y ~ D + B:C + A:B:C 0.8840
## 9 y ~ B + D + A:B 0.8829
## 11 y ~ B + A:C + B:C + B:D 0.8821
## 7 y ~ A + D 0.8806
## 18 y ~ B + C + D + A:B:C 0.8750
## 10 y ~ A:B + B:D 0.8746
## 15 y ~ C + D + A:C 0.8738
## 23 y ~ A:B + B:C + A:B:D + A:C:D 0.8732
## 16 y ~ B + A:D + B:D 0.8720
## 17 y ~ B + C:D + A:B:C 0.8715
## 12 y ~ A + B:D 0.8713
## 19 y ~ B:C + B:D + A:B:C 0.8703
## 29 y ~ B + C + B:D + A:C:D 0.8700
## 13 y ~ A:C + C:D 0.8696
## 21 y ~ C + B:D + A:B:C 0.8694
## 14 y ~ A:B:C + B:C:D 0.8690
## 22 y ~ D + B:C + A:B:C:D 0.8688
## 31 y ~ A:B + A:C + B:C + A:B:D 0.8686
## 32 y ~ D + A:B + A:C + A:B:C 0.8683
## 24 y ~ A:D + B:D + A:B:C 0.8679
## 34 y ~ C + B:C + B:D + A:C:D 0.8666
## 35 y ~ A:D + C:D + A:B:C + A:B:D 0.8661
## 28 y ~ B + A:B + C:D 0.8651
## 30 y ~ D + A:C + B:C 0.8647
## 20 y ~ A + C:D 0.8640
## 39 y ~ C + A:B + B:C + C:D 0.8631
## 33 y ~ B + D + A:B:D 0.8628
## 42 y ~ A:B + A:C + B:C + A:D 0.8620
## 36 y ~ B + A:B + A:B:D 0.8605
## 44 y ~ C + B:C + A:D + B:D 0.8605
## 25 y ~ D + A:D 0.8603
## 45 y ~ A:D + B:D + C:D + B:C:D 0.8601
## 26 y ~ A + B:C:D 0.8600
## 27 y ~ A + A:D 0.8596
## 37 y ~ A:B + A:D + A:B:D 0.8596
## 49 y ~ A:B + B:C + A:B:D + A:B:C:D 0.8585
## 40 y ~ C + D + A:C:D 0.8576
## 41 y ~ B:C + A:D + B:C:D 0.8573
## 52 y ~ C + A:B + A:B:D + A:C:D 0.8572
## 53 y ~ B:C + A:D + B:D + C:D 0.8566
## 54 y ~ B:C + C:D + A:B:D + A:B:C:D 0.8565
## 46 y ~ B:C + A:B:C + A:B:C:D 0.8547
## 47 y ~ B:D + C:D + A:B:C:D 0.8546
## 48 y ~ B:C + A:B:C + A:B:D 0.8539
## 38 y ~ A:C + B:C:D 0.8535
## 50 y ~ B:C + C:D + A:B:C 0.8529
## 58 y ~ C + A:B:C + A:B:D + A:C:D 0.8529
## 60 y ~ B + A:C + B:C + A:D 0.8518
## 55 y ~ C + A:C + A:D 0.8502
## 43 y ~ A:B + B:C:D 0.8499
## 67 y ~ C + B:C + C:D + A:B:C:D 0.8466
## 68 y ~ B + C + A:B:C + A:B:D 0.8463
## 69 y ~ A:D + C:D + A:B:D + A:B:C:D 0.8460
## 51 y ~ A + A:C:D 0.8456
## 71 y ~ C + A:D + A:B:C + A:B:D 0.8451
## 61 y ~ C + A:C + A:C:D 0.8448
## 62 y ~ B + A:C:D + B:C:D 0.8447
## 63 y ~ D + A:B + B:C 0.8444
## 64 y ~ B + C:D + A:B:C:D 0.8443
## 72 y ~ B:C + A:D + B:D + A:C:D 0.8443
## 74 y ~ B + B:C + A:D + C:D 0.8440
## 65 y ~ B + A:B + A:B:C:D 0.8437
## 56 y ~ C:D + A:C:D 0.8436
## 76 y ~ A:B + A:C + A:B:C + A:B:D 0.8435
## 66 y ~ A:D + A:B:C + A:B:C:D 0.8425
## 57 y ~ B:D + A:B:D 0.8424
## 59 y ~ A + A:B:D 0.8398
## 82 y ~ A:B + A:C + A:D + A:B:C 0.8375
## 83 y ~ B + C + C:D + A:B:D 0.8374
## 78 y ~ C + A:D + C:D 0.8363
## 85 y ~ B + C + A:D + B:C:D 0.8362
## 79 y ~ B:C + A:C:D + B:C:D 0.8360
## 80 y ~ A:B + A:C:D + A:B:C:D 0.8346
## 88 y ~ B + B:C + C:D + A:B:D 0.8346
## 70 y ~ A:C + B:D 0.8328
## 90 y ~ C + A:B + B:C + A:B:C:D 0.8326
## 92 y ~ C + B:C + C:D + A:B:D 0.8319
## 81 y ~ B + B:D + A:C:D 0.8314
## 73 y ~ A + A:B:C:D 0.8313
## 84 y ~ B + D + A:B:C 0.8312
## 75 y ~ B:C:D + A:B:C:D 0.8311
## 77 y ~ D + A:C 0.8302
## 87 y ~ C + A:C + A:B:D 0.8284
## 96 y ~ A:D + A:B:C + A:B:D + A:C:D 0.8269
## 91 y ~ D + B:C + A:B:D 0.8260
## 94 y ~ B + D + A:B:C:D 0.8237
## 98 y ~ A:B + A:D + C:D + A:B:C 0.8226
## 86 y ~ D + A:C:D 0.8224
## 95 y ~ D + A:B:D + B:C:D 0.8224
## 89 y ~ B:D + A:B:C 0.8206
## 101 y ~ C + A:B + A:D + A:B:C:D 0.8200
## 97 y ~ B:C + A:B:D + B:C:D 0.8178
## 93 y ~ B:D + A:B:C:D 0.8176
## 103 y ~ A:D + A:B:D + A:C:D + B:C:D 0.8154
## 99 y ~ A:B + B:C + A:B:D 0.8148
## 100 y ~ A:B + A:B:D + A:C:D 0.8143
## 102 y ~ C + A:B:D + B:C:D 0.8109
## 104 y ~ A:B:C + A:C:D + A:B:C:D 0.8033
## 106 y ~ A:B + B:C + A:D + A:C:D 0.8025
## 105 y ~ B:C + A:D + B:D 0.8006
As we can see, only 106 equations meet all of the previously set criteria. We select the 10th result:
results1[10,]
## formula R2Adj
## 7 y ~ A + D 0.8806
mix.fit.all <- lm(y~A+D, data=mix1a)
summary(mix.fit.all)
##
## Call:
## lm(formula = y ~ A + D, data = mix1a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.693 -1.346 -0.775 1.228 8.043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 79.12500 4.58465 17.259 2.64e-11 ***
## A -3.77500 0.35267 -10.704 2.03e-08 ***
## D 0.25571 0.07153 3.575 0.00277 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.992 on 15 degrees of freedom
## Multiple R-squared: 0.8946, Adjusted R-squared: 0.8806
## F-statistic: 63.68 on 2 and 15 DF, p-value: 4.681e-08
We need to check whether the assumptions of the linear model are met. Please note there are just 18 data points, while theory suggests that roughly 15-40 points (depending on the distribution) are needed to reasonably judge whether a sample follows the Normal distribution.
Let’s start by checking how strongly the independent factors of the selected model are correlated.
library(psych)
round(cor(mix1a$A,mix1a$D),3)
## [1] 0
As we can see, the selected factors are not correlated.
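For completeness, a short sketch of the full correlation matrix of the four coded factors (output not shown here):
round(cor(mix1a[, c("A", "B", "C", "D")]), 3)  # pairwise correlations between the design factors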
Next, let’s inspect the diagnostic plots of the fitted linear regression model.
par(mfrow=c(2,2))
plot(mix.fit.all)
From the diagnostic plots we can assess the model assumptions:
- Residuals vs Fitted: visually, linearity seems to hold reasonably well, as the red smoothing line stays close to the dashed horizontal line.
- Normal Q-Q: the residuals look roughly normal; the formal Shapiro-Wilk test does not quite support this (with p-value < 0.05 we reject the normality of the residuals), although the value is very close to the 0.05 cut-off:
shapiro.test(mix.fit.all[['residuals']])
##
## Shapiro-Wilk normality test
##
## data: mix.fit.all[["residuals"]]
## W = 0.895, p-value = 0.04706
Next, we check the homoscedasticity of the residuals with the Breusch-Pagan test:
lmtest::bptest(mix.fit.all)
##
## studentized Breusch-Pagan test
##
## data: mix.fit.all
## BP = 1.6821, df = 2, p-value = 0.4313
With a p-value > 0.05, we cannot reject the null hypothesis of homoscedasticity.
The diagnostic plots also single out observation 8 as potentially influential; let’s quantify this with Cook’s distance:
cat(paste("Cook's Distance for point 8: ",round(cooks.distance(mix.fit.all)[8],2)))
## Cook's Distance for point 8: 0.53
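A common rule of thumb (one of several in use) flags observations with Cook’s distance above 4/n; a minimal sketch to list any such points (output not shown here):
cd <- cooks.distance(mix.fit.all)
which(cd > 4 / nrow(mix1a))  # indices of potentially influential observations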
Let’s see what happens when we remove point 8:
mix.fit.8 <- lm(y~A+D, data=mix1a[-8,])
summary(mix.fit.8)
##
## Call:
## lm(formula = y ~ A + D, data = mix1a[-8, ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2839 -0.9589 -0.0839 0.7648 4.1911
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.35381 3.29738 22.55 2.10e-12 ***
## A -3.77500 0.23899 -15.79 2.56e-10 ***
## D 0.32842 0.05132 6.40 1.65e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.028 on 14 degrees of freedom
## Multiple R-squared: 0.954, Adjusted R-squared: 0.9474
## F-statistic: 145.2 on 2 and 14 DF, p-value: 4.347e-10
par(mfrow=c(2,2))
plot(mix.fit.8)
Now both checks for normality and homoscedasticity of the residuals look even better, with considerably higher p-values for the Shapiro-Wilk and Breusch-Pagan tests:
cat(paste("- the residuals show homogeneity of variance with Breusch Pagan Test p-value = ",round(lmtest::bptest(mix.fit.8)$p.value,4)),"\n")
## - the residuals show homogeneity of variance with Breusch Pagan Test p-value = 0.7918
cat(paste("- the residuals are distributed normally - the Shapiro-Wilk test p-value = ",round(shapiro.test(as.numeric((mix.fit.8$residuals)))$p.value,4)))
## - the residuals are distributed normally - the Shapiro-Wilk test p-value = 0.9074
Also, the R²Adj value is higher. At the same time, the coefficients of the equation did not change much.
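To see this side by side, a short sketch comparing the coefficients of the two fits (output not shown here):
round(cbind(all_data = coef(mix.fit.all), without_8 = coef(mix.fit.8)), 3)  # intercept and slopes for A and D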
Very important indicators of the fit are the “Actual vs Predicted” plot and the R²Adj value. Both seem to support the validity of using linear regression in this case. Let’s see the influence of point 8:
library(latex2exp)
mix.fit.all <- lm(y~A+D, data=mix1a)
mix.fit.8 <- lm(y~A+D, data=mix1a[-8,])
par(mfrow=c(1,2))
plot(as.numeric(mix.fit.all$fitted.values), mix1a$y, main = "All data", xlab = "Predicted", ylab = "Actual", yaxt = "n")
text(67, 90, bquote(R[Adj]^2 == .(round(summary(mix.fit.all)$adj.r.squared, 3))))
abline(a = 0, b = 1)
plot(x = as.numeric(mix.fit.8$fitted.values), y = mix1a$y[-8], main = "Point 8 removed", xlab = "Predicted", ylab = "Actual")
text(67, 90, bquote(R[Adj]^2 == .(round(summary(mix.fit.8)$adj.r.squared, 3))))
abline(a = 0, b = 1)
points(x = as.numeric(mix.fit.all$fitted.values)[8], mix1a$y[8], pch = 4, cex = 1, col = "red")
text(x = as.numeric(mix.fit.all$fitted.values)[8], mix1a$y[8], labels = row.names(mix1a)[8], cex = 0.8, pos = 2, col = "red")
Let’s now check if there are any interactions between the two variables.
df <- as.data.frame(mix1a %>% select(A,D,y) %>% group_by(A,D) %>% summarise(Av=mean(y), .groups = 'keep'))
par(bg = "#fef7f2")
interaction.plot(x.factor = df[,1], #x-axis variable
trace.factor = df[,2], #variable for lines
response = df$Av, #y-axis variable
fun = median, #metric to plot
ylab = "Blending efficiency",
xlab = paste(colnames(df)[1]),
col = c("#d8c400", "#36989a", "#632340"),
lty = 1, #line type
lwd = 2, #line width
trace.label = paste0(colnames(df)[2]," level"),
main=paste("Interaction plot for pair ",colnames(df)[1],"-",colnames(df)[2])
)
From the plot above we can deduce that there is no interaction between the selected factors, which is consistent with the additive form of our linear regression model.
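As a more formal check (a minimal sketch; output not shown here), one could refit the model with the interaction term and inspect whether the A:D coefficient is statistically significant:
summary(lm(y ~ A * D, data = mix1a))  # the A:D row of the coefficient table gives the interaction's p-value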
We showed an analysis of data coming from Experimental Design, where our primary goal was to study factor effects. We saw that point 8 was an outlier, with a Cook’s distance of 0.53. When it was removed, the linear model and all of its indicators improved. In practice, the two fits (with and without point 8) do not deviate far from each other. We would need confirmatory runs to reinforce our understanding of the process and to decide which of the fits is closer to reality.
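For example, a sketch of how the final model could be used to plan such confirmatory runs; the factor settings below are hypothetical and should stay within the experimental region:
new_runs <- data.frame(A = c(3, 6), D = c(50, 70))  # hypothetical ParticleSize / BlendingTime settings
predict(mix.fit.8, newdata = new_runs, interval = "prediction")  # predicted efficiency with prediction intervals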