pth = "/Users/jheintz/Box/02_Data_Analytics_Projects_Delivered/003_Softmatter_Friction/2017-08-03 22:03:41_2_ToobaTable_Slope.csv"
pth = "/Users/jheintz/Box/02_Data_Analytics_Projects_Delivered/003_Softmatter_Friction/mydata_06_16.csv"
dt = read.csv(pth)[, -c(3:20)]  # drop columns 3:20
dt = subset(dt, system == "8kPaGel" & f_on_samp == "70nN" & half_loop == 386)
write.csv(dt, "data.csv")  # cache the filtered subset locally
dt = read.csv("data.csv")
\[\begin{aligned}
Y &= \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \dots + \beta_n X_n + \epsilon \\
\text{Model 1: } Y &= 3 + 5x + \epsilon, &\epsilon \sim N(0, 1) \\
\text{Model 2: } Y &= 3 + 5x + \epsilon, &\epsilon \sim N(0, x^2) \\
\text{Model 3: } Y &= 3 + 5x^2 + \epsilon, &\epsilon \sim N(0, 25) \\
\end{aligned}\]
set.seed(42)
# Model 1: linear mean, constant error variance
sim_1 = function(sample_size = 500) {
  set.seed(42)  # fixed seed: every call returns the same draw
  x = runif(n = sample_size) * 5
  y = 3 + 5 * x + rnorm(n = sample_size, mean = 0, sd = 1)
  data.frame(x, y)
}
# Model 2: linear mean, error sd grows with x (heteroscedastic)
sim_2 = function(sample_size = 500) {
  set.seed(42)
  x = runif(n = sample_size) * 5
  y = 3 + 5 * x + rnorm(n = sample_size, mean = 0, sd = x)
  data.frame(x, y)
}
# Model 3: quadratic mean, constant error variance
sim_3 = function(sample_size = 500) {
  set.seed(42)
  x = runif(n = sample_size) * 5
  y = 3 + 5 * x ^ 2 + rnorm(n = sample_size, mean = 0, sd = 5)
  data.frame(x, y)
}
# Fit y ~ x and overlay the fitted line on a scatter plot of the data
fit.plot = function(data = df, ttl = "Title") {
  lm_fit = lm(y ~ x, data = data)
  predicted_df <- data.frame(x = data$x, y = predict(lm_fit, data))
  ggplot() +
    geom_point(data = data, aes(y = y, x = x), colour = "dodgerblue") +
    geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
    ggtitle(ttl)
}
p1 = fit.plot(sim_1(), ttl = "Model 1")
p2 = fit.plot(sim_2(), ttl = "Model 2")
p3 = fit.plot(sim_3(), ttl = "Model 3")
grid.arrange(p1, p2, p3, nrow = 1)
# Plot residuals against fitted values, with a reference line at 0
res.plot = function(data = df, ttl = "Title") {
  lm_fit = lm(y ~ x, data = data)
  data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
  ggplot(data = data) +
    geom_point(aes(y = y, x = x), colour = "dodgerblue") +
    geom_line(aes(x = x, y = 0), colour = "darkorange", size = 1.5) +
    xlab("fitted") +
    ylab("residuals") +
    ggtitle(ttl)
}
rp1 = res.plot(data = sim_1(), ttl = "")
rp2 = res.plot(data = sim_2(), ttl = "")
rp3 = res.plot(data = sim_3(), ttl = "")
grid.arrange(rp1, rp2, rp3, nrow = 1)
We should look for two things in these plots (a base-R shortcut for producing them is sketched after this list).
• At any fitted value, the mean of the residuals should be roughly 0. If this is the case, the linearity assumption is valid. For this reason we add a horizontal line at \(y = 0\) to emphasize this point.
• At every fitted value, the spread of the residuals should be roughly the same. If this is the case, the constant variance assumption is valid.
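For a quick check without building the plot by hand, the base-R plot method for lm objects draws the same diagnostic. This is a sketch, not part of the original workflow; fit_check is a hypothetical name, and sim_2() is used only as an example.
fit_check = lm(y ~ x, data = sim_2())
plot(fit_check, which = 1)  # panel 1 of plot.lm: residuals vs. fitted values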
Model 1:
Here both checks reveal nothing remarkable: the residuals are centered at 0 at every fitted value, and their spread is roughly constant. We can assume the model is linear and the variance is constant.
Model 2:
At each fitted value the mean of the residuals is close to 0, so we can assume a linear relationship between y and x. However, the spread of the residuals clearly grows with the fitted value, so the constant variance assumption is violated.
Model 3:
This model does not meet the linearity assumption. Model 3 is an example of a model where \(Y\) is not a linear combination of the predictors. In this case the predictor is \(x\), but the model uses \(x^2\). (We'll see later that this is something a "linear" model can deal with. The fix is simple: just make \(x^2\) a predictor, and take that into account when we interpret the model's performance.)
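As a preview of that fix, worked through in full below, the squared term only needs to be wrapped in I() inside the model formula:
lm(y ~ I(x^2), data = sim_3())  # same call as the quadratic fit fitted later in this section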
Notice that the residuals of the first fit (fit_1, Model 1) appear normal. Those of the third (fit_3, Model 3) appear very non-normal. The second (fit_2, Model 2) is less clear: it has a rough bell shape, but with a very sharp peak.
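The histograms that paragraph refers to can be reproduced in a few lines (a sketch; the bin count is an arbitrary choice):
par(mfrow = c(1, 3))
hist(resid(lm(y ~ x, data = sim_1())), breaks = 20, main = "fit_1", xlab = "residuals")
hist(resid(lm(y ~ x, data = sim_2())), breaks = 20, main = "fit_2", xlab = "residuals")
hist(resid(lm(y ~ x, data = sim_3())), breaks = 20, main = "fit_3", xlab = "residuals")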
Shapiro-Wilk Test [normal distribution]
H0: The residuals are normally distributed.
H1: The residuals are not normally distributed.
shap = lapply(list(sim_1(), sim_2(), sim_3()), function(d) {
  lm_fit = lm(y ~ x, data = d)  # d has columns x and y
  unlist(shapiro.test(resid(lm_fit)))
})
shap = as.data.frame(matrix(unlist(shap), ncol = 3)[-c(3,4),])
shap = data.frame(sapply(shap, as.numeric ))
colnames(shap) = c("Sim_1", "Sim_2", "Sim_3")
rownames(shap) = c("W Statistic", "p-value")
t.kable(format(shap, scientific = T))
|             | Sim_1        | Sim_2        | Sim_3        |
|-------------|--------------|--------------|--------------|
| W Statistic | 9.985804e-01 | 9.369750e-01 | 9.764328e-01 |
| p-value     | 9.621945e-01 | 1.056352e-13 | 3.231089e-07 |
The p-values for Models 2 and 3 are < 0.05, so we reject the null hypothesis that their residuals are normally distributed. Only Model 1 has a p-value > 0.05, so for it we cannot reject the null hypothesis.
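Q-Q plots are a visual companion to the test. A minimal sketch using base R's qqnorm/qqline (not part of the original output):
par(mfrow = c(1, 3))
for (d in list(sim_1(), sim_2(), sim_3())) {
  r = resid(lm(y ~ x, data = d))
  qqnorm(r)
  qqline(r, col = "darkorange", lwd = 2)
}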
Breusch-Pagan Test [constant variance]
H0: Homoscedasticity. The errors have constant variance about the true model.
H1: Heteroscedasticity. The errors have non-constant variance about the true model.
pagan = lapply(list(sim_1(), sim_2(), sim_3()), function(d) {
  lm_fit = lm(y ~ x, data = d)
  unlist(bptest(lm_fit))
})
pagan = as.data.frame(matrix(unlist(pagan), ncol = 3))[-c(3,5),]
pagan = data.frame(sapply(pagan, as.numeric))
colnames(pagan) = c("Sim_1", "Sim_2", "Sim_3")
rownames(pagan) = c("BP Statistic","df", "p-value")
t.kable(format(pagan, scientific = T))
|              | Sim_1        | Sim_2        | Sim_3        |
|--------------|--------------|--------------|--------------|
| BP Statistic | 1.023354e+00 | 7.669284e+01 | 3.346566e-01 |
| df           | 1.000000e+00 | 1.000000e+00 | 1.000000e+00 |
| p-value      | 3.117247e-01 | 1.997261e-18 | 5.629299e-01 |

Only Model 2 has a p-value < 0.05, so we reject homoscedasticity for Model 2 alone; for Models 1 and 3 the constant variance assumption stands.
# Sim 3 fitted with y ~ x
set.seed(42)
dt = sim_3()
lm_fit = lm(y ~ x, data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
p1 = ggplot() +
geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y=y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: y ~ x")
# Sim 3 fitted with y ~ x^2 (no linear term)
set.seed(42)
dt = sim_3()
lm_fit.x2 = lm(y ~ I(x^2), data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit.x2, dt))
p2 = ggplot() +
geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y=y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: y ~ x^2")
grid.arrange(p1, p2, nrow = 1)
set.seed(42)
lm_fit = lm(y~x, data = sim_3())
data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
p.res = ggplot(data = data) +
geom_point(aes(y = y, x = x), colour = "dodgerblue") +
geom_line(aes(x = x, y=0), colour = "darkorange", size = 1.5 )+
xlab("fitted") +
ylab("residuals") +
ggtitle("y = x")
lm_fit.x2 = lm(y~I(x^2), data = sim_3())
data.x2 = data.frame(x = fitted(lm_fit.x2), y = resid(lm_fit.x2))
p.res.x2 = ggplot(data = data.x2) +
geom_point(aes(y = y, x = x), colour = "dodgerblue") +
geom_line(aes(x = x, y=0), colour = "darkorange", size = 1.5 )+
xlab("fitted") +
ylab("residuals") +
ggtitle("y = x^2")
grid.arrange(p.res, p.res.x2, nrow = 1)
H0: The residuals are normally distributed.
H1: The residuals are not normally distributed.
data = sim_3()
lm_fit = lm(y ~ x, data = data)
lm_fit.x2 = lm(y ~ I(x^2), data = data)
shap = list(unlist(shapiro.test(resid(lm_fit))), unlist(shapiro.test(resid(lm_fit.x2))))
shap = as.data.frame(matrix(unlist(shap), ncol = 2)[-c(3,4),])
shap = data.frame(sapply(shap, as.numeric ))
colnames(shap) = c("y = x", "Y = x^2")
rownames(shap) = c("W Statistic", "p-value")
t.kable(format(shap, scientific = T))
|             | y ~ x        | y ~ x^2      |
|-------------|--------------|--------------|
| W Statistic | 9.764328e-01 | 9.986650e-01 |
| p-value     | 3.231089e-07 | 9.728471e-01 |

With \(x^2\) as the predictor, the Shapiro-Wilk p-value rises to 0.97, so we can no longer reject normality of the residuals.
list(summary(lm_fit), summary(lm_fit.x2))
## [[1]]
##
## Call:
## lm(formula = y ~ x, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.910 -8.489 -1.503 7.584 31.861
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.4688 0.9478 -18.43 <0.0000000000000002 ***
## x 24.8767 0.3323 74.86 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.88 on 498 degrees of freedom
## Multiple R-squared: 0.9184, Adjusted R-squared: 0.9182
## F-statistic: 5604 on 1 and 498 DF, p-value: < 0.00000000000000022
##
##
## [[2]]
##
## Call:
## lm(formula = y ~ I(x^2), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.8156 -3.1337 0.0467 3.4449 16.4135
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.76349 0.32259 8.566 <0.0000000000000002 ***
## I(x^2) 4.99839 0.02908 171.855 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.903 on 498 degrees of freedom
## Multiple R-squared: 0.9834, Adjusted R-squared: 0.9834
## F-statistic: 2.953e+04 on 1 and 498 DF, p-value: < 0.00000000000000022
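The quadratic fit cuts the residual standard error from 10.88 to 4.90 and raises \(R^2\) from 0.918 to 0.983. To pull just those headline numbers out of the two summaries, one could use a small convenience snippet (a sketch, not part of the original output):
data.frame(
  model = c("y ~ x", "y ~ I(x^2)"),
  sigma = c(summary(lm_fit)$sigma, summary(lm_fit.x2)$sigma),
  r.squared = c(summary(lm_fit)$r.squared, summary(lm_fit.x2)$r.squared)
)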
# Sim 2 fitted with y ~ x
set.seed(42)
dt = sim_2()
lm_fit = lm(y ~ x, data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
p1 = ggplot() +
geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y=y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: y ~ x")
# Sim 2 fitted with log(y) ~ x
set.seed(42)
dt = sim_2()
lm_fit.log = lm(log(y) ~ x, data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit.log, dt))
p2 = ggplot() +
geom_point(data = dt, aes(y = log(y), x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: log(y) ~ x")
grid.arrange(p1, p2, nrow = 1)
pagan = list(unlist(bptest(lm_fit)), unlist(bptest(lm_fit.log)))
pagan = as.data.frame(matrix(unlist(pagan), ncol = 2))[-c(3,5),]
pagan = data.frame(sapply(pagan, as.numeric))
colnames(pagan) = c("y ~ x", "log(y) ~ x")
rownames(pagan) = c("BP Statistic","df", "p-value")
t.kable(format(pagan, scientific = T))
|              | y ~ x        | log(y) ~ x   |
|--------------|--------------|--------------|
| BP Statistic | 7.669284e+01 | 6.867532e-01 |
| df           | 1.000000e+00 | 1.000000e+00 |
| p-value      | 1.997261e-18 | 4.072709e-01 |
H0: Homoscedasticity. The errors have constant variance about the true model.
H1: Heteroscedasticity. The errors have non-constant variance about the true model.
For y ~ x the p-value is < 0.05, so we reject the null hypothesis: the variance is not constant.
For log(y) ~ x the p-value is > 0.05, so we cannot reject the null hypothesis of constant variance.
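Why the log transform works here (a standard delta-method argument, not stated in the original): for a positive random variable,
\[\text{Var}(\log Y) \approx \frac{\text{Var}(Y)}{(\text{E}[Y])^2}\]
In Model 2 the error standard deviation is \(x\) while the mean is \(3 + 5x\), so away from 0 the standard deviation is roughly proportional to the mean, and taking logs makes the variance approximately constant.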
list(summary(lm_fit), summary(lm_fit.log))
## [[1]]
##
## Call:
## lm(formula = y ~ x, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3385 -1.1284 0.0469 1.3153 14.5049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.94492 0.25144 11.71 <0.0000000000000002 ***
## x 4.96439 0.08816 56.31 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.886 on 498 degrees of freedom
## Multiple R-squared: 0.8643, Adjusted R-squared: 0.864
## F-statistic: 3171 on 1 and 498 DF, p-value: < 0.00000000000000022
##
##
## [[2]]
##
## Call:
## lm(formula = log(y) ~ x, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93242 -0.13026 0.01562 0.16094 0.47377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.621743 0.018871 85.94 <0.0000000000000002 ***
## x 0.381594 0.006616 57.67 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2166 on 498 degrees of freedom
## Multiple R-squared: 0.8698, Adjusted R-squared: 0.8695
## F-statistic: 3326 on 1 and 498 DF, p-value: < 0.00000000000000022
If we can accept restricting the data set to \(x \ge 1\), we can also compare both models on that subset:
# Sim 2 restricted to x >= 1, fitted with y ~ x
set.seed(42)
dt = sim_2()
dt = subset(dt, x >= 1)
lm_fit = lm(y ~ x, data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
p1 = ggplot() +
geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y=y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: y ~ x")
# Sim 2 restricted to x >= 1, fitted with log(y) ~ x
lm_fit.log = lm(log(y) ~ x, data = dt)
predicted_df <- data.frame(x = dt$x, y = predict(lm_fit.log, dt))
p2 = ggplot() +
geom_point(data = dt, aes(y = log(y), x = x), colour = "dodgerblue") +
geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5 )+
ggtitle("Sim 3: log(y) ~ x")
grid.arrange(p1, p2, nrow = 1)
Residuals
set.seed(42)
dt = sim_2()
dt = subset(dt, x >= 1)
lm_fit = lm(y~x, data = dt)
data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
p.res = ggplot(data = data) +
geom_point(aes(y = y, x = x), colour = "dodgerblue") +
geom_line(aes(x = x, y=0), colour = "darkorange", size = 1.5 )+
xlab("fitted") +
ylab("residuals") +
ggtitle("")
lm_fit.log = lm(log(y)~x, data = dt)
data.x2 = data.frame(x = fitted(lm_fit.log), y = resid(lm_fit.log))
p.res.x2 = ggplot(data = data.x2) +
geom_point(aes(y = y, x = x), colour = "dodgerblue") +
geom_line(aes(x = x, y=0), colour = "darkorange", size = 1.5 )+
xlab("fitted") +
ylab("residuals") +
ggtitle("")
grid.arrange(p.res, p.res.x2, nrow = 1)
list(summary(lm_fit), summary(lm_fit.log))
## [[1]]
##
## Call:
## lm(formula = y ~ x, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.3913 -1.6822 0.0305 1.7238 14.4690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.7933 0.4682 5.966 0.00000000553 ***
## x 5.0068 0.1449 34.545 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.272 on 384 degrees of freedom
## Multiple R-squared: 0.7566, Adjusted R-squared: 0.7559
## F-statistic: 1193 on 1 and 384 DF, p-value: < 0.00000000000000022
##
##
## [[2]]
##
## Call:
## lm(formula = log(y) ~ x, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81256 -0.10397 0.01253 0.12913 0.44778
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.932636 0.025627 75.42 <0.0000000000000002 ***
## x 0.292341 0.007933 36.85 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1791 on 384 degrees of freedom
## Multiple R-squared: 0.7796, Adjusted R-squared: 0.779
## F-statistic: 1358 on 1 and 384 DF, p-value: < 0.00000000000000022
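A natural follow-up, not shown in the original output, is to re-run the Breusch-Pagan test on these two subset fits; under the assumption that lm_fit and lm_fit.log from the chunk above are still in scope, that is just:
bptest(lm_fit)      # y ~ x, restricted to x >= 1
bptest(lm_fit.log)  # log(y) ~ x, restricted to x >= 1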