pth = "/Users/jheintz/Box/02_Data_Analytics_Projects_Delivered/003_Softmatter_Friction/2017-08-03 22:03:41_2_ToobaTable_Slope.csv"
pth = "/Users/jheintz/Box/02_Data_Analytics_Projects_Delivered/003_Softmatter_Friction/mydata_06_16.csv"

dt = read.csv(pth)[,-c(3:20)]
dt = subset(dt, system == "8kPaGel" & f_on_samp == "70nN" & half_loop == 386)
write.csv(dt, "data.csv")
dt = read.csv('data.csv')

   

Scenarios

\[\begin{aligned} Y &= \beta_0 + \beta_1 X_1 + \beta_2 X_2 + \beta_3 X_3 + \dots + \beta_n X_n + \epsilon\\ \\ \text{Model 1: } Y &= 3 + 5x + \epsilon, &\epsilon \sim N(0,1)\\ \text{Model 2: } Y &= 3 + 5x + \epsilon, &\epsilon \sim N(0,x^2)\\ \text{Model 3: } Y &= 3 + 5x^2 + \epsilon, &\epsilon \sim N(0,25)\\ \end{aligned}\]

# required packages: ggplot2 (plots), gridExtra (grid.arrange), lmtest (bptest)
library(ggplot2)
library(gridExtra)
library(lmtest)

set.seed(42)
# Model 1: linear mean, constant error variance
sim_1 = function(sample_size = 500) {
      set.seed(42)
      x = runif(n = sample_size) * 5
      y = 3 + 5 * x + rnorm(n = sample_size, mean = 0, sd = 1)
      data.frame(x, y)
}
# Model 2: linear mean, error standard deviation grows with x
sim_2 = function(sample_size = 500) {
      set.seed(42)
      x = runif(n = sample_size) * 5
      y = 3 + 5 * x + rnorm(n = sample_size, mean = 0, sd = x)
      data.frame(x, y)
}
# Model 3: quadratic mean, constant error variance
sim_3 = function(sample_size = 500) {
      set.seed(42)
      x = runif(n = sample_size) * 5
      y = 3 + 5 * x ^ 2 + rnorm(n = sample_size, mean = 0, sd = 5)
      data.frame(x, y)
}
# fit a simple linear model and plot the data with the fitted line
fit.plot = function(data = df, ttl = "Title"){
      set.seed(42)
      lm_fit = lm(y ~ x, data = data)
      predicted_df <- data.frame(x = data$x, y = predict(lm_fit, data))
      ggplot() + 
            geom_point(data = data, aes(y = y, x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle(ttl)
}


p1 = fit.plot(sim_1(), ttl = "Model 1")
p2 = fit.plot(sim_2(), ttl = "Model 2")
p3 = fit.plot(sim_3(), ttl = "Model 3")

grid.arrange(p1, p2, p3, nrow = 1)

Residuals Plot

# fit a simple linear model and plot residuals against fitted values
res.plot = function(data = df, ttl = "Title"){
      set.seed(42)
      lm_fit = lm(y ~ x, data = data)
      data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
      ggplot(data = data) + 
            geom_point(aes(y = y, x = x), colour = "dodgerblue") +
            geom_hline(yintercept = 0, colour = "darkorange", size = 1.5) +
            xlab("fitted") + 
            ylab("residuals") +
            ggtitle(ttl)
}

rp1 = res.plot(data = sim_1(), ttl = "")
rp2 = res.plot(data = sim_2(), ttl = "")
rp3 = res.plot(data = sim_3(), ttl = "")

grid.arrange(rp1, rp2, rp3, nrow = 1)

We should look for two things in this plot.
• At any fitted value, the mean of the residuals should be roughly 0. If this is the case, the linearity assumption is valid. For this reason, we generally add a horizontal line at y = 0 to emphasize this point.
• At every fitted value, the spread of the residuals should be roughly the same. If this is the case, the constant variance assumption is valid.
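
As an aside, base R provides these diagnostics out of the box via plot.lm; a minimal sketch, using the sim_1 data as an example:

lm_fit = lm(y ~ x, data = sim_1())
par(mfrow = c(1, 2))     # two panels side by side
plot(lm_fit, which = 1)  # residuals vs fitted
plot(lm_fit, which = 2)  # normal Q-Q plot of residuals
par(mfrow = c(1, 1))     # reset the plotting layout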

Model 1:
Both checks reveal nothing remarkable. We can assume the model is linear and the variance is constant.

Model 2:
At each fitted value the mean of the residuals is close to 0, so we can assume linearity between y and x. However, the spread of the residuals grows with the fitted values: the constant variance assumption is violated.

Model 3:
This model does not meet the linearity assumption. Model 3 is an example of a model where Y is not a linear combination of the predictor x: the predictor is x, but the data-generating model uses \(x^2\). (We'll see later that this is something a "linear" model can deal with. The fix is simple: just make \(x^2\) a predictor, and account for that when interpreting the model.)
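
As a preview of the Transformations section below, that fix really is a one-liner:

# wrap the transformation in I() so lm() treats x^2 as the predictor
lm(y ~ I(x^2), data = sim_3())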

Histogram of Residuals

Notice that the first histogram, for Model 1, appears normal. The third, for Model 3, appears very non-normal. Model 2, however, is less clear: its histogram has a rough bell shape, but also a very sharp peak.
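
The histograms themselves are not reproduced here; a minimal sketch of how they could be generated with the simulation functions above (hist.plot is a hypothetical helper, not defined elsewhere in this document):

# histogram of residuals from a simple linear fit
hist.plot = function(data, ttl = "") {
      lm_fit = lm(y ~ x, data = data)
      ggplot(data.frame(res = resid(lm_fit))) +
            geom_histogram(aes(x = res), bins = 30, fill = "dodgerblue") +
            xlab("residuals") +
            ggtitle(ttl)
}

grid.arrange(hist.plot(sim_1(), "Model 1"),
             hist.plot(sim_2(), "Model 2"),
             hist.plot(sim_3(), "Model 3"), nrow = 1)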

Testing normal distribution and variance

  • Model 1 had no violation of assumptions
  • Model 2 violated the constant variance assumption, but not linearity
  • Model 3 violated linearity, but not constant variance.
       

Shapiro-Wilk Test [normal distribution]

H0: The residuals of the fitted model are normally distributed.
H1: The residuals are not normally distributed.

shap = lapply(list(sim_1(), sim_2(), sim_3()), function(d){
                  lm_fit = lm(y ~ x, data = d)
                  unlist(shapiro.test(resid(lm_fit)))
      })

shap = as.data.frame(matrix(unlist(shap), ncol = 3)[-c(3,4),])
shap = data.frame(sapply(shap, as.numeric ))
colnames(shap) = c("Sim_1", "Sim_2", "Sim_3")
rownames(shap) = c("W Statistic", "p-value")

t.kable(format(shap, scientific = T))
            Sim_1         Sim_2         Sim_3
W Statistic 9.985804e-01  9.369750e-01  9.764328e-01
p-value     9.621945e-01  1.056352e-13  3.231089e-07

The p-value is < 0.05 for models 2 and 3, so we reject the null hypothesis that their residuals are normally distributed. Only for model 1 is the p-value > 0.05, so we cannot reject the null hypothesis.
   

Breusch-Pagan Test [constant variance]

H0: Homoscedasticity. The errors have constant variance about the true model.
H1: Heteroscedasticity. The errors have non-constant variance about the true model.

pagan = lapply(list(sim_1(), sim_2(), sim_3()), function(d){
                  lm_fit = lm(y ~ x, data = d)
                  unlist(bptest(lm_fit))
      })

pagan = as.data.frame(matrix(unlist(pagan), ncol = 3))[-c(3,5),]
pagan = data.frame(sapply(pagan, as.numeric))

colnames(pagan) = c("Sim_1", "Sim_2", "Sim_3")
rownames(pagan) = c("BP Statistic","df", "p-value")

t.kable(format(pagan, scientific = T))
             Sim_1         Sim_2         Sim_3
BP Statistic 1.023354e+00  7.669284e+01  3.346566e-01
df           1.000000e+00  1.000000e+00  1.000000e+00
p-value      3.117247e-01  1.997261e-18  5.629299e-01

Only model 2 rejects homoscedasticity (p < 0.05), exactly matching the simulation design.

Quiz

  1. What could it actually mean for you when the residuals of your fitted model are not normally distributed but show a clear pattern?
    • 10 points (if correct)
    • 20 points (if wrong)
    • 50 points (if no response at all)
       
       
  2. Why is non-constant variance not necessarily bad, or can it even be useful?
    1. because we need iid (independent and identically distributed) errors for statistical tests, e.g. the t-test or z-test (see the sketch after this list)
    2. because it could tell us something we didn't see at first in the relationship y = f(x)
    3. it tells us we need a better model to make reliable predictions
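
To make point 1 concrete, here is a minimal sketch comparing naive and heteroscedasticity-robust standard errors for Model 2; it assumes the sandwich package, which is not used elsewhere in this document:

library(sandwich)  # assumption: sandwich is installed
lm_fit = lm(y ~ x, data = sim_2())
coeftest(lm_fit)                                         # naive standard errors
coeftest(lm_fit, vcov = vcovHC(lm_fit, type = "HC1"))    # robust standard errors

Under heteroscedasticity the naive standard errors are unreliable, so the robust and naive intervals can disagree noticeably.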

Transformations

Model 3 (non-linear)

# naive fit: y ~ x
      set.seed(42)
      dt = sim_3()
      lm_fit = lm(y ~ x, data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
      p1 = ggplot() + 
            geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 3:  y ~ x")

      # transformed fit: y ~ x^2
      set.seed(42)
      dt = sim_3()
      lm_fit_x2 = lm(y ~ I(x^2), data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit_x2, dt))
      p2 = ggplot() + 
            geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 3: y ~ x^2")

      
grid.arrange(p1, p2, nrow = 1)

set.seed(42)

# residuals vs fitted for the naive fit
lm_fit = lm(y ~ x, data = sim_3())
data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
p.res =     ggplot(data = data) + 
            geom_point(aes(y = y, x = x), colour = "dodgerblue") +
            geom_hline(yintercept = 0, colour = "darkorange", size = 1.5) +
            xlab("fitted") + 
            ylab("residuals") +
            ggtitle("y ~ x")

# residuals vs fitted for the transformed fit
lm_fit.x2 = lm(y ~ I(x^2), data = sim_3())
data.x2 = data.frame(x = fitted(lm_fit.x2), y = resid(lm_fit.x2))
p.res.x2 =  ggplot(data = data.x2) + 
            geom_point(aes(y = y, x = x), colour = "dodgerblue") +
            geom_hline(yintercept = 0, colour = "darkorange", size = 1.5) +
            xlab("fitted") + 
            ylab("residuals") +
            ggtitle("y ~ x^2")

grid.arrange(p.res, p.res.x2, nrow = 1)

Test for Linearity

Strictly speaking, the Shapiro-Wilk test checks normality of the residuals; here it serves as an indirect check of the linear specification, since a misspecified mean function leaves a pattern in the residuals.

H0: The residuals are normally distributed (consistent with a correctly specified model).
H1: The residuals are not normally distributed, suggesting the specification is questionable.

data = sim_3()

lm_fit = lm(y ~ x, data = data)
lm_fit.x2 = lm(y ~ I(x^2), data = data)

shap = list(unlist(shapiro.test(resid(lm_fit))), unlist(shapiro.test(resid(lm_fit.x2))))

shap = as.data.frame(matrix(unlist(shap), ncol = 2)[-c(3,4),])
shap = data.frame(sapply(shap, as.numeric ))
colnames(shap) = c("y = x", "Y = x^2")
rownames(shap) = c("W Statistic", "p-value")

t.kable(format(shap, scientific = T))
            y ~ x         y ~ x^2
W Statistic 9.764328e-01  9.986650e-01
p-value     3.231089e-07  9.728471e-01

With \(x^2\) as the predictor, the Shapiro-Wilk test no longer rejects normality of the residuals (p > 0.05), while the naive fit is clearly rejected.
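
As a complementary, direct check of the functional form, Ramsey's RESET test from the lmtest package (already loaded for bptest) could be used; a minimal sketch:

# RESET: H0 = no omitted powers of the regressors (specification is adequate)
resettest(lm(y ~ x, data = sim_3()))        # expect a small p-value (misspecified)
resettest(lm(y ~ I(x^2), data = sim_3()))   # expect a large p-value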

Model Improvements

list(summary(lm_fit), summary(lm_fit.x2))
## [[1]]
## 
## Call:
## lm(formula = y ~ x, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.910  -8.489  -1.503   7.584  31.861 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -17.4688     0.9478  -18.43 <0.0000000000000002 ***
## x            24.8767     0.3323   74.86 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.88 on 498 degrees of freedom
## Multiple R-squared:  0.9184, Adjusted R-squared:  0.9182 
## F-statistic:  5604 on 1 and 498 DF,  p-value: < 0.00000000000000022
## 
## 
## [[2]]
## 
## Call:
## lm(formula = y ~ I(x^2), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.8156  -3.1337   0.0467   3.4449  16.4135 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  2.76349    0.32259   8.566 <0.0000000000000002 ***
## I(x^2)       4.99839    0.02908 171.855 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.903 on 498 degrees of freedom
## Multiple R-squared:  0.9834, Adjusted R-squared:  0.9834 
## F-statistic: 2.953e+04 on 1 and 498 DF,  p-value: < 0.00000000000000022

Model 2 (variance stabilization)

# naive fit: y ~ x
      set.seed(42)
      dt = sim_2()
      lm_fit = lm(y ~ x, data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
      p1 = ggplot() + 
            geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 2: y ~ x")


# log-transformed response: log(y) ~ x
      set.seed(42)
      dt = sim_2()
      lm_fit.log = lm(log(y) ~ x, data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit.log, dt))
      p2 = ggplot() + 
            geom_point(data = dt, aes(y = log(y), x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 2:  log(y) ~ x")
      
grid.arrange(p1, p2, nrow = 1)

Test for constant variance


pagan = list(unlist(bptest(lm_fit)), unlist(bptest(lm_fit.log)))

pagan = as.data.frame(matrix(unlist(pagan), ncol = 2))[-c(3,5),]
pagan = data.frame(sapply(pagan, as.numeric))

colnames(pagan) = c("y ~ x", "log(y) ~ x")
rownames(pagan) = c("BP Statistic","df", "p-value")

t.kable(format(pagan, scientific = T))
             y ~ x         log(y) ~ x
BP Statistic 7.669284e+01  6.867532e-01
df           1.000000e+00  1.000000e+00
p-value      1.997261e-18  4.072709e-01

H0: Homoscedasticity. The errors have constant variance about the true model.
H1: Heteroscedasticity. The errors have non-constant variance about the true model.

For the untransformed fit the p-value is < 0.05, so we reject the null hypothesis and conclude that the variance is not constant.
For the log-transformed fit the p-value is > 0.05, so we cannot reject the null hypothesis; the variance appears constant.
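
Instead of guessing the log transformation, the Box-Cox procedure could suggest one; a minimal sketch, assuming the MASS package (not used elsewhere in this document) and a strictly positive response:

library(MASS)  # assumption: MASS is installed
bc = boxcox(lm(y ~ x, data = sim_2()), plotit = FALSE)
bc$x[which.max(bc$y)]  # lambda that maximizes the profile log-likelihood

A lambda near 0 supports a log transformation; a lambda near 1 suggests no transformation is needed.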

list(summary(lm_fit), summary(lm_fit.log))
## [[1]]
## 
## Call:
## lm(formula = y ~ x, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3385  -1.1284   0.0469   1.3153  14.5049 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  2.94492    0.25144   11.71 <0.0000000000000002 ***
## x            4.96439    0.08816   56.31 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.886 on 498 degrees of freedom
## Multiple R-squared:  0.8643, Adjusted R-squared:  0.864 
## F-statistic:  3171 on 1 and 498 DF,  p-value: < 0.00000000000000022
## 
## 
## [[2]]
## 
## Call:
## lm(formula = log(y) ~ x, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93242 -0.13026  0.01562  0.16094  0.47377 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 1.621743   0.018871   85.94 <0.0000000000000002 ***
## x           0.381594   0.006616   57.67 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2166 on 498 degrees of freedom
## Multiple R-squared:  0.8698, Adjusted R-squared:  0.8695 
## F-statistic:  3326 on 1 and 498 DF,  p-value: < 0.00000000000000022

Model Improvements

If we can accept the limitation of restricting the data set to x >= 1:

# naive fit on the restricted data: y ~ x
      set.seed(42)
      dt = sim_2()
      dt = subset(dt, x >= 1)
      
      lm_fit = lm(y ~ x, data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit, dt))
      p1 = ggplot() + 
            geom_point(data = dt, aes(y = y, x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 2: y ~ x (x >= 1)")


# log-transformed response: log(y) ~ x
      
      lm_fit.log = lm(log(y) ~ x, data = dt)
      predicted_df <- data.frame(x = dt$x, y = predict(lm_fit.log, dt))
      p2 = ggplot() + 
            geom_point(data = dt, aes(y = log(y), x = x), colour = "dodgerblue") +
            geom_line(data = predicted_df, aes(x = x, y = y), colour = "darkorange", size = 1.5) +
            ggtitle("Sim 2:  log(y) ~ x (x >= 1)")
      
grid.arrange(p1, p2, nrow = 1)

Residuals

set.seed(42)
      dt = sim_2()
      dt = subset(dt, x >= 1)

# residuals vs fitted for the naive fit
lm_fit = lm(y ~ x, data = dt)
data = data.frame(x = fitted(lm_fit), y = resid(lm_fit))
p.res =     ggplot(data = data) + 
            geom_point(aes(y = y, x = x), colour = "dodgerblue") +
            geom_hline(yintercept = 0, colour = "darkorange", size = 1.5) +
            xlab("fitted") + 
            ylab("residuals") +
            ggtitle("y ~ x")

# residuals vs fitted for the log-transformed fit
lm_fit.log = lm(log(y) ~ x, data = dt)
data.log = data.frame(x = fitted(lm_fit.log), y = resid(lm_fit.log))
p.res.log = ggplot(data = data.log) + 
            geom_point(aes(y = y, x = x), colour = "dodgerblue") +
            geom_hline(yintercept = 0, colour = "darkorange", size = 1.5) +
            xlab("fitted") + 
            ylab("residuals") +
            ggtitle("log(y) ~ x")

grid.arrange(p.res, p.res.log, nrow = 1)

list(summary(lm_fit), summary(lm_fit.log))
## [[1]]
## 
## Call:
## lm(formula = y ~ x, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -14.3913  -1.6822   0.0305   1.7238  14.4690 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)   2.7933     0.4682   5.966        0.00000000553 ***
## x             5.0068     0.1449  34.545 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.272 on 384 degrees of freedom
## Multiple R-squared:  0.7566, Adjusted R-squared:  0.7559 
## F-statistic:  1193 on 1 and 384 DF,  p-value: < 0.00000000000000022
## 
## 
## [[2]]
## 
## Call:
## lm(formula = log(y) ~ x, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81256 -0.10397  0.01253  0.12913  0.44778 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 1.932636   0.025627   75.42 <0.0000000000000002 ***
## x           0.292341   0.007933   36.85 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1791 on 384 degrees of freedom
## Multiple R-squared:  0.7796, Adjusted R-squared:  0.779 
## F-statistic:  1358 on 1 and 384 DF,  p-value: < 0.00000000000000022

t-test