Alternative 2 - Calculate the regression based on H
Hypothesis 0: D, t and N matter
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100),
"C" = numeric(100),
stringsAsFactors=FALSE)
shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
for(i in 1:160){
elements = c((((i-1)*100)+1):(i*100))
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
Total: 160 Teste de normalidade: 0 Teste de correlação: 152
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)




Hypothesis 1: D matter
#laplace distribution
dimension = c(3, 4, 5, 6)
HC.BP.regression.data = data.frame("H" = numeric(4000),
"C" = numeric(4000),
stringsAsFactors=FALSE)
shapiro.hypothesis.1 = rep(0, 4)
durbinWatson.hypothesis.1 = rep(0, 4)
lm.hypothesis.1 = array(list(), 4)
for(i in 1:length(dimension)){
HC.BP.regression.data$H = HC.BP$H[(((i-1)*4000)+1):(i*4000)]
HC.BP.regression.data$C = HC.BP$C[(((i-1)*4000)+1):(i*4000)]
lm.hypothesis.1[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.1[i] = dwtest(lm.hypothesis.1[[i]])$p.value
shapiro.hypothesis.1[i] = shapiro.test(lm.hypothesis.1[[i]]$residuals)$p.value
}
cat("Total: ", 4, " Teste de normalidade: ", length(which(shapiro.hypothesis.1 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.1 > 0.05)), '\n')
Total: 4 Teste de normalidade: 0 Teste de correlação: 4
plot(lm.hypothesis.1[[1]], which = c(1:4), pch = 20)




Hypothesis 2: t matter
#laplace distribution
delay = c(1, 2, 3, 4)
HC.BP.regression.data = data.frame("H" = numeric(4000),
"C" = numeric(4000),
stringsAsFactors=FALSE)
a = c(1:1000)
shapiro.hypothesis.2 = rep(0, 4)
durbinWatson.hypothesis.2 = rep(0, 4)
lm.hypothesis.2 = array(list(), 4)
for(i in 1:length(delay)){
a = c((((i-1) * 1000) + 1):(i * 1000))
elements = rep(c(a, a + 4000, a + 8000, a + 12000))
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.2[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.2[i] = dwtest(lm.hypothesis.2[[i]])$p.value
shapiro.hypothesis.2[i] = shapiro.test(lm.hypothesis.2[[i]]$residuals)$p.value
}
cat("Total: ", 4, " Teste de normalidade: ", length(which(shapiro.hypothesis.2 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.2 > 0.05)), '\n')
Total: 4 Teste de normalidade: 0 Teste de correlação: 3
plot(lm.hypothesis.2[[1]], which = c(1:4), pch = 20)




Hypothesis 3: N matter
#laplace distribution
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(1600),
"C" = numeric(1600),
stringsAsFactors=FALSE)
a = c(1:100)
shapiro.hypothesis.3 = rep(0, 10)
durbinWatson.hypothesis.3 = rep(0, 10)
lm.hypothesis.3 = array(list(), 10)
for(i in 1:length(N)){
a = c((((i-1) * 100) + 1):(i * 100))
elements = rep(c(a, a + 1000, a + 2000, a + 3000, a + 4000, a + 5000, a + 6000, a + 7000, a + 8000, a + 9000, a + 10000, a + 11000, a + 12000, a + 13000, a + 14000, a + 15000))
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.3[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.3[i] = dwtest(lm.hypothesis.3[[i]])$p.value
shapiro.hypothesis.3[i] = shapiro.test(lm.hypothesis.3[[i]]$residuals)$p.value
}
cat("Total: ", 10, " Teste de normalidade: ", length(which(shapiro.hypothesis.3 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.3 > 0.05)), '\n')
Total: 10 Teste de normalidade: 0 Teste de correlação: 10
plot(lm.hypothesis.3[[1]], which = c(1:4), pch = 20)




Hypothesis 4: D and t matter
#laplace distribution
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
HC.BP.regression.data = data.frame("H" = numeric(1000),
"C" = numeric(1000),
stringsAsFactors=FALSE)
shapiro.hypothesis.4 = rep(0, 16)
durbinWatson.hypothesis.4 = rep(0, 16)
lm.hypothesis.4 = array(list(), 16)
for(i in 1:(length(dimension)*length(delay))){
elements = c((((i-1) * 1000) + 1):(i * 1000))
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.4[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
shapiro.hypothesis.4[i] = shapiro.test(lm.hypothesis.4[[i]]$residuals)$p.value
durbinWatson.hypothesis.4[i] = dwtest(lm.hypothesis.4[[i]])$p.value
}
cat("Total: ", 16, " Teste de normalidade: ", length(which(shapiro.hypothesis.4 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.4 > 0.05)), '\n')
Total: 16 Teste de normalidade: 0 Teste de correlação: 15
plot(lm.hypothesis.4[[1]], which = c(1:4), pch = 20)




Hypothesis 5: D and N matter
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)
b = cc = 0
for(i in 1:length(dimension)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
}
b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 39
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)




Hypothesis 6: t and N matter
delay = c(1,2,3,4)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.6 = rep(0, 40)
lm.hypothesis.6 = array(list(), 40)
durbinWatson.hypothesis.6 = rep(0, 40)
b = cc = 0
for(i in 1:length(delay)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 4000, a + b + 8000, a + b + 12000)
HC.BP.regression.data$H = HC.BP$H[elements]
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.6[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.6[cc] = dwtest(lm.hypothesis.6[[cc]])$p.value
shapiro.hypothesis.6[cc] = shapiro.test(lm.hypothesis.6[[cc]]$residuals)$p.value
}
b = b + 1000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.6 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.6 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 39
plot(lm.hypothesis.6[[1]], which = c(1:4), pch = 20)




The problem
As seen in the normality tests above, the residues obtained do not follow a normal distribution and are therefore not i.i.d (independent and identically distributed). According to the Gauss-Markov theorem, the least squares regression is the best linear estimate of the underlying coefficients, when it is assumed that the errors are not correlated and have the same variance. However, in the presence of non-normal residues or heteroscedasticity, the estimator does not work efficiently (although it remains consistent), producing inaccurate confidence intervals.
One of the major problems with non-normality or heteroscedasticity is that the amount of error in the model generated is not consistent with the entire range of data observed, making the weight’s predictive capacity not the same over the entire dependent variable range. Thus, predictors technically mean different things at different levels of the dependent variable and cannot explain all trends in the data set.
Logarithmic transformation
Hypothesis 0: D, t and N matter
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100),
"C" = numeric(100),
stringsAsFactors=FALSE)
shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)
for(i in 1:160){
elements = c((((i-1)*100)+1):(i*100))
HC.BP.regression.data$H = log10(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]])
HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
geom_point(colour = "blue") +
geom_smooth(method="lm", formula = y~x) +
geom_segment(aes(xend = H, yend = predicted)) +
geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
Total: 160 Teste de normalidade: 0 Teste de correlação: 152
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)




Hypothesis 5: D and N matter
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)
b = cc = 0
for(i in 1:length(dimension)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
HC.BP.regression.data$H = log10(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
}
b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 39
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)




log(y/(1 − y)) transformation
Hypothesis 0: D, t and N matter
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100),
"C" = numeric(100),
stringsAsFactors=FALSE)
shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)
for(i in 1:160){
elements = c((((i-1)*100)+1):(i*100))
x = HC.BP$H[elements]
y = HC.BP$C[elements]
HC.BP.regression.data$H = log(x/(1-x))
HC.BP.regression.data$C = log(y/(1-y))
lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]])
HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
geom_point(colour = "blue") +
geom_smooth(method="lm", formula = y~x) +
geom_segment(aes(xend = H, yend = predicted)) +
geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
Total: 160 Teste de normalidade: 41 Teste de correlação: 147
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)




Hypothesis 5: D and N matter
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)
b = cc = 0
for(i in 1:length(dimension)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
x = HC.BP$H[elements]
y = HC.BP$C[elements]
HC.BP.regression.data$H = log(x/(1-x))
HC.BP.regression.data$C = log(y/(1-y))
lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
}
b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 38
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)




Square root transformation
Hypothesis 0: D, t and N matter
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100),
"C" = numeric(100),
stringsAsFactors=FALSE)
shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)
for(i in 1:160){
elements = c((((i-1)*100)+1):(i*100))
HC.BP.regression.data$H = sqrt(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]])
HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
geom_point(colour = "blue") +
geom_smooth(method="lm", formula = y~x) +
geom_segment(aes(xend = H, yend = predicted)) +
geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
Total: 160 Teste de normalidade: 0 Teste de correlação: 150
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)




Hypothesis 5: D and N matter
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)
b = cc = 0
for(i in 1:length(dimension)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
HC.BP.regression.data$H = sqrt(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
}
b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 39
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)




Inverse transformation
Hypothesis 0: D, t and N matter
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100),
"C" = numeric(100),
stringsAsFactors=FALSE)
shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)
for(i in 1:160){
elements = c((((i-1)*100)+1):(i*100))
HC.BP.regression.data$H = 1/(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]])
HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
geom_point(colour = "blue") +
geom_smooth(method="lm", formula = y~x) +
geom_segment(aes(xend = H, yend = predicted)) +
geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
Total: 160 Teste de normalidade: 0 Teste de correlação: 154
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)




Hypothesis 5: D and N matter
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400),
"C" = numeric(400),
stringsAsFactors=FALSE)
shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)
b = cc = 0
for(i in 1:length(dimension)){
for(j in 1:length(N)){
cc = cc + 1
a = c((((j - 1) * 100) + 1):(j * 100))
elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
HC.BP.regression.data$H = 1/(HC.BP$H[elements])
HC.BP.regression.data$C = HC.BP$C[elements]
lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
}
b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
Total: 40 Teste de normalidade: 0 Teste de correlação: 38
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)




Solution 2: Change the model
Another alternative is to find a model that fully explains the system’s behavior. For that, one option is to use generalized linear models if we want to relax in normality.
MLGs (Generalized Linear Models) are an extension of ordinary least squares regression models. They make it possible to use other distributions for errors and a link function relating the average of the response variable to the linear combination of the explanatory variables.
Solution 3: Alternative residuals distribution
To obtain an estimate of the model parameters, it is necessary to make assumptions about the distribution of its residues. In the traditional implementation of the linear regression algorithm, using the method of ordinary least squares, this assumption is that the residuals are normally distributed.
In this way, it is possible to make other assumptions of the residues, defining an alternative distribution of the residues through:
- Generalized mixed methods;
- Maximum likelihood estimators;
- Bayesian statistics.
Another option is to use the absolute error as a loss function, since it is the maximum likelihood estimator of the Laplace distribution.
---
title: "Report 1 - The study of the delay's influence and residue analysis"
author: "Eduarda Chagas"
date: "Apr 17, 2020"
output:
  html_notebook: default
  pdf_document: default
---

```{r}
require(lmtest)
require(ggplot2)
require(ggExtra)
require(ggthemes)
require(ggpubr)
require(hrbrthemes)
require(extrafont)
require(gridExtra)
require(ggrepel)
require(lawstat)
require(nortest)
require(NormalLaplace)

theme_set(theme_ipsum(base_family = "Times New Roman", 
              base_size = 10, axis_title_size = 10))
```


##The study of the influence of parameters

To this analysis we used the data set with 16.000 samples, applying all possible configurations with $D \in \{3, 4, 5, 6\}$ and $\tau \in \{1, 2, 3, 4\}$ to the descriptors. 

```{r}
HC.BP = data.frame("H" = numeric(16000), 
                   "C" = numeric(16000),
                   "Dist" = numeric(16000),
                   "D" = numeric(16000),
                   "t" = numeric(16000), 
                   "N" = numeric(16000), 
                   stringsAsFactors=FALSE)

HC.BP$N = as.factor(rep(c(rep(1e+04, 100), rep(2e+04, 100), rep(3e+04, 100), rep(4e+04, 100), rep(5e+04, 100), rep(6e+04, 100), rep(7e+04, 100), rep(8e+04, 100), rep(9e+04, 100), rep(1e+05, 100)), 16))

HC.BP$t = as.factor(rep(c(rep(1,1000), rep(2,1000), rep(3,1000), rep(4,1000)), 4))

file.csv = data.frame(read.csv("../Data/HC_series_fk0_16000.csv"))

HC.BP$H = file.csv[,1]
HC.BP$C = file.csv[,2]
HC.BP$Dist = HC.BP$C / HC.BP$H
HC.BP$D= as.factor(file.csv[,3])

summary(HC.BP)
```

##Alternative 1 - Calculate the regression as a function of H, D, t and N.

```{r}
lm.alternative.1 = lm(data = HC.BP, formula = C ~ H + D + t + N)
summary(lm.alternative.1)
```

```{r}
ad.test(lm.alternative.1$residuals)
plot(lm.alternative.1, which = c(1:4), pch = 20)
hist(lm.alternative.1$residuals, breaks = 500)
```

##Alternative 2 - Calculate the regression based on H 

###Hypothesis 0: D, t and N matter

```{r}
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100), 
                                   "C" = numeric(100),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)

for(i in 1:160){
  elements = c((((i-1)*100)+1):(i*100))
  HC.BP.regression.data$H = HC.BP$H[elements]
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
  shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 1: D matter

```{r}
#laplace distribution
dimension = c(3, 4, 5, 6)
HC.BP.regression.data = data.frame("H" = numeric(4000), 
                                   "C" = numeric(4000),
                                   stringsAsFactors=FALSE)
shapiro.hypothesis.1 = rep(0, 4)
durbinWatson.hypothesis.1 = rep(0, 4)
lm.hypothesis.1 = array(list(), 4)

for(i in 1:length(dimension)){
  HC.BP.regression.data$H = HC.BP$H[(((i-1)*4000)+1):(i*4000)]
  HC.BP.regression.data$C = HC.BP$C[(((i-1)*4000)+1):(i*4000)]
  lm.hypothesis.1[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.1[i] = dwtest(lm.hypothesis.1[[i]])$p.value
  shapiro.hypothesis.1[i] = shapiro.test(lm.hypothesis.1[[i]]$residuals)$p.value
}
cat("Total: ", 4, " Teste de normalidade: ", length(which(shapiro.hypothesis.1 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.1 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.1[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 2: t matter

```{r}
#laplace distribution
delay = c(1, 2, 3, 4)
HC.BP.regression.data = data.frame("H" = numeric(4000), 
                                   "C" = numeric(4000),
                                   stringsAsFactors=FALSE)
a = c(1:1000)
shapiro.hypothesis.2 = rep(0, 4)
durbinWatson.hypothesis.2 = rep(0, 4)
lm.hypothesis.2 = array(list(), 4)

for(i in 1:length(delay)){
  a = c((((i-1) * 1000) + 1):(i * 1000))
  elements = rep(c(a, a + 4000, a + 8000, a + 12000))
  HC.BP.regression.data$H = HC.BP$H[elements]
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.2[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.2[i] = dwtest(lm.hypothesis.2[[i]])$p.value
  shapiro.hypothesis.2[i] = shapiro.test(lm.hypothesis.2[[i]]$residuals)$p.value
}
cat("Total: ", 4, " Teste de normalidade: ", length(which(shapiro.hypothesis.2 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.2 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.2[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 3: N matter

```{r}
#laplace distribution
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(1600), 
                                   "C" = numeric(1600),
                                   stringsAsFactors=FALSE)
a = c(1:100)
shapiro.hypothesis.3 = rep(0, 10)
durbinWatson.hypothesis.3 = rep(0, 10)
lm.hypothesis.3 = array(list(), 10)

for(i in 1:length(N)){
  a = c((((i-1) * 100) + 1):(i * 100))
  elements = rep(c(a, a + 1000, a + 2000, a + 3000, a + 4000, a + 5000, a + 6000, a + 7000, a + 8000, a + 9000, a + 10000, a + 11000, a + 12000, a + 13000, a + 14000, a + 15000))
  HC.BP.regression.data$H = HC.BP$H[elements]
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.3[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.3[i] = dwtest(lm.hypothesis.3[[i]])$p.value
  shapiro.hypothesis.3[i] = shapiro.test(lm.hypothesis.3[[i]]$residuals)$p.value
}
cat("Total: ", 10, " Teste de normalidade: ", length(which(shapiro.hypothesis.3 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.3 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.3[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 4: D and t matter

```{r}
#laplace distribution
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
HC.BP.regression.data = data.frame("H" = numeric(1000), 
                                   "C" = numeric(1000),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.4 = rep(0, 16)
durbinWatson.hypothesis.4 = rep(0, 16)
lm.hypothesis.4 = array(list(), 16)

for(i in 1:(length(dimension)*length(delay))){
  elements = c((((i-1) * 1000) + 1):(i * 1000))
  HC.BP.regression.data$H = HC.BP$H[elements]
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.4[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  shapiro.hypothesis.4[i] = shapiro.test(lm.hypothesis.4[[i]]$residuals)$p.value
  durbinWatson.hypothesis.4[i] = dwtest(lm.hypothesis.4[[i]])$p.value
}
cat("Total: ", 16, " Teste de normalidade: ", length(which(shapiro.hypothesis.4 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.4 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.4[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 5: D and N matter

```{r}
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)

b = cc = 0
for(i in 1:length(dimension)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
    HC.BP.regression.data$H = HC.BP$H[elements]
    HC.BP.regression.data$C = HC.BP$C[elements]
    lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
    shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
  }
  b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 6: t and N matter

```{r}
delay = c(1,2,3,4)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.6 = rep(0, 40)
lm.hypothesis.6 = array(list(), 40)
durbinWatson.hypothesis.6 = rep(0, 40)

b = cc = 0
for(i in 1:length(delay)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 4000, a + b + 8000, a + b + 12000)
    HC.BP.regression.data$H = HC.BP$H[elements]
    HC.BP.regression.data$C = HC.BP$C[elements]
    lm.hypothesis.6[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.6[cc] = dwtest(lm.hypothesis.6[[cc]])$p.value
    shapiro.hypothesis.6[cc] = shapiro.test(lm.hypothesis.6[[cc]]$residuals)$p.value
  }
  b = b + 1000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.6 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.6 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.6[[1]], which = c(1:4), pch = 20)
```

###The problem

As seen in the normality tests above, the residues obtained do not follow a normal distribution and are therefore not i.i.d (independent and identically distributed).
According to the Gauss-Markov theorem, the least squares regression is the best linear estimate of the underlying coefficients, when it is assumed that the errors are not correlated and have the same variance.
However, in the presence of non-normal residues or heteroscedasticity, the estimator does not work efficiently (although it remains consistent), producing inaccurate confidence intervals.

One of the major problems with non-normality or heteroscedasticity is that the amount of error in the model generated is not consistent with the entire range of data observed, making the weight's predictive capacity not the same over the entire dependent variable range.
Thus, predictors technically mean different things at different levels of the dependent variable and cannot explain all trends in the data set.

###Possible solutions

####Solution 1: Transformation of the dependent variable

The transformation of the dependent variable can be an option to correct the problem encountered, but at the same time it makes the interpretation of the general model a little more opaque.

The strength of the transformations tends to follow the order below:

1. Logarithmic
2. Square root
3. Reverse

###Logarithmic transformation

###Hypothesis 0: D, t and N matter

```{r}
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100), 
                                   "C" = numeric(100),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)

for(i in 1:160){
  elements = c((((i-1)*100)+1):(i*100))
  HC.BP.regression.data$H = log10(HC.BP$H[elements])
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
  shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
  HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]]) 
  HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
  plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
                                geom_point(colour = "blue") +
                                geom_smooth(method="lm", formula = y~x) +
                                geom_segment(aes(xend = H, yend = predicted)) +
                                geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 5: D and N matter

```{r}
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)

b = cc = 0
for(i in 1:length(dimension)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
    HC.BP.regression.data$H = log10(HC.BP$H[elements])
    HC.BP.regression.data$C = HC.BP$C[elements]
    lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
    shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
  }
  b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)
```


### log(y/(1 − y)) transformation

###Hypothesis 0: D, t and N matter

```{r}
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100), 
                                   "C" = numeric(100),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)

for(i in 1:160){
  elements = c((((i-1)*100)+1):(i*100))
  x = HC.BP$H[elements]
  y = HC.BP$C[elements]
  HC.BP.regression.data$H = log(x/(1-x))
  HC.BP.regression.data$C = log(y/(1-y))
  lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
  shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
  HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]]) 
  HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
  plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
                                geom_point(colour = "blue") +
                                geom_smooth(method="lm", formula = y~x) +
                                geom_segment(aes(xend = H, yend = predicted)) +
                                geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 5: D and N matter

```{r}
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)

b = cc = 0
for(i in 1:length(dimension)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
    x = HC.BP$H[elements]
    y = HC.BP$C[elements]
    HC.BP.regression.data$H = log(x/(1-x))
    HC.BP.regression.data$C = log(y/(1-y))
    lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
    shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
  }
  b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)
```

###Square root transformation

###Hypothesis 0: D, t and N matter

```{r}
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100), 
                                   "C" = numeric(100),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)

for(i in 1:160){
  elements = c((((i-1)*100)+1):(i*100))
  HC.BP.regression.data$H = sqrt(HC.BP$H[elements])
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
  shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
  HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]]) 
  HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
  plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
                                geom_point(colour = "blue") +
                                geom_smooth(method="lm", formula = y~x) +
                                geom_segment(aes(xend = H, yend = predicted)) +
                                geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 5: D and N matter

```{r}
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)

b = cc = 0
for(i in 1:length(dimension)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
    HC.BP.regression.data$H = sqrt(HC.BP$H[elements])
    HC.BP.regression.data$C = HC.BP$C[elements]
    lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
    shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
  }
  b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)
```

###Inverse transformation

###Hypothesis 0: D, t and N matter

```{r}
delay = c(1,2,3,4)
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(100), 
                                   "C" = numeric(100),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.0 = rep(0, 160)
durbinWatson.hypothesis.0 = rep(0, 160)
lm.hypothesis.0 = array(list(), 160)
plots.hypothesis.0 = array(list(), 160)

for(i in 1:160){
  elements = c((((i-1)*100)+1):(i*100))
  HC.BP.regression.data$H = 1/(HC.BP$H[elements])
  HC.BP.regression.data$C = HC.BP$C[elements]
  lm.hypothesis.0[[i]] = lm(data = HC.BP.regression.data, formula = C ~ H)
  durbinWatson.hypothesis.0[i] = dwtest(lm.hypothesis.0[[i]])$p.value
  shapiro.hypothesis.0[i] = shapiro.test(lm.hypothesis.0[[i]]$residuals)$p.value
  HC.BP.regression.data$predicted = predict(lm.hypothesis.0[[i]]) 
  HC.BP.regression.data$residuals = residuals(lm.hypothesis.0[[i]])
  plots.hypothesis.0[[i]] = ggplot(HC.BP.regression.data, aes(x = H, y = C)) +
                                geom_point(colour = "blue") +
                                geom_smooth(method="lm", formula = y~x) +
                                geom_segment(aes(xend = H, yend = predicted)) +
                                geom_point(aes(y = predicted), shape = 1)
}
cat("Total: ", 160, " Teste de normalidade: ", length(which(shapiro.hypothesis.0 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.0 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.0[[1]], which = c(1:4), pch = 20)
```

###Hypothesis 5: D and N matter

```{r}
dimension = c(3,4,5,6)
N = c(1e+04, 2e+04, 3e+04, 4e+04, 5e+04, 6e+04, 7e+04, 8e+04, 9e+04, 1e+05)
HC.BP.regression.data = data.frame("H" = numeric(400), 
                                   "C" = numeric(400),
                                   stringsAsFactors=FALSE)

shapiro.hypothesis.5 = rep(0, 40)
durbinWatson.hypothesis.5 = rep(0, 40)
lm.hypothesis.5 = array(list(), 40)

b = cc = 0
for(i in 1:length(dimension)){
  for(j in 1:length(N)){
    cc = cc + 1
    a = c((((j - 1) * 100) + 1):(j * 100))
    elements = c(a + b, a + b + 1000, a + b + 2000, a + b + 3000)
    HC.BP.regression.data$H = 1/(HC.BP$H[elements])
    HC.BP.regression.data$C = HC.BP$C[elements]
    lm.hypothesis.5[[cc]] = lm(data = HC.BP.regression.data, formula = C ~ H)
    durbinWatson.hypothesis.5[cc] = dwtest(lm.hypothesis.5[[cc]])$p.value
    shapiro.hypothesis.5[cc] = shapiro.test(lm.hypothesis.5[[cc]]$residuals)$p.value
  }
  b = b + 4000
}
cat("Total: ", 40, " Teste de normalidade: ", length(which(shapiro.hypothesis.5 > 0.05)), " Teste de correlação: ", length(which(durbinWatson.hypothesis.5 > 0.05)), '\n')
```

```{r}
plot(lm.hypothesis.5[[1]], which = c(1:4), pch = 20)
```

####Solution 2: Change the model

Another alternative is to find a model that fully explains the system's behavior.
For that, one option is to use generalized linear models if we want to relax in normality.

MLGs (Generalized Linear Models) are an extension of ordinary least squares regression models.
They make it possible to use other distributions for errors and a link function relating the average of the response variable to the linear combination of the explanatory variables.

####Solution 3: Alternative residuals distribution

To obtain an estimate of the model parameters, it is necessary to make assumptions about the distribution of its residues.
In the traditional implementation of the linear regression algorithm, using the method of ordinary least squares, this assumption is that the residuals are normally distributed.

In this way, it is possible to make other assumptions of the residues, defining an alternative distribution of the residues through:

1. Generalized mixed methods;
2. Maximum likelihood estimators;
3. Bayesian statistics.

Another option is to use the absolute error as a loss function, since it is the maximum likelihood estimator of the Laplace distribution.
