data(cloud, package = "robustbase")
cloud %>%
slice_head(n=10) %>%
gt::gt()| Percentage | CloudPoint |
|---|---|
| 0 | 22.1 |
| 1 | 24.5 |
| 2 | 26.0 |
| 3 | 26.8 |
| 4 | 28.2 |
| 5 | 28.9 |
| 6 | 30.0 |
| 7 | 30.4 |
| 8 | 31.4 |
| 0 | 21.9 |
cloud que se encontra no pacote robustbase, no qual a descrição segue abaixo.Descrição: This data set contains the measurements concerning the cloud point of a Liquid, from Draper and Smith (1969). The cloud point is a measure of the degree of crystallization in a stock.
data(cloud, package = "robustbase")
cloud %>%
slice_head(n=10) %>%
gt::gt()| Percentage | CloudPoint |
|---|---|
| 0 | 22.1 |
| 1 | 24.5 |
| 2 | 26.0 |
| 3 | 26.8 |
| 4 | 28.2 |
| 5 | 28.9 |
| 6 | 30.0 |
| 7 | 30.4 |
| 8 | 31.4 |
| 0 | 21.9 |
glimpse(cloud)Rows: 19
Columns: 2
$ Percentage <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 2, 4, 6, 8, 10, 0, 3, 6, 9
$ CloudPoint <dbl> 22.1, 24.5, 26.0, 26.8, 28.2, 28.9, 30.0, 30.4, 31.4, 21.9,…
Desc do pacote DescToolsDescTools::Desc(cloud,plotit = F, digits = 3)------------------------------------------------------------------------------
Describe cloud (data.frame):
data frame: 19 obs. of 2 variables
19 complete cases (100.0%)
Nr ColName Class NAs Levels
1 Percentage integer .
2 CloudPoint numeric .
------------------------------------------------------------------------------
1 - Percentage (integer)
length n NAs unique 0s mean meanCI'
19 19 0 11 3 4.421 2.903
100.0% 0.0% 15.8% 5.939
.05 .10 .25 median .75 .90 .95
0.000 0.000 2.000 4.000 6.500 8.200 9.100
range sd vcoef mad IQR skew kurt
10.000 3.150 0.713 2.965 4.500 0.096 -1.312
value freq perc cumfreq cumperc
1 0 3 15.8% 3 15.8%
2 1 1 5.3% 4 21.1%
3 2 2 10.5% 6 31.6%
4 3 2 10.5% 8 42.1%
5 4 2 10.5% 10 52.6%
6 5 1 5.3% 11 57.9%
7 6 3 15.8% 14 73.7%
8 7 1 5.3% 15 78.9%
9 8 2 10.5% 17 89.5%
10 9 1 5.3% 18 94.7%
11 10 1 5.3% 19 100.0%
' 95%-CI (classic)
------------------------------------------------------------------------------
2 - CloudPoint (numeric)
length n NAs unique 0s mean meanCI'
19 19 0 = n 0 27.968 26.344
100.0% 0.0% 0.0% 29.593
.05 .10 .25 median .75 .90 .95
22.080 22.660 26.050 28.500 30.350 31.560 31.930
range sd vcoef mad IQR skew kurt
11.200 3.370 0.121 3.558 4.300 -0.417 -1.072
lowest : 21.900, 22.100, 22.800, 24.500, 26.000
highest: 30.400, 31.400, 31.500, 31.800, 33.100
' 95%-CI (classic)
ggplot(cloud, aes(x = Percentage, y = CloudPoint)) +
geom_point()+
xlab("Porcentagem") + ylab("Ponto de Nuvem")+
labs(title = "Gráfico de dispersão entre o Ponto de Nuvem de Acordo com a Porcentagem")De acordo com a visualização gráfica tem bastante evidências de que o ajuste através de regressão robusta seja adequado.
Observação: Vamos remover os outliers e proceder o ajuste utilizando os mesmos métodos, só que não será exibido a saída do ajuste sem os ouliers a não ser na comparação final
dados <- cloud[-c(1,10,16),]mod.lm <- lm(CloudPoint~Percentage, data = cloud)
summary(mod.lm)
Call:
lm(formula = CloudPoint ~ Percentage, data = cloud)
Residuals:
Min 1Q Median 3Q Max
-1.4464 -0.4282 0.1809 0.6127 0.9718
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.34638 0.29675 78.67 < 2e-16 ***
Percentage 1.04546 0.05516 18.95 7.18e-13 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.7372 on 17 degrees of freedom
Multiple R-squared: 0.9548, Adjusted R-squared: 0.9522
F-statistic: 359.3 on 1 and 17 DF, p-value: 7.182e-13
Resultado: De acordo com o ajuste, tivemos um coeficente de determinação R^2 de 95.48%, que é um bom valor, tivemos que o modelo foi significativo, ou seja, que a regressão está bem ajustada que no teste t, o intercepto e a variável são significativas para o ajuste, pois a hipótese de que o \beta_i = 0 foram rejeitadas.
mod.lmo <- lm(CloudPoint~Percentage, data = dados)xmod = cbind(1,cloud$Percentage)
sigma2 = (summary(mod.lm)$sigma)^2
peso = 1/diag(sigma2*xmod%*%(solve(t(xmod)%*%xmod))%*%t(xmod))
mod.lm2 = lm(CloudPoint ~ Percentage, weights = peso, data = cloud)
summary(mod.lm2)
Call:
lm(formula = CloudPoint ~ Percentage, data = cloud, weights = peso)
Weighted Residuals:
Min 1Q Median 3Q Max
-5.8819 -2.2849 -0.1868 1.5833 4.2636
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 23.6455 0.2985 79.20 < 2e-16 ***
Percentage 1.0317 0.0594 17.37 2.96e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2.935 on 17 degrees of freedom
Multiple R-squared: 0.9466, Adjusted R-squared: 0.9435
F-statistic: 301.6 on 1 and 17 DF, p-value: 2.964e-12
Resultado: De acordo com o ajuste, tivemos um coeficente de determinação R^2 de 94.66%, que é um bom valor, tivemos que o modelo foi significativo, ou seja, que a regressão está bem ajustada que no teste t, o intercepto e a variável são significativas para o ajuste, pois a hipótese de que o \beta_i = 0 foram rejeitadas. Todavia, essa foi menos precisa que a linear simples, pois a estimativa de \hat{\sigma} foi maior nessa sendo de 2.9351, enquanto na linear simples foi de 0.7372, esse valor mais baixo produzirá intervalos de confiança com uma menor variabilidade.
xmod1 = cbind(1,dados$Percentage)
sigma21 = (summary(mod.lmo)$sigma)^2
peso1 = 1/diag(sigma21*xmod1%*%(solve(t(xmod1)%*%xmod1))%*%t(xmod1))
mod.lmo2 = lm(CloudPoint ~ Percentage, weights = peso1, data = dados)mod.M <- MASS::rlm(CloudPoint~Percentage, data = cloud)
summary(mod.M)
Call: rlm(formula = CloudPoint ~ Percentage, data = cloud)
Residuals:
Min 1Q Median 3Q Max
-1.5093 -0.4527 0.1756 0.5690 0.9473
Coefficients:
Value Std. Error t value
(Intercept) 23.4093 0.3170 73.8421
Percentage 1.0359 0.0589 17.5799
Residual standard error: 0.9035 on 17 degrees of freedom
Resultado: Na resposta, vemos que a estimativa para \hat{\sigma} foi de 0.9035, um bom valor se comparado com as demais estimativas por outros métodos.
mod.Mo <- MASS::rlm(CloudPoint~Percentage, data = dados)mod.rob <- robustbase::lmrob(CloudPoint~Percentage, data = cloud)
summary(mod.rob)
Call:
robustbase::lmrob(formula = CloudPoint ~ Percentage, data = cloud)
\--> method = "MM"
Residuals:
Min 1Q Median 3Q Max
-2.23435 -0.42215 -0.03478 0.22785 0.66543
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 24.1344 0.9897 24.386 1.15e-14 ***
Percentage 0.9251 0.1536 6.021 1.37e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Robust residual standard error: 0.4894
Multiple R-squared: 0.9661, Adjusted R-squared: 0.9641
Convergence in 22 IRWLS iterations
Robustness weights:
observation 10 is an outlier with |weight| <= 0.0025 ( < 0.0053);
2 weights are ~= 1. The remaining 16 ones are summarized as
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04529 0.85620 0.95620 0.85540 0.99270 0.99540
Algorithmic parameters:
tuning.chi bb tuning.psi refine.tol
1.548e+00 5.000e-01 4.685e+00 1.000e-07
rel.tol scale.tol solve.tol eps.outlier
1.000e-07 1.000e-10 1.000e-07 5.263e-03
eps.x warn.limit.reject warn.limit.meanrw
1.819e-11 5.000e-01 5.000e-01
nResample max.it best.r.s k.fast.s k.max
500 50 2 1 200
maxit.scale trace.lev mts compute.rd fast.s.large.n
200 0 1000 0 2000
psi subsampling cov
"bisquare" "nonsingular" ".vcov.avar1"
compute.outlier.stats
"SM"
seed : int(0)
Resposta: De acordo com o ajuste através do método MM, é verificado que o modelo ficou bem ajustado, pois no ajuste, o intercepto e a variável Percentage foram significativas, rejeitando a hipótese nula de que \beta = 0.
mod.robo <- robustbase::lmrob(CloudPoint~Percentage, data = dados)mod.l1 <- quantreg::rq(CloudPoint~Percentage, data = cloud, 0.5)Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
summary(mod.l1)Warning in rq.fit.br(x, y, tau = tau, ci = TRUE, ...): Solution may be
nonunique
Call: quantreg::rq(formula = CloudPoint ~ Percentage, tau = 0.5, data = cloud)
tau: [1] 0.5
Coefficients:
coefficients lower bd upper bd
(Intercept) 24.16667 23.03751 24.72624
Percentage 0.91667 0.87786 1.16819
mod.l1o <- quantreg::rq(CloudPoint~Percentage, data = dados, 0.5)Warning in rq.fit.br(x, y, tau = tau, ...): Solution may be nonunique
mod.k1 <- kernel.reg1(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k1) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 5 -none- numeric
weigth 19 -none- numeric
mod.k1o <- kernel.reg1(dados$Percentage, dados$CloudPoint, 1e-10,100)mod.k2 <- kernel.reg2(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k2) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 5 -none- numeric
weigth 19 -none- numeric
mod.k2o <- kernel.reg2(dados$Percentage, dados$CloudPoint, 1e-10,100)mod.k3 <- kernel.reg3(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k3) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 5 -none- numeric
weigth 19 -none- numeric
mod.k3o <- kernel.reg3(dados$Percentage, dados$CloudPoint, 1e-10,100)mod.k4 <- kernel.reg4(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k4) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 7 -none- numeric
weigth 19 -none- numeric
mod.k4o <- kernel.reg4(dados$Percentage, dados$CloudPoint, 1e-10,100)mod.k5 <- kernel.reg5(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k5) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 5 -none- numeric
weigth 19 -none- numeric
mod.k5o <- kernel.reg5(dados$Percentage, dados$CloudPoint, 1e-10,100)mod.k6 <- kernel.reg6(cloud$Percentage, cloud$CloudPoint, 1e-10,100)
summary(mod.k6) Length Class Mode
coef 2 -none- numeric
fitted 19 -none- numeric
criterion 18 -none- numeric
weigth 19 -none- numeric
mod.k6o <- kernel.reg6(dados$Percentage, dados$CloudPoint, 1e-10,100)coef = rbind(mod.lm$coef,
mod.lm2$coef,
mod.M$coef,
mod.rob$coef,
mod.l1$coef,
mod.k1$coef,
mod.k2$coef,
mod.k3$coef,
mod.k4$coef,
mod.k5$coef,
mod.k6$coef)
rownames(coef) = c("OLS","WLS","M","MM","L1","KRR.1","KRR.2","KRR.3","KRR.4","KRR.5","KRR.6")
knitr::kable(coef)| (Intercept) | Percentage | |
|---|---|---|
| OLS | 23.34638 | 1.0454626 |
| WLS | 23.64546 | 1.0316776 |
| M | 23.40931 | 1.0358511 |
| MM | 24.13435 | 0.9250537 |
| L1 | 24.16667 | 0.9166667 |
| KRR.1 | 23.37065 | 1.0418823 |
| KRR.2 | 23.36844 | 1.0422092 |
| KRR.3 | 23.37464 | 1.0412924 |
| KRR.4 | 23.40054 | 1.0374452 |
| KRR.5 | 23.36612 | 1.0425527 |
| KRR.6 | 24.47923 | 0.8723326 |
# Percentual de Mudança
coef.sout = rbind(mod.lmo$coef,
mod.lmo2$coef,
mod.Mo$coef,
mod.robo$coef,
mod.l1o$coef,
mod.k1o$coef,
mod.k2o$coef,
mod.k3o$coef,
mod.k4o$coef,
mod.k5o$coef,
mod.k6o$coef)
rownames(coef.sout) = c("OLS","WLS","M","MM","L1","KRR.1","KRR.2","KRR.3","KRR.4","KRR.5","KRR.6")
knitr::kable(abs((coef.sout-coef)/coef)*100)| (Intercept) | Percentage | |
|---|---|---|
| OLS | 4.3754712 | 14.922872 |
| WLS | 3.6409832 | 14.409546 |
| M | 4.3998281 | 15.183480 |
| MM | 1.1866182 | 4.724062 |
| L1 | 0.6896552 | 3.636364 |
| KRR.1 | 4.2818226 | 14.681615 |
| KRR.2 | 4.2958562 | 14.722800 |
| KRR.3 | 4.2733930 | 14.665674 |
| KRR.4 | 4.1805732 | 14.427954 |
| KRR.5 | 4.3054279 | 14.748126 |
| KRR.6 | 0.1186932 | 0.612713 |
x <- cloud$Percentage
plot(cloud$Percentage, cloud$CloudPoint, xlab = "Porcentagem", ylab = "Ponto de Nuvem", pch = 20)#, ylim = c(250,850)) +
lines(x,mod.lm$fitted)
lines(x,mod.lm2$fitted, col=5)
lines(x,mod.M$fitted, col = 6)
lines(x,mod.rob$fitted, col=7)
lines(x,mod.l1$fitted, col=8)
lines(x,mod.k6$fitted, col=2)
legend("topleft", c("OLS","WLS","M","MM","L1","KRR"), lty = 1, col = c(1,5,6,7,8,2), bty = "n")