Dados

X <- c(1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 8, 9, 9, 1, 2, 3, 4, 3, 2, 5, 6, 7)
Y <- c(25, 21, 31, 30, 38, 31, 43, 39, 44, 48, 53, 59, 49, 55, 19, 25, 35, 44, 43, 25, 40, 45, 50)
mean(X)
## [1] 4.608696
mean(Y) 
## [1] 38.78261
train <- data.frame(X,Y)

s1 <- sum((X-mean(X))*(Y-mean(Y)))
s1
## [1] 593.0435

\[\sum_{i=1}^n (X_i-\bar{X})(Y_i-\bar{Y})=593.0435\]

s2<- sum((X-mean(X))**2)
s2
## [1] 147.4783

\[\sum_{i=1}^n (X^2_i-\bar{X})=147.4783\]

mean(Y)-((s1/s2)*mean(X))
## [1] 20.25

\[\hat{b}_1 = \frac{593.0435}{147.4783} = 4.021225\]

\[\hat{b}_0 = 38.78261 - 4.021225*4.608696 = 20.25001\]

\[y = 20.25001 + 4.021225x_i\]

Y_hat <- (X*4.021225) + 20.25001
SQRes <- sum((Y-Y_hat)^2)
SQRes
## [1] 425.1509

\[SQRes = \sum_{i=1}^n (Y_i-\hat{Y}_i)^2 = 425.1509\]

SQReg <- sum((mean(Y)-Y_hat)^2)
SQReg
## [1] 2384.76

\[SQReg = \sum_{i=1}^n (\bar{Y}-\hat{Y}_i)^2 = 2384.76\]

SQT <- SQReg+SQRes
SQT
## [1] 2809.911

\[SQT = SQRes + SQReg = 2809.911\] \[MSE = Bias + Variance = 2809.911\]

R_squared <- SQReg/SQT
R_squared
## [1] 0.848696
regressao <- lm(train$Y ~ train$X)
summary(regressao)
## 
## Call:
## lm(formula = train$Y ~ train$X)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.4410 -3.2818 -0.3561  2.1545 10.6863 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  20.2500     1.9483   10.39 9.81e-10 ***
## train$X       4.0212     0.3705   10.85 4.53e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.499 on 21 degrees of freedom
## Multiple R-squared:  0.8487, Adjusted R-squared:  0.8415 
## F-statistic: 117.8 on 1 and 21 DF,  p-value: 4.533e-10

Test

X <- c(1, 1, 2, 2, 4, 4, 5, 7, 6, 7, 8, 8, 9, 10, 1, 3, 3, 4, 2, 2, 6, 6, 7)
Y <- c(23, 25, 27, 35, 38, 31, 43, 35, 44, 48, 52, 59, 42, 55, 23, 25, 35, 44, 43, 25, 40, 45, 50)
test = data.frame(X,Y)

predictions <- predict(regressao,test)
axisRange = extendrange(c(test$Y,predictions))
plot(test$Y,predictions,main="Predição")
abline(0,1,col="blue",lty=2,lwd=2)

Quanto maior o RMSE, melhor sera o modelo

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
lm_prediction <- data.frame(pred = predictions, obs = test$Y)
round(defaultSummary(lm_prediction), digits = 3)
##     RMSE Rsquared 
##    5.100    0.771

Codigo