Which model is more suitable for the data at hand?

library(tidyverse)
rdata <- tribble(
  ~x, ~y,
  1,  23,
  2,  29,
  3,  49,
  4,  64,
  4,  74,
  5,  87,
  6,  96,
  6,  97,
  7,  109,
  8,  119,
  9,  149,
  9,  145,
  10, 154,
  10, 166
)

rdata
## # A tibble: 14 x 2
##        x     y
##    <dbl> <dbl>
##  1     1    23
##  2     2    29
##  3     3    49
##  4     4    64
##  5     4    74
##  6     5    87
##  7     6    96
##  8     6    97
##  9     7   109
## 10     8   119
## 11     9   149
## 12     9   145
## 13    10   154
## 14    10   166

A) Simple linear model

For the first model, we assign rdata to a new tibble, because we will create model-specific columns in each copy.

model_a <- rdata

Total observation count:

n <- nrow(model_a)
n
## [1] 14

Let’s look at a scatter plot of the data:

ggplot(model_a, aes(x = x, y = y)) +
  geom_point()

Sample means:

mean_x <- sum(model_a$x) / nrow(model_a)
mean_y <- sum(model_a$y) / nrow(model_a)

# mean of X
mean_x
## [1] 6

# mean of Y
mean_y
## [1] 97.21429
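
As a quick sanity check (not part of the manual derivation), base R’s built-in mean() gives the same values:

# equivalent to the manual sum/nrow computation above
mean(model_a$x)
## [1] 6
mean(model_a$y)
## [1] 97.21429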

We will use the formulas below to determine \(\hat{\beta_0}\) and \(\hat{\beta_1}\):

\[\hat{\beta_1} = \dfrac{S_{xy}}{S_{xx}} = \dfrac{\sum{XY} - n\bar{X}\bar{Y}}{\sum{X^2} - n\bar{X}^2}\] \[\hat{\beta_0} = \bar{Y} - \hat{\beta_1}\bar{X}\]

So, we need \(X^2\) and \(XY\):

model_a <- model_a %>%
  mutate(xsquare = x ^ 2,
         xy = x * y)

model_a
## # A tibble: 14 x 4
##        x     y xsquare    xy
##    <dbl> <dbl>   <dbl> <dbl>
##  1     1    23       1    23
##  2     2    29       4    58
##  3     3    49       9   147
##  4     4    64      16   256
##  5     4    74      16   296
##  6     5    87      25   435
##  7     6    96      36   576
##  8     6    97      36   582
##  9     7   109      49   763
## 10     8   119      64   952
## 11     9   149      81  1341
## 12     9   145      81  1305
## 13    10   154     100  1540
## 14    10   166     100  1660

Thus,

slope <- (sum(model_a$xy) - n * mean_x * mean_y) /
  (sum(model_a$xsquare) - n * mean_x ^ 2)


intercept <- mean_y - slope * mean_x

parameters <- list(intercept = intercept, slope = slope)

parameters
## $intercept
## [1] 4.161654
## 
## $slope
## [1] 15.50877
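
Note that \(S_{xy}/S_{xx}\) is also the ratio of the sample covariance to the sample variance of \(X\): the \(n-1\) denominators cancel, so base R’s cov() and var() reproduce the slope exactly. A quick check:

# sanity check: cov/var ratio equals Sxy/Sxx
cov(model_a$x, model_a$y) / var(model_a$x)
## [1] 15.50877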

Determine the residuals:

model_a <- model_a %>% 
  mutate(y_estimator = parameters$intercept + x * parameters$slope,
         e = y - y_estimator,
         esquare = e ^ 2)

model_a
## # A tibble: 14 x 7
##        x     y xsquare    xy y_estimator          e     esquare
##    <dbl> <dbl>   <dbl> <dbl>       <dbl>      <dbl>       <dbl>
##  1     1    23       1    23    19.67043  3.3295739 11.08606259
##  2     2    29       4    58    35.17920 -6.1791980 38.18248786
##  3     3    49       9   147    50.68797 -1.6879699  2.84924247
##  4     4    64      16   256    66.19674 -2.1967419  4.82567478
##  5     4    74      16   296    66.19674  7.8032581 60.89083768
##  6     5    87      25   435    81.70551  5.2944862 28.03158429
##  7     6    96      36   576    97.21429 -1.2142857  1.47448980
##  8     6    97      36   582    97.21429 -0.2142857  0.04591837
##  9     7   109      49   763   112.72306 -3.7230576 13.86115822
## 10     8   119      64   952   128.23183 -9.2318296 85.22667728
## 11     9   149      81  1341   143.74060  5.2593985 27.66127254
## 12     9   145      81  1305   143.74060  1.2593985  1.58608457
## 13    10   154     100  1540   159.24937 -5.2493734 27.55592145
## 14    10   166     100  1660   159.24937  6.7506266 45.57095904

The sum of the residuals must be 0 (a consequence of the first normal equation, since the model includes an intercept):

round(sum(model_a$e), 2)
## [1] 0

Mean Square Error:

mse <- sum(model_a$esquare) / (n - 2)

mse
## [1] 29.0707

Add the standardized (semi-studentized) residuals as a column:

model_a <- model_a %>%
  mutate(estd = e / sqrt(mse))


model_a$estd
##  [1]  0.61753409 -1.14605216 -0.31306677 -0.40742840  1.44726563
##  [6]  0.98196520 -0.22521285 -0.03974344 -0.69051328 -1.71222192
## [11]  0.97545750  0.23357989 -0.97359816  1.25203468
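
A common rule of thumb flags standardized residuals outside \(\pm 2\) as potential outliers; none of the values above exceed that bound:

# largest standardized residual is about 1.71 in absolute value
any(abs(model_a$estd) > 2)
## [1] FALSE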

The standardized residuals show a random pattern around zero, so the fit is valid for further analysis and the form of the estimated model is appropriate:

ggplot(model_a, aes(x, estd)) +
  geom_point()

Let’s draw the fitted regression line:

ggplot(model_a, aes(x, y)) + 
  geom_point() +
  geom_line(aes(y = y_estimator))

B) Second-degree polynomial model

First of all, assign rdata to a fresh tibble for the second model:

model_b <- rdata

model_b
## # A tibble: 14 x 2
##        x     y
##    <dbl> <dbl>
##  1     1    23
##  2     2    29
##  3     3    49
##  4     4    64
##  5     4    74
##  6     5    87
##  7     6    96
##  8     6    97
##  9     7   109
## 10     8   119
## 11     9   149
## 12     9   145
## 13    10   154
## 14    10   166

Our polynomial model is \(\hat{Y} = \hat{\beta_0} + \hat{\beta_1}X + \hat{\beta_2}X^2\)

\[e_i = Y_i - \hat{Y_i} = Y_i - \hat{\beta_0} - \hat{\beta_1}X_i - \hat{\beta_2}X_i^2 \] and the sum of squared residuals is

\[S = \sum{e_i^2} = \sum{(Y_i - \hat{Y_i})^2} = \sum{(Y_i - \hat{\beta_0} - \hat{\beta_1}X_i - \hat{\beta_2}X_i^2)^2}\]

Taking the partial derivatives of \(S\) with respect to \(\hat{\beta_0}\), \(\hat{\beta_1}\), and \(\hat{\beta_2}\) and setting each to zero, we get a system of three equations (the normal equations):

\[\sum{Y_i} = n\hat{\beta_0} + \hat{\beta_1}\sum{X_i} + \hat{\beta_2}\sum{X_i^2}\] \[\sum{X_iY_i} = \hat{\beta_0}\sum{X_i} + \hat{\beta_1}\sum{X_i^2} + \hat{\beta_2}\sum{X_i^3}\] \[\sum{X_i^2Y_i} = \hat{\beta_0}\sum{X_i^2} + \hat{\beta_1}\sum{X_i^3} + \hat{\beta_2}\sum{X_i^4}\] Thus,

We create the coefficient matrix and the right-hand-side vector:

system_equation <- matrix(c(n, sum(model_b$x), sum(model_b$x ^ 2),
                            sum(model_b$x), sum(model_b$x ^ 2), sum(model_b$x ^ 3),
                            sum(model_b$x ^ 2), sum(model_b$x ^ 3), sum(model_b$x ^ 4)),
                          ncol = 3,
                          byrow = TRUE)

equal_to <- c(sum(model_b$y),
              sum(model_b$x * model_b$y), 
              sum(model_b$x ^ 2 * model_b$y))


coefficients_b <- solve(system_equation, equal_to)

names(coefficients_b) <- c("beta0", "beta1", "beta2")

coefficients_b <- as.list(coefficients_b)

coefficients_b
## $beta0
## [1] 5.507142
## 
## $beta1
## [1] 14.89855
## 
## $beta2
## [1] 0.05246257
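
Note that these normal equations are exactly \(X^TX\hat{\beta} = X^TY\) for the design matrix \(X\) with columns \(1\), \(x\), and \(x^2\). As a cross-check, building that matrix directly should yield the same solution:

# same normal equations in matrix form: t(X) %*% X equals system_equation
X <- cbind(1, model_b$x, model_b$x ^ 2)
solve(t(X) %*% X, t(X) %*% model_b$y)  # should match beta0, beta1, beta2 above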
Determine the residuals:

model_b <- model_b %>%
  mutate(y_estimator = coefficients_b$beta0 + x * coefficients_b$beta1 + coefficients_b$beta2 * x ^ 2,
         e = y - y_estimator,
         esquare = e ^ 2)

model_b
## # A tibble: 14 x 5
##        x     y y_estimator          e     esquare
##    <dbl> <dbl>       <dbl>      <dbl>       <dbl>
##  1     1    23    20.45815  2.5418465  6.46098349
##  2     2    29    35.51409 -6.5140906 42.43337689
##  3     3    49    50.67495 -1.6749529  2.80546720
##  4     4    64    65.94074 -1.9407403  3.76647288
##  5     4    74    65.94074  8.0592597 64.95166706
##  6     5    87    81.31145  5.6885472 32.35956896
##  7     6    96    96.78709 -0.7870905  0.61951146
##  8     6    97    96.78709  0.2129095  0.04533046
##  9     7   109   112.36765 -3.3676533 11.34108885
## 10     8   119   128.05314 -9.0531413 81.95936687
## 11     9   149   143.84355  5.1564456 26.58893157
## 12     9   145   143.84355  1.1564456  1.33736650
## 13    10   154   159.73889 -5.7388926 32.93488831
## 14    10   166   159.73889  6.2611074 39.20146583

The sum of the residuals must again be 0:

round(sum(model_b$e), 2)
## [1] 0

Mean Square Error:

mse <- sum(model_b$esquare) / (n - 3)

mse
## [1] 31.52777

Again, add the standardized residuals as a column:

model_b <- model_b %>%
  mutate(estd = e / sqrt(mse))


model_b$estd
##  [1]  0.45269185 -1.16013134 -0.29830186 -0.34563744  1.43531927
##  [6]  1.01310563 -0.14017741  0.03791826 -0.59976448 -1.61232527
## [11]  0.91834064  0.20595796 -1.02207193  1.11507612

A random pattern appears here as well:

ggplot(model_b, aes(x, estd)) +
  geom_point()

The fitted regression curve:

ggplot(model_b, aes(x, y)) + 
  geom_point() +
  geom_line(aes(y = y_estimator))
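
To answer the opening question: the linear model has the smaller mean square error (29.07 versus 31.53), and \(\hat{\beta_2} \approx 0.05\) contributes almost nothing, so the simpler linear model looks more suitable for this data. As a final cross-check, R’s built-in lm() should reproduce both hand-computed fits:

# least squares fits via lm(); coef() should match the manual parameters
fit_a <- lm(y ~ x, data = rdata)
fit_b <- lm(y ~ x + I(x ^ 2), data = rdata)
coef(fit_a)  # intercept ~ 4.16, slope ~ 15.51
coef(fit_b)  # ~ 5.51, ~ 14.90, ~ 0.05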