Set Up

Clean the Environment

rm(list = ls())
packages <- c(
              "stargazer", 
              "tidyverse", 
              "ggplot2",
              "gptstudio",
              "bruceR"

              
              )
 for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## 
## bruceR (v0.8.10)
## BRoadly Useful Convenient and Efficient R functions
## 
## Packages also loaded:
## ✔ data.table ✔ emmeans
## ✔ dplyr      ✔ lmerTest
## ✔ tidyr      ✔ effectsize
## ✔ stringr    ✔ performance
## ✔ ggplot2    ✔ interactions
## 
## Main functions of `bruceR`:
## cc()             Describe()  TTEST()
## add()            Freq()      MANOVA()
## .mean()          Corr()      EMMEANS()
## set.wd()         Alpha()     PROCESS()
## import()         EFA()       model_summary()
## print_table()    CFA()       lavaan_summary()
## 
## For full functionality, please install all dependencies:
## install.packages("bruceR", dep=TRUE)
## 
## Online documentation:
## https://psychbruce.github.io/bruceR

prepare Matrix

y <- c(7.3, 15.1, 17.2, 61.9, 12.3, 8, 11.6, 22.5, 6, 16.6, 32.2)

x_feature <- c(35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4)

x2 <- x_feature^2

Use OLS to estimate the beta hat without feature normalization

Create design Matrix

df <- cbind.data.frame( y, x_feature, x2)

dfx <- data.frame(x_feature, x2)

design_matrix <- cbind(1, x_feature, x2)

Get beta

beta <- solve(t(design_matrix) %*% design_matrix) %*% t(design_matrix) %*% y
beta
##                  [,1]
##           72.74005747
## x_feature -3.08939325
## x2         0.03619598

compute the residual vector

residuals <- y- design_matrix %*% beta
residuals
##              [,1]
##  [1,] -1.28012694
##  [2,]  1.95147965
##  [3,]  1.65817432
##  [4,] -0.60723630
##  [5,] -1.23723842
##  [6,] -0.22090560
##  [7,]  1.99359700
##  [8,] -0.09305986
##  [9,] -2.00466802
## [10,] -1.58941442
## [11,]  1.42939859

norm of residual vector

residual_norm <- sqrt(sum(residuals^2))
residual_norm
## [1] 4.766134

Plot

df_plot <- data.frame(y = y, x = x_feature, x_squared = x2)

ggplot(df_plot, aes(x = x_feature, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "blue", se = FALSE) +
  ggtitle("Best Fit Function") +
  xlab("x_feature") +
  ylab("y")

Compute the RMSE

RMSE <- sqrt(mean(residuals^2))
RMSE
## [1] 1.437044

OLS After feature normalizaton

Create design Matrix

x_feature_norm <- scaler(x_feature)


x2_norm <- scaler(x2)


design_matrix_norm <- cbind(1, x_feature_norm, x2_norm)

get beta

beta_norm <- solve(t(design_matrix_norm) %*% design_matrix_norm) %*% t(design_matrix_norm) %*% y
beta_norm
##                      [,1]
##                  22.59306
## x_feature_norm -185.67253
## x2_norm         225.58671

compute the residual vector

residuals_norm <- y- design_matrix_norm %*% beta_norm
residuals_norm
##              [,1]
##  [1,] -1.28012694
##  [2,]  1.95147965
##  [3,]  1.65817432
##  [4,] -0.60723630
##  [5,] -1.23723842
##  [6,] -0.22090560
##  [7,]  1.99359700
##  [8,] -0.09305986
##  [9,] -2.00466802
## [10,] -1.58941442
## [11,]  1.42939859

norm of residual vector

norm_residual_norm <- sqrt(sum(residuals_norm^2))
norm_residual_norm
## [1] 4.766134

Plot

norm_df_plot <- data.frame(y = y, x = x_feature_norm, x_squared = x2_norm)

ggplot(norm_df_plot, aes(x = x_feature_norm, y = y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "green", se = FALSE) +
  ggtitle("Best Fit Function") +
  xlab("x_feature_norm") +
  ylab("y")