# Knitr chunk option: echo each chunk's source code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
library(readr)

# Simulate a simple linear relationship y_1 = beta_0 + beta_1 * x_1 + error.
# NOTE(review): no seed is set before these draws in the visible script, so
# the simulated values differ from run to run.

# Predictor: 1000 normal draws with mean 5 and standard deviation 7.
x_1 <- rnorm(n = 1000, mean = 5, sd = 7)
hist(x_1, col = "grey")

# Coefficients of the data-generating process.
true_beta_0 <- 1.1
true_beta_1 <- -8.2

# Noise term: 1000 normal draws with mean 0 and standard deviation 2.
true_error <- rnorm(n = 1000, mean = 0, sd = 2)

# Response built from the known coefficients plus noise.
y_1 <- true_beta_0 + true_beta_1 * x_1 + true_error

hist(y_1)

# Scatter plot of the response against the predictor.
plot(x = x_1, y = y_1, pch = 20, col = "red")

# Second predictor: 1000 normal draws with mean 3 and standard deviation 5.
x_2 <- rnorm(n = 1000, mean = 3, sd = 5)

# Slope of the second data-generating process.
true_beta_2 <- -3.0

# y_2 reuses the intercept (true_beta_0) and the noise vector (true_error)
# that were used to build y_1.
y_2 <- true_beta_0 + true_beta_2 * x_2 + true_error
hist(y_2)

# Scatter plot of the response against the predictor.
plot(x = x_2, y = y_2, pch = 20, col = "blue")

# Response driven by both predictors, sharing the intercept and noise vector
# used for y_1 and y_2.
y_3 <- true_beta_0 + true_beta_1 * x_1 + true_beta_2 * x_2 + true_error
hist(y_3)

# Collapse the two predictors into one combined regressor and pair it with
# the response. cbind() yields a numeric matrix with columns "y_3" and "x_3".
x_3 <- x_1 + x_2
dataExample <- cbind(y_3, x_3)
View(dataExample)

# Predictor on the horizontal axis and response on the vertical axis, matching
# the x_1/y_1 and x_2/y_2 scatter plots; reuse x_3 instead of recomputing it.
plot(x_3, y_3, pch = 20, col = "yellow")

# Ordinary least squares fits, one per simulated response. The variables are
# looked up in the calling environment because no `data` argument is given.
model_1 <- lm(formula = y_1 ~ x_1)
model_2 <- lm(formula = y_2 ~ x_2)
model_3 <- lm(formula = y_3 ~ x_1 + x_2)

# Summary of model_1 (y_1 ~ x_1). The `##`-prefixed lines that follow each
# call are the recorded console output of that call. The estimates are close
# to the simulated values true_beta_0 = 1.1 and true_beta_1 = -8.2.
summary(model_1)
## 
## Call:
## lm(formula = y_1 ~ x_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5132 -1.3947  0.0384  1.3624  6.6912 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.240839   0.079000   15.71   <2e-16 ***
## x_1         -8.204580   0.009026 -909.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 998 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9988 
## F-statistic: 8.263e+05 on 1 and 998 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_1 <- coef(model_1)

# Summary of model_2 (y_2 ~ x_2); simulated values are true_beta_0 = 1.1 and
# true_beta_2 = -3.0.
summary(model_2)
## 
## Call:
## lm(formula = y_2 ~ x_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5879 -1.3796  0.0555  1.3654  6.7006 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.21728    0.07504   16.22   <2e-16 ***
## x_2         -2.99999    0.01283 -233.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.023 on 998 degrees of freedom
## Multiple R-squared:  0.9821, Adjusted R-squared:  0.9821 
## F-statistic: 5.468e+04 on 1 and 998 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_2 <- coef(model_2)

# Summary of model_3 (y_3 ~ x_1 + x_2); with both predictors entered
# separately the fit recovers all three simulated coefficients.
summary(model_3)
## 
## Call:
## lm(formula = y_3 ~ x_1 + x_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5132 -1.3947  0.0384  1.3623  6.6912 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.240873   0.088315   14.05   <2e-16 ***
## x_1         -8.204580   0.009031 -908.53   <2e-16 ***
## x_2         -3.000011   0.012835 -233.75   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.024 on 997 degrees of freedom
## Multiple R-squared:  0.9989, Adjusted R-squared:  0.9989 
## F-statistic: 4.393e+05 on 2 and 997 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_3 <- coef(model_3)

# Hold-out evaluation: fit y_3 on the combined predictor x_3 using 80% of the
# rows of dataExample and predict the remaining 20%.
set.seed(100)  # makes the train/test split reproducible
trainingRowIndex <- sample(
  seq_len(nrow(dataExample)),
  size = floor(0.8 * nrow(dataExample))
)
# dataExample is a matrix (built with cbind), so each subset is converted to a
# data frame to get named columns for the formula interface.
trainingData <- as.data.frame(dataExample[trainingRowIndex, ])
testData <- as.data.frame(dataExample[-trainingRowIndex, ])

View(trainingData)

# Use bare column names in the formula. Writing
# trainingData$y_3 ~ trainingData$x_3 hard-wires the training vectors into the
# model, so predict() ignores `newdata`, returns the 800 training fits and
# warns "'newdata' had 200 rows but variables found have 800 rows".
# The fit itself is unchanged; only the term labels in the summary differ.
lmMod <- lm(y_3 ~ x_3, data = trainingData)
modPred <- predict(lmMod, newdata = testData)  # one prediction per test row

# x_3 = x_1 + x_2 forces a single slope on two predictors whose simulated
# slopes differ (-8.2 and -3.0), hence the much larger residual error than in
# model_3.
summary(lmMod)
## 
## Call:
## lm(formula = y_3 ~ x_3, data = trainingData)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.765 -15.081   0.102  14.305  62.601 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.88243    1.04007   3.733 0.000203 ***
## x_3         -6.50378    0.08675 -74.967  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21.22 on 798 degrees of freedom
## Multiple R-squared:  0.8757, Adjusted R-squared:  0.8755 
## F-statistic:  5620 on 1 and 798 DF,  p-value: < 2.2e-16

# Second run: the same simulation and models, repeated with true_beta_0 = 3.1 and true_beta_1 = 7.3 (true_beta_2 stays at -3.0).

library(readr)

# Second run: simulate y_1 = beta_0 + beta_1 * x_1 + error with new
# coefficients.

# Predictor: 1000 normal draws with mean 5 and standard deviation 7.
x_1 <- rnorm(n = 1000, mean = 5, sd = 7)
hist(x_1, col = "grey")

# Coefficients of the data-generating process for this run.
true_beta_0 <- 3.1
true_beta_1 <- 7.3

# Noise term: 1000 normal draws with mean 0 and standard deviation 2.
true_error <- rnorm(n = 1000, mean = 0, sd = 2)

# Response built from the known coefficients plus noise.
y_1 <- true_beta_0 + true_beta_1 * x_1 + true_error

hist(y_1)

# Scatter plot of the response against the predictor.
plot(x = x_1, y = y_1, pch = 20, col = "red")

# Second predictor: 1000 normal draws with mean 3 and standard deviation 5.
x_2 <- rnorm(n = 1000, mean = 3, sd = 5)

# Slope of the second data-generating process (same value as in the first run).
true_beta_2 <- -3.0

# y_2 reuses the intercept (true_beta_0) and the noise vector (true_error)
# that were used to build y_1.
y_2 <- true_beta_0 + true_beta_2 * x_2 + true_error
hist(y_2)

# Scatter plot of the response against the predictor.
plot(x = x_2, y = y_2, pch = 20, col = "blue")

# Response driven by both predictors, sharing the intercept and noise vector
# used for y_1 and y_2.
y_3 <- true_beta_0 + true_beta_1 * x_1 + true_beta_2 * x_2 + true_error
hist(y_3)

# Collapse the two predictors into one combined regressor and pair it with
# the response. cbind() yields a numeric matrix with columns "y_3" and "x_3".
x_3 <- x_1 + x_2
dataExample <- cbind(y_3, x_3)
View(dataExample)

# Predictor on the horizontal axis and response on the vertical axis, matching
# the x_1/y_1 and x_2/y_2 scatter plots; reuse x_3 instead of recomputing it.
plot(x_3, y_3, pch = 20, col = "yellow")

# Ordinary least squares fits, one per simulated response. The variables are
# looked up in the calling environment because no `data` argument is given.
model_1 <- lm(formula = y_1 ~ x_1)
model_2 <- lm(formula = y_2 ~ x_2)
model_3 <- lm(formula = y_3 ~ x_1 + x_2)

# Summary of model_1 (y_1 ~ x_1). The `##`-prefixed lines that follow each
# call are the recorded console output of that call. The estimates are close
# to the simulated values true_beta_0 = 3.1 and true_beta_1 = 7.3.
summary(model_1)
## 
## Call:
## lm(formula = y_1 ~ x_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7810 -1.4227  0.0217  1.3656  6.3425 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3.085258   0.079495   38.81   <2e-16 ***
## x_1         7.302098   0.009001  811.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.041 on 998 degrees of freedom
## Multiple R-squared:  0.9985, Adjusted R-squared:  0.9985 
## F-statistic: 6.582e+05 on 1 and 998 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_1 <- coef(model_1)

# Summary of model_2 (y_2 ~ x_2); simulated values are true_beta_0 = 3.1 and
# true_beta_2 = -3.0.
summary(model_2)
## 
## Call:
## lm(formula = y_2 ~ x_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9241 -1.4647  0.0387  1.3306  6.1117 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.19255    0.07410   43.09   <2e-16 ***
## x_2         -3.03372    0.01286 -235.87   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.034 on 998 degrees of freedom
## Multiple R-squared:  0.9824, Adjusted R-squared:  0.9824 
## F-statistic: 5.563e+04 on 1 and 998 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_2 <- coef(model_2)

# Summary of model_3 (y_3 ~ x_1 + x_2); with both predictors entered
# separately the fit recovers all three simulated coefficients.
summary(model_3)
## 
## Call:
## lm(formula = y_3 ~ x_1 + x_2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9212 -1.4661  0.0385  1.3282  6.1138 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.190194   0.088876    35.9   <2e-16 ***
## x_1          7.300433   0.008997   811.4   <2e-16 ***
## x_2         -3.033675   0.012901  -235.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.035 on 997 degrees of freedom
## Multiple R-squared:  0.9987, Adjusted R-squared:  0.9987 
## F-statistic: 3.722e+05 on 2 and 997 DF,  p-value: < 2.2e-16
# Keep the estimated coefficients as a named numeric vector.
coefs_3 <- coef(model_3)

# Hold-out evaluation: fit y_3 on the combined predictor x_3 using 80% of the
# rows of dataExample and predict the remaining 20%.
set.seed(100)  # makes the train/test split reproducible
trainingRowIndex <- sample(
  seq_len(nrow(dataExample)),
  size = floor(0.8 * nrow(dataExample))
)
# dataExample is a matrix (built with cbind), so each subset is converted to a
# data frame to get named columns for the formula interface.
trainingData <- as.data.frame(dataExample[trainingRowIndex, ])
testData <- as.data.frame(dataExample[-trainingRowIndex, ])

View(trainingData)

# Use bare column names in the formula. Writing
# trainingData$y_3 ~ trainingData$x_3 hard-wires the training vectors into the
# model, so predict() ignores `newdata`, returns the 800 training fits and
# warns "'newdata' had 200 rows but variables found have 800 rows".
# The fit itself is unchanged; only the term labels in the summary differ.
lmMod <- lm(y_3 ~ x_3, data = trainingData)
modPred <- predict(lmMod, newdata = testData)  # one prediction per test row

# x_3 = x_1 + x_2 forces a single slope on two predictors whose simulated
# slopes have opposite signs (7.3 and -3.0), hence the low R-squared compared
# with model_3.
summary(lmMod)
## 
## Call:
## lm(formula = y_3 ~ x_3, data = trainingData)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -127.269  -28.681   -0.687   29.381  140.821 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.4813     2.1184  -0.227     0.82    
## x_3           4.0484     0.1808  22.395   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.65 on 798 degrees of freedom
## Multiple R-squared:  0.3859, Adjusted R-squared:  0.3852 
## F-statistic: 501.5 on 1 and 798 DF,  p-value: < 2.2e-16