knitr::opts_chunk$set(echo = TRUE)
library(readr)
# Simulate the first predictor: 1000 draws from a normal distribution with
# mean 5 and standard deviation 7, then inspect its distribution.
x_1 <- rnorm(n = 1000, mean = 5, sd = 7)
hist(x = x_1, col = "grey")

# Noise term: 1000 draws from a normal distribution with mean 0 and sd 2.
true_error <- rnorm(n = 1000, mean = 0, sd = 2)

# Known intercept and slope used to generate the response.
true_beta_0 <- 1.1
true_beta_1 <- -8.2

# Response generated as intercept + slope * predictor + noise.
y_1 <- true_beta_0 + (true_beta_1 * x_1) + true_error
hist(x = y_1)

# Scatter plot of the response against the predictor.
plot(x = x_1, y = y_1, pch = 20, col = "red")

# Second predictor: 1000 draws from a normal distribution with mean 3, sd 5.
x_2 <- rnorm(n = 1000, mean = 3, sd = 5)

# Slope for x_2; the intercept and the noise term are the ones defined earlier.
true_beta_2 <- -3.0
y_2 <- true_beta_0 + (true_beta_2 * x_2) + true_error
hist(x = y_2)

# Scatter plot of the second response against the second predictor.
plot(x = x_2, y = y_2, pch = 20, col = "blue")

# Response driven by both predictors, sharing the intercept and noise term.
y_3 <- true_beta_0 + true_beta_1 * x_1 + true_beta_2 * x_2 + true_error
hist(y_3)

# Collapse the two predictors into a single combined regressor.
x_3 <- x_1 + x_2
# Two-column numeric matrix; cbind() names the columns "y_3" and "x_3".
dataExample <- cbind(y_3, x_3)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dataExample)
}
# Predictor on the x-axis and response on the y-axis, matching the scatter
# plots drawn for (x_1, y_1) and (x_2, y_2).
plot(x_3, y_3, pch = 20, col = "yellow")

# Fit the three least-squares models: one per single-predictor response,
# plus the two-predictor model for y_3.
model_1 <- lm(formula = y_1 ~ x_1)
model_2 <- lm(formula = y_2 ~ x_2)
model_3 <- lm(formula = y_3 ~ x_1 + x_2)

# Print the coefficient table and fit statistics for the x_1-only model.
summary(object = model_1)
##
## Call:
## lm(formula = y_1 ~ x_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5132 -1.3947 0.0384 1.3624 6.6912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.240839 0.079000 15.71 <2e-16 ***
## x_1 -8.204580 0.009026 -909.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 998 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9988
## F-statistic: 8.263e+05 on 1 and 998 DF, p-value: < 2.2e-16
# Keep the fitted intercept and slope of model_1 for later use.
coefs_1 <- coef(object = model_1)

# Print the coefficient table and fit statistics for the x_2-only model.
summary(object = model_2)
##
## Call:
## lm(formula = y_2 ~ x_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5879 -1.3796 0.0555 1.3654 6.7006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21728 0.07504 16.22 <2e-16 ***
## x_2 -2.99999 0.01283 -233.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 998 degrees of freedom
## Multiple R-squared: 0.9821, Adjusted R-squared: 0.9821
## F-statistic: 5.468e+04 on 1 and 998 DF, p-value: < 2.2e-16
# Keep the fitted intercept and slope of model_2 for later use.
coefs_2 <- coef(object = model_2)

# Print the coefficient table and fit statistics for the two-predictor model.
summary(object = model_3)
##
## Call:
## lm(formula = y_3 ~ x_1 + x_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5132 -1.3947 0.0384 1.3623 6.6912
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.240873 0.088315 14.05 <2e-16 ***
## x_1 -8.204580 0.009031 -908.53 <2e-16 ***
## x_2 -3.000011 0.012835 -233.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.024 on 997 degrees of freedom
## Multiple R-squared: 0.9989, Adjusted R-squared: 0.9989
## F-statistic: 4.393e+05 on 2 and 997 DF, p-value: < 2.2e-16
# Keep the fitted coefficients of the two-predictor model.
coefs_3 <- coef(model_3)

# Fix the RNG state so the train/test split below is reproducible.
set.seed(100)
# Randomly assign 80% of the rows to training; the rest form the test set.
# seq_len() stays correct for zero rows, where 1:nrow() would yield c(1, 0).
trainingRowIndex <- sample(
  seq_len(nrow(dataExample)),
  0.8 * nrow(dataExample)
)
trainingData <- as.data.frame(dataExample[trainingRowIndex, ])
testData <- as.data.frame(dataExample[-trainingRowIndex, ])
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(trainingData)
}
# Refer to columns by bare name so the model looks them up in `data`.
# With the earlier `trainingData$y_3 ~ trainingData$x_3` formula, predict()
# ignored `newdata`, returned fits for the 800 training rows and warned:
# 'newdata' had 200 rows but variables found have 800 rows
lmMod <- lm(y_3 ~ x_3, data = trainingData)
# Predictions for the held-out test rows (one per row of testData).
modPred <- predict(lmMod, newdata = testData)
summary(lmMod)
##
## Call:
## lm(formula = trainingData$y_3 ~ trainingData$x_3, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.765 -15.081 0.102 14.305 62.601
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.88243 1.04007 3.733 0.000203 ***
## trainingData$x_3 -6.50378 0.08675 -74.967 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21.22 on 798 degrees of freedom
## Multiple R-squared: 0.8757, Adjusted R-squared: 0.8755
## F-statistic: 5620 on 1 and 798 DF, p-value: < 2.2e-16
Tested again with true_beta_0 set to 3.1 and true_beta_1 set to 7.3 (true_beta_2 stays at -3.0).
library(readr)
# Re-simulate the first predictor: 1000 draws from a normal distribution
# with mean 5 and standard deviation 7, then inspect its distribution.
x_1 <- rnorm(n = 1000, mean = 5, sd = 7)
hist(x = x_1, col = "grey")

# Fresh noise term: 1000 draws from a normal distribution with mean 0, sd 2.
true_error <- rnorm(n = 1000, mean = 0, sd = 2)

# Intercept and slope for this second run of the experiment.
true_beta_0 <- 3.1
true_beta_1 <- 7.3

# Response generated as intercept + slope * predictor + noise.
y_1 <- true_beta_0 + (true_beta_1 * x_1) + true_error
hist(x = y_1)

# Scatter plot of the response against the predictor.
plot(x = x_1, y = y_1, pch = 20, col = "red")

# Second predictor: 1000 draws from a normal distribution with mean 3, sd 5.
x_2 <- rnorm(n = 1000, mean = 3, sd = 5)

# Slope for x_2; the intercept and the noise term are the ones defined earlier.
true_beta_2 <- -3.0
y_2 <- true_beta_0 + (true_beta_2 * x_2) + true_error
hist(x = y_2)

# Scatter plot of the second response against the second predictor.
plot(x = x_2, y = y_2, pch = 20, col = "blue")

# Response driven by both predictors, sharing the intercept and noise term.
y_3 <- true_beta_0 + true_beta_1 * x_1 + true_beta_2 * x_2 + true_error
hist(y_3)

# Collapse the two predictors into a single combined regressor.
x_3 <- x_1 + x_2
# Two-column numeric matrix; cbind() names the columns "y_3" and "x_3".
dataExample <- cbind(y_3, x_3)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dataExample)
}
# Predictor on the x-axis and response on the y-axis, matching the scatter
# plots drawn for (x_1, y_1) and (x_2, y_2).
plot(x_3, y_3, pch = 20, col = "yellow")

# Fit the three least-squares models: one per single-predictor response,
# plus the two-predictor model for y_3.
model_1 <- lm(formula = y_1 ~ x_1)
model_2 <- lm(formula = y_2 ~ x_2)
model_3 <- lm(formula = y_3 ~ x_1 + x_2)

# Print the coefficient table and fit statistics for the x_1-only model.
summary(object = model_1)
##
## Call:
## lm(formula = y_1 ~ x_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7810 -1.4227 0.0217 1.3656 6.3425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.085258 0.079495 38.81 <2e-16 ***
## x_1 7.302098 0.009001 811.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.041 on 998 degrees of freedom
## Multiple R-squared: 0.9985, Adjusted R-squared: 0.9985
## F-statistic: 6.582e+05 on 1 and 998 DF, p-value: < 2.2e-16
# Keep the fitted intercept and slope of model_1 for later use.
coefs_1 <- coef(object = model_1)

# Print the coefficient table and fit statistics for the x_2-only model.
summary(object = model_2)
##
## Call:
## lm(formula = y_2 ~ x_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9241 -1.4647 0.0387 1.3306 6.1117
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.19255 0.07410 43.09 <2e-16 ***
## x_2 -3.03372 0.01286 -235.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.034 on 998 degrees of freedom
## Multiple R-squared: 0.9824, Adjusted R-squared: 0.9824
## F-statistic: 5.563e+04 on 1 and 998 DF, p-value: < 2.2e-16
# Keep the fitted intercept and slope of model_2 for later use.
coefs_2 <- coef(object = model_2)

# Print the coefficient table and fit statistics for the two-predictor model.
summary(object = model_3)
##
## Call:
## lm(formula = y_3 ~ x_1 + x_2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9212 -1.4661 0.0385 1.3282 6.1138
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.190194 0.088876 35.9 <2e-16 ***
## x_1 7.300433 0.008997 811.4 <2e-16 ***
## x_2 -3.033675 0.012901 -235.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.035 on 997 degrees of freedom
## Multiple R-squared: 0.9987, Adjusted R-squared: 0.9987
## F-statistic: 3.722e+05 on 2 and 997 DF, p-value: < 2.2e-16
# Keep the fitted coefficients of the two-predictor model.
coefs_3 <- coef(model_3)

# Fix the RNG state so the train/test split below is reproducible.
set.seed(100)
# Randomly assign 80% of the rows to training; the rest form the test set.
# seq_len() stays correct for zero rows, where 1:nrow() would yield c(1, 0).
trainingRowIndex <- sample(
  seq_len(nrow(dataExample)),
  0.8 * nrow(dataExample)
)
trainingData <- as.data.frame(dataExample[trainingRowIndex, ])
testData <- as.data.frame(dataExample[-trainingRowIndex, ])
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(trainingData)
}
# Refer to columns by bare name so the model looks them up in `data`.
# With the earlier `trainingData$y_3 ~ trainingData$x_3` formula, predict()
# ignored `newdata`, returned fits for the 800 training rows and warned:
# 'newdata' had 200 rows but variables found have 800 rows
lmMod <- lm(y_3 ~ x_3, data = trainingData)
# Predictions for the held-out test rows (one per row of testData).
modPred <- predict(lmMod, newdata = testData)
summary(lmMod)
##
## Call:
## lm(formula = trainingData$y_3 ~ trainingData$x_3, data = trainingData)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127.269 -28.681 -0.687 29.381 140.821
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4813 2.1184 -0.227 0.82
## trainingData$x_3 4.0484 0.1808 22.395 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.65 on 798 degrees of freedom
## Multiple R-squared: 0.3859, Adjusted R-squared: 0.3852
## F-statistic: 501.5 on 1 and 798 DF, p-value: < 2.2e-16