source("demo_simple_regression_rsq.R")
## Plot the scenarios with interactive regression
# pts_1 <- interactive_regression_rsq()
# pts_2 <- interactive_regression_rsq()
# pts_3 <- interactive_regression_rsq()
# pts_4 <- interactive_regression_rsq()
# write.csv(pts_1, "interactive_regression_rsq/pts_1.csv", row.names = FALSE)
# write.csv(pts_2, "interactive_regression_rsq/pts_2.csv", row.names = FALSE)
# write.csv(pts_3, "interactive_regression_rsq/pts_3.csv", row.names = FALSE)
# write.csv(pts_4, "interactive_regression_rsq/pts_4.csv", row.names = FALSE)
## Recover the points in the scenarios
pts_1 <- read.csv(file = 'interactive_regression_rsq/pts_1.csv')
pts_2 <- read.csv(file = 'interactive_regression_rsq/pts_2.csv')
pts_3 <- read.csv(file = 'interactive_regression_rsq/pts_3.csv')
pts_4 <- read.csv(file = 'interactive_regression_rsq/pts_4.csv')
# Plot the scenarios
pts <- list(pts_1, pts_2, pts_3, pts_4)
sapply(pts, plot_regr_rsq)
## $family
## [1] "mono"
##
## $family
## [1] "mono"
##
## $family
## [1] "mono"
##
## $family
## [1] "mono"
We expect to have a stronger R2 in scenario 1.
We expect to have a stronger R2 in scenario 3.
We expect to have a larger SSE, SSR, and SST in scenario 2.
We expect to have a larger SSE, SSR, and SST in scenario 4.
prog <- read.csv("programmer_salaries.txt", sep = "\t")
head(prog)
prog_regr <- lm(Salary ~ Experience + Score + Degree, data = prog)
summary(prog_regr)
##
## Call:
## lm(formula = Salary ~ Experience + Score + Degree, data = prog)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8963 -1.7290 -0.3375 1.9699 5.0480
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.9448 7.3808 1.076 0.2977
## Experience 1.1476 0.2976 3.856 0.0014 **
## Score 0.1969 0.0899 2.191 0.0436 *
## Degree 2.2804 1.9866 1.148 0.2679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.396 on 16 degrees of freedom
## Multiple R-squared: 0.8468, Adjusted R-squared: 0.8181
## F-statistic: 29.48 on 3 and 16 DF, p-value: 9.417e-07
head(prog_regr$fitted.values, 5)
## 1 2 3 4 5
## 27.89626 37.95204 26.02901 32.11201 36.34251
head(prog_regr$residuals, 5)
## 1 2 3 4 5
## -3.8962605 5.0479568 -2.3290112 2.1879860 -0.5425072
X <- cbind(rep(1, 20), prog[, 1:3])
X <- as.matrix(X)
y <- prog$Salary
beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
beta_hat
## [,1]
## rep(1, 20) 7.944849
## Experience 1.147582
## Score 0.196937
## Degree 2.280424
y_hat <- X %*% beta_hat
head(y_hat, 5)
## [,1]
## [1,] 27.89626
## [2,] 37.95204
## [3,] 26.02901
## [4,] 32.11201
## [5,] 36.34251
res <- y - y_hat
head(res, 5)
## [,1]
## [1,] -3.8962605
## [2,] 5.0479568
## [3,] -2.3290112
## [4,] 2.1879860
## [5,] -0.5425072
SSR <- sum( (y_hat - mean(y))^2 )
SSE <- sum( (y - y_hat)^2 )
SST <- sum( (y - mean(y))^2 )
SSR
## [1] 507.896
SSE
## [1] 91.88949
SST
## [1] 599.7855
R2 <- SSR / SST
R2
## [1] 0.8467961
R2 <- cor(y, y_hat)^2
R2
## [,1]
## [1,] 0.8467961
auto <- read.table("auto-data.txt", header = FALSE, na.strings = "?")
names(auto) <- c("mpg", "cylinders", "displacement", "horsepower", "weight",
"acceleration", "model_year", "origin", "car_name")
pairs(~ mpg + displacement + horsepower + weight,
col = factor(auto$origin), pch = 20, data = auto)
round(cor(auto[, 1:8], use = "pairwise.complete.obs"), 2)
## mpg cylinders displacement horsepower weight acceleration
## mpg 1.00 -0.78 -0.80 -0.78 -0.83 0.42
## cylinders -0.78 1.00 0.95 0.84 0.90 -0.51
## displacement -0.80 0.95 1.00 0.90 0.93 -0.54
## horsepower -0.78 0.84 0.90 1.00 0.86 -0.69
## weight -0.83 0.90 0.93 0.86 1.00 -0.42
## acceleration 0.42 -0.51 -0.54 -0.69 -0.42 1.00
## model_year 0.58 -0.35 -0.37 -0.42 -0.31 0.29
## origin 0.56 -0.56 -0.61 -0.46 -0.58 0.21
## model_year origin
## mpg 0.58 0.56
## cylinders -0.35 -0.56
## displacement -0.37 -0.61
## horsepower -0.42 -0.46
## weight -0.31 -0.58
## acceleration 0.29 0.21
## model_year 1.00 0.18
## origin 0.18 1.00
The cylinders, displacement, horsepower, and weight variables seem to relate to mpg.
The relationship between mpg and displacement, horsepower, and weight respectively might not be linear.
auto_regr <-lm(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + model_year + factor(origin), data = auto)
The IVs displacement, weight, model_year, origin have significant relationship with mpg at 1% significance
summary(auto_regr)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + model_year + factor(origin), data = auto)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0095 -2.0785 -0.0982 1.9856 13.3608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.795e+01 4.677e+00 -3.839 0.000145 ***
## cylinders -4.897e-01 3.212e-01 -1.524 0.128215
## displacement 2.398e-02 7.653e-03 3.133 0.001863 **
## horsepower -1.818e-02 1.371e-02 -1.326 0.185488
## weight -6.710e-03 6.551e-04 -10.243 < 2e-16 ***
## acceleration 7.910e-02 9.822e-02 0.805 0.421101
## model_year 7.770e-01 5.178e-02 15.005 < 2e-16 ***
## factor(origin)2 2.630e+00 5.664e-01 4.643 4.72e-06 ***
## factor(origin)3 2.853e+00 5.527e-01 5.162 3.93e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.307 on 383 degrees of freedom
## (因為不存在,6 個觀察量被刪除了)
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8205
## F-statistic: 224.5 on 8 and 383 DF, p-value: < 2.2e-16
No, we cannot determine which independent variables are the most effective at increasing mpg. For each variable, we use the raw unit of its own, so it is not fair to compare the coefficients before standardized.
We should not standardize origin since it is a categorical variable. The slopes are easier to compare since the value are larger and on the similar scale.
auto_std <- data.frame(scale(auto[, 1:7]))
origin <- auto$origin
auto_std <- cbind(auto_std, origin)
auto_std_regr <-lm(mpg ~ cylinders + displacement + horsepower + weight +
acceleration + model_year + factor(origin), data = auto_std)
summary(auto_std_regr)
##
## Call:
## lm(formula = mpg ~ cylinders + displacement + horsepower + weight +
## acceleration + model_year + factor(origin), data = auto_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15270 -0.26593 -0.01257 0.25404 1.70942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.13323 0.03174 -4.198 3.35e-05 ***
## cylinders -0.10658 0.06991 -1.524 0.12821
## displacement 0.31989 0.10210 3.133 0.00186 **
## horsepower -0.08955 0.06751 -1.326 0.18549
## weight -0.72705 0.07098 -10.243 < 2e-16 ***
## acceleration 0.02791 0.03465 0.805 0.42110
## model_year 0.36760 0.02450 15.005 < 2e-16 ***
## factor(origin)2 0.33649 0.07247 4.643 4.72e-06 ***
## factor(origin)3 0.36505 0.07072 5.162 3.93e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.423 on 383 degrees of freedom
## (因為不存在,6 個觀察量被刪除了)
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8205
## F-statistic: 224.5 on 8 and 383 DF, p-value: < 2.2e-16
The original non-significant independent variables, cylinders, horsepower, and acceleration become significant when we regress mpg over them individually.
cyl_std_regr <-lm(mpg ~ cylinders, data = auto_std)
hp_std_regr <-lm(mpg ~ horsepower, data = auto_std)
acc_std_regr <-lm(mpg ~ acceleration, data = auto_std)
summary(cyl_std_regr)
##
## Call:
## lm(formula = mpg ~ cylinders, data = auto_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82455 -0.43297 -0.08288 0.32674 2.29046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.834e-15 3.169e-02 0.00 1
## cylinders -7.754e-01 3.173e-02 -24.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6323 on 396 degrees of freedom
## Multiple R-squared: 0.6012, Adjusted R-squared: 0.6002
## F-statistic: 597.1 on 1 and 396 DF, p-value: < 2.2e-16
summary(hp_std_regr)
##
## Call:
## lm(formula = mpg ~ horsepower, data = auto_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.73632 -0.41699 -0.04395 0.35351 2.16531
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008784 0.031701 -0.277 0.782
## horsepower -0.777334 0.031742 -24.489 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6277 on 390 degrees of freedom
## (因為不存在,6 個觀察量被刪除了)
## Multiple R-squared: 0.6059, Adjusted R-squared: 0.6049
## F-statistic: 599.7 on 1 and 390 DF, p-value: < 2.2e-16
summary(acc_std_regr)
##
## Call:
## lm(formula = mpg ~ acceleration, data = auto_std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3039 -0.7210 -0.1589 0.6087 2.9672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.004e-16 4.554e-02 0.000 1
## acceleration 4.203e-01 4.560e-02 9.217 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9085 on 396 degrees of freedom
## Multiple R-squared: 0.1766, Adjusted R-squared: 0.1746
## F-statistic: 84.96 on 1 and 396 DF, p-value: < 2.2e-16
The residuals are normally distributed and centered around zero.
plot(density(auto_std_regr$residuals))