# A Simple Regression
# Looks at a single variable against another single variable.
# Simple linear regression on the built-in mtcars data: horsepower (hp)
# as a function of weight (wt).
# attach() puts the mtcars columns on the search path so `wt` and `hp` can be
# referenced by bare name here and by the abline() call further down.
attach(mtcars)
# Scatter plot with weight on the x axis and horsepower on the y axis.
plot(wt, hp, main = "Weight vs. HP", col = "blue", type = "p")
# Fit the model once and keep it so it can be inspected again later.
# summary() has no `col` argument: the former `col = 'red'` was silently
# swallowed by `...` and had no effect, so it has been dropped.
hp_wt_fit <- lm(hp ~ wt)
summary(hp_wt_fit)
##
## Call:
## lm(formula = hp ~ wt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83.430 -33.596 -13.587 7.913 172.030
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.821 32.325 -0.056 0.955
## wt 46.160 9.625 4.796 4.15e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 52.44 on 30 degrees of freedom
## Multiple R-squared: 0.4339, Adjusted R-squared: 0.4151
## F-statistic: 23 on 1 and 30 DF, p-value: 4.146e-05
# Overlay the least-squares line for hp ~ wt, in red, on the scatter plot
# drawn earlier (wt and hp are resolved through attach(mtcars)).
abline(reg = lm(formula = hp ~ wt), col = "red")
library(datarium)
## Warning: package 'datarium' was built under R version 3.5.2
# Multiple Linear Regression
# Expose the columns of `marketing` (presumably shipped with the datarium
# package loaded above) by bare name; the model fitted further down refers to
# them that way.
attach(what = marketing)
# Preview the first six rows to see which columns are available.
head(x = marketing, n = 6L)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
## 4 181.80 49.56 70.20 22.20
## 5 216.96 12.96 70.08 15.48
## 6 10.44 58.68 90.00 8.64
# The marketing data lets us model sales against SEVERAL predictors at once
# (facebook, youtube and newspaper).
# To regress one response on multiple variables, join the predictors with `+`
# on the right-hand side of the formula, as below.
MyMarketingModel <- lm(formula = sales ~ facebook + newspaper + youtube)
summary(object = MyMarketingModel)
##
## Call:
## lm(formula = sales ~ facebook + newspaper + youtube)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5932 -1.0690 0.2902 1.4272 3.3951
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.526667 0.374290 9.422 <2e-16 ***
## facebook 0.188530 0.008611 21.893 <2e-16 ***
## newspaper -0.001037 0.005871 -0.177 0.86
## youtube 0.045765 0.001395 32.809 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.023 on 196 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8956
## F-statistic: 570.3 on 3 and 196 DF, p-value: < 2.2e-16
# A scatter-plot matrix gives a quick visual check of how each pair of
# variables in the data set relates to one another.
pairs(x = marketing)
#### Look at the bottom row, where sales is on the y axis: can you see a linear trend against youtube and facebook, but not against newspaper? Compare this with the Pr(>|t|) value for newspaper in the model summary above -- do they agree?
library(readr)
# Import the lung-capacity data: a tab-separated text file with a header row
# (read.delim() defaults to header = TRUE and sep = "\t").
# NOTE(review): the path below is machine-specific -- point it at your own copy
# of LungCapData.txt before running.
LungCapacity <- read.delim(
  file = "/Users/nickvohra/Documents/R/Presentations/Presentation 13/LungCapData.txt"
)
# With the data loaded, attach it so the models and plots that follow can
# refer to its columns by bare name.
attach(what = LungCapacity)
# First question: are age and lung capacity linearly related?
# Regress LungCap on Age and inspect the fit.
summary(object = lm(formula = LungCap ~ Age))
##
## Call:
## lm(formula = LungCap ~ Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7799 -1.0203 -0.0005 0.9789 4.2650
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.14686 0.18353 6.249 7.06e-10 ***
## Age 0.54485 0.01416 38.476 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.526 on 723 degrees of freedom
## Multiple R-squared: 0.6719, Adjusted R-squared: 0.6714
## F-statistic: 1480 on 1 and 723 DF, p-value: < 2.2e-16
# Correlation between lung capacity and age (cor() defaults to Pearson).
cor(x = LungCap, y = Age)
## [1] 0.8196749
# Regress lung capacity on all five predictors at once and print the summary.
summary(
  object = lm(
    formula = LungCap ~ Age + Height + Smoke + Gender + Caesarean
  )
)
##
## Call:
## lm(formula = LungCap ~ Age + Height + Smoke + Gender + Caesarean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3388 -0.7200 0.0444 0.7093 3.0172
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.32249 0.47097 -24.041 < 2e-16 ***
## Age 0.16053 0.01801 8.915 < 2e-16 ***
## Height 0.26411 0.01006 26.248 < 2e-16 ***
## Smokeyes -0.60956 0.12598 -4.839 1.60e-06 ***
## Gendermale 0.38701 0.07966 4.858 1.45e-06 ***
## Caesareanyes -0.21422 0.09074 -2.361 0.0185 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.02 on 719 degrees of freedom
## Multiple R-squared: 0.8542, Adjusted R-squared: 0.8532
## F-statistic: 842.8 on 5 and 719 DF, p-value: < 2.2e-16
# Look at the coefficients of the binary (yes/no) predictors in the output
# above: lm() treats one of the two levels as the baseline and reports a
# coefficient for the other one (e.g. `Smokeyes`).
# Additive model: lung capacity explained by age plus smoking status.
SmokingMod <- lm(formula = LungCap ~ Age + Smoke)
summary(object = SmokingMod)
##
## Call:
## lm(formula = LungCap ~ Age + Smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8559 -1.0289 -0.0363 1.0083 4.1995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08572 0.18299 5.933 4.61e-09 ***
## Age 0.55540 0.01438 38.628 < 2e-16 ***
## Smokeyes -0.64859 0.18676 -3.473 0.000546 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.514 on 722 degrees of freedom
## Multiple R-squared: 0.6773, Adjusted R-squared: 0.6764
## F-statistic: 757.5 on 2 and 722 DF, p-value: < 2.2e-16
# Plot lung capacity against age, split by smoking status, and overlay the
# fitted lines of the additive model `SmokingMod` (LungCap ~ Age + Smoke).
# Smokers are drawn as filled points in colour 1, non-smokers in colour 2.
# Axis limits are taken from the full data: plot() sizes the axes from the
# points it is given, so limits based on the smokers alone would clip any
# non-smoker that falls outside their range.
plot(Age[Smoke == "yes"], LungCap[Smoke == "yes"],
     col = 1, pch = 19, xlab = "Age", ylab = "LungCap",
     xlim = range(Age, na.rm = TRUE), ylim = range(LungCap, na.rm = TRUE))
points(Age[Smoke == "no"], LungCap[Smoke == "no"], col = 2)
# Take the line parameters from the fitted model instead of typing them in.
# The previously hard-coded values (1.27/.49 and 1.05/.55) are the estimates
# of the interaction model that is only fitted further down; the additive
# model implies two parallel lines sharing the Age slope.
smoking_mod_coef <- coef(SmokingMod)
# Smokers: baseline intercept shifted by the `Smokeyes` coefficient.
abline(a = smoking_mod_coef[["(Intercept)"]] + smoking_mod_coef[["Smokeyes"]],
       b = smoking_mod_coef[["Age"]], col = 1)
# Non-smokers: the baseline level, so intercept and slope are used as-is.
abline(a = smoking_mod_coef[["(Intercept)"]],
       b = smoking_mod_coef[["Age"]], col = 2)
# Does the effect of age differ between smokers and non-smokers?
# Add the Smoke:Age interaction term to the additive model to find out.
SmokingEffect <- lm(formula = LungCap ~ Age + Smoke + Smoke:Age)
summary(object = SmokingEffect)
##
## Call:
## lm(formula = LungCap ~ Age + Smoke + Smoke:Age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8586 -1.0174 -0.0251 1.0004 4.1996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.05157 0.18706 5.622 2.7e-08 ***
## Age 0.55823 0.01473 37.885 < 2e-16 ***
## Smokeyes 0.22601 1.00755 0.224 0.823
## Age:Smokeyes -0.05970 0.06759 -0.883 0.377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.515 on 721 degrees of freedom
## Multiple R-squared: 0.6776, Adjusted R-squared: 0.6763
## F-statistic: 505.1 on 3 and 721 DF, p-value: < 2.2e-16
# Plot lung capacity against age, split by smoking status, and overlay the
# fitted lines of the interaction model `SmokingEffect`
# (LungCap ~ Age + Smoke + Smoke:Age).
# Smokers are drawn as filled points in colour 1, non-smokers in colour 2.
# Axis limits are taken from the full data: plot() sizes the axes from the
# points it is given, so limits based on the smokers alone would clip any
# non-smoker that falls outside their range.
plot(Age[Smoke == "yes"], LungCap[Smoke == "yes"],
     col = 1, pch = 19, xlab = "Age", ylab = "LungCap",
     xlim = range(Age, na.rm = TRUE), ylim = range(LungCap, na.rm = TRUE))
points(Age[Smoke == "no"], LungCap[Smoke == "no"], col = 2)
# Add the regression lines using the model's own coefficients. The values
# typed in before (1.27/.49 and 1.05/.55) were truncated copies of these
# estimates; reading them from the fit keeps the lines exact and in sync
# with the data.
smoking_effect_coef <- coef(SmokingEffect)
# Smokers: both intercept and slope are shifted by the smoker-specific terms.
abline(
  a = smoking_effect_coef[["(Intercept)"]] + smoking_effect_coef[["Smokeyes"]],
  b = smoking_effect_coef[["Age"]] + smoking_effect_coef[["Age:Smokeyes"]],
  col = 1
)
# Non-smokers: the baseline level, so intercept and slope are used as-is.
abline(a = smoking_effect_coef[["(Intercept)"]],
       b = smoking_effect_coef[["Age"]], col = 2)