Topics for today!
- Topic: Checking correlation
- Topic: Simple Linear Regression Basics
- Topic: Checking the assumptions of linear regression
- Topic: Plotting simple linear regression
- Topic: Linear Regression Example in R
Data
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
house = read.table("house.txt", sep = "\t", header = T)
# quick pruning of outliers: keep prices within 3 standard deviations of the mean
house = subset(house, price >= (mean(price) - 3*sd(price)) & price <= (mean(price) + 3*sd(price)))
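As a quick sanity check (a sketch, not part of the original script; house_raw is a hypothetical unpruned copy of the data), you can count how many rows the 3-standard-deviation rule removes:
house_raw = read.table("house.txt", sep = "\t", header = T)                      # unpruned copy
outlier = abs(house_raw$price - mean(house_raw$price)) > 3*sd(house_raw$price)   # same rule as the subset() call above
sum(outlier)                                                                     # number of rows dropped as outliers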
1. Topic: Checking correlation
# Test if correlation between price and footage is nonzero.
cor.test(house$price, house$sqft_living)
##
## Pearson's product-moment correlation
##
## data: house$price and house$sqft_living
## t = 32.056, df = 580, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7681458 0.8270399
## sample estimates:
## cor
## 0.7995063
cor(house[,c("price", "sqft_living")]) # Compute the correlation itself
## price sqft_living
## price 1.0000000 0.7995063
## sqft_living 0.7995063 1.0000000
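If the relationship looks monotone but possibly nonlinear, a rank-based correlation is a common alternative (a sketch; not part of the original analysis):
cor(house$price, house$sqft_living, method = "spearman") # rank correlation; less sensitive to curvature and outliers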
Note that a correlation of zero does not imply there is no relationship, only that there is no linear relationship
x = cos(seq(0, 2*pi, .1))
y = sin(seq(0, 2*pi, .1))
plot(x,y)

cor(x,y) # Basically 0, although clearly related!
## [1] -0.0002215351
2. Topic: Simple Linear Regression Basics
# Let's use linear regression to model the relationship between price and living space
reg_model = lm(data = house, price ~ sqft_living) # Simple linear regression
reg_model_2 = lm(data = house, price ~ 0 + sqft_living) # Simple linear regression through the origin
# Examining the regression
reg_model$coefficients # gets the coeff. of the model
## (Intercept) sqft_living
## 76730.7926 132.6361
reg_model$fitted.values[1:10] # what the model predicts for each point; only the first 10 shown here
## 1 2 3 4 5 6 7 8
## 327413.1 392404.8 388425.7 328739.4 316802.2 280990.4 215998.7 255789.6
## 9 10
## 282316.8 344655.8
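To get a prediction for a square footage that is not in the data, predict() works on the fitted model (a sketch; the 2000 sqft value is just an illustration):
predict(reg_model, newdata = data.frame(sqft_living = 2000)) # i.e. 76730.79 + 132.64 * 2000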
avg_error = mean(abs(reg_model$fitted.values - house$price)) # same as mean(abs(reg_model$residuals))
summary(reg_model) # Can read off if coeff. are stat. sign. diff. from 0
##
## Call:
## lm(formula = price ~ sqft_living, data = house)
##
## Residuals:
## Min 1Q Median 3Q Max
## -162874 -32472 -6783 18664 353838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76730.793 9194.565 8.345 5.22e-16 ***
## sqft_living 132.636 4.138 32.056 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65600 on 580 degrees of freedom
## Multiple R-squared: 0.6392, Adjusted R-squared: 0.6386
## F-statistic: 1028 on 1 and 580 DF, p-value: < 2.2e-16
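The quantities in the summary can be recomputed directly from the residuals, which helps connect the output to the formulas (a sketch):
rss = sum(reg_model$residuals^2)               # residual sum of squares
tss = sum((house$price - mean(house$price))^2) # total sum of squares
1 - rss/tss                                    # Multiple R-squared
sqrt(rss / reg_model$df.residual)              # Residual standard error on 580 degrees of freedom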
3. Topic: Checking the assumptions of linear regression
# The assumptions are:
## 1. Linearity - Look at residuals
## 2. Normality - qqplot of residuals
## 3. Independence (of error term and response) - Next time
## 4. Constant Variance - Look at residuals
# Check the assumptions by examining the diagnostic plots
par(mfrow = c(2,2))
plot(reg_model)
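The same checks can also be built by hand if you only want the residual and QQ plots (a sketch using base graphics):
par(mfrow = c(1,2))
plot(reg_model$fitted.values, reg_model$residuals, xlab = "Fitted values", ylab = "Residuals") # look for curvature (linearity) and fanning (constant variance)
abline(h = 0, lty = 2)
qqnorm(reg_model$residuals) # points near the reference line support normality
qqline(reg_model$residuals)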

# If the assumptions are satisfied, we can do inference using our model. More next time!
4. Topic: Plotting simple linear regression
# Plotting code from before
price_lim = c(min(house$price), max(house$price))
sqrtft_lim = c(min(house$sqft_living), max(house$sqft_living))
plot(data = house[house$bedrooms <= 3,], price ~ sqft_living, col = "red", xlim = sqrtft_lim, ylim = price_lim)
par(new = T) # tells R to draw the next plot on top of the previous one
plot(data = house[house$bedrooms > 3,], price ~ sqft_living, col = "blue", xlim = sqrtft_lim, ylim = price_lim)
# Adding lines to plots
abline(reg_model, col = "black", lwd = 2) # Adds a regression line
abline(0,200, lwd = 2, col = "green") # Adds a second line with intercept 0 and slope 200 for comparison
abline(reg_model_2, lwd = 2, col = "black") # Adds the through-the-origin regression line
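An alternative to par(new = T) is to draw one plot and overlay the second group with points() (a sketch of the same figure, with a legend added):
plot(data = house[house$bedrooms <= 3,], price ~ sqft_living, col = "red", xlim = sqrtft_lim, ylim = price_lim)
points(house$sqft_living[house$bedrooms > 3], house$price[house$bedrooms > 3], col = "blue")
abline(reg_model, lwd = 2)
legend("topleft", legend = c("<= 3 bedrooms", "> 3 bedrooms"), col = c("red", "blue"), pch = 1)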

5. Topic: Linear Regression Example in R
setwd("~/Desktop/R Materials/mih140/Assignments/'20 Assignments/Data")
cba = read.table("cba_admissions_1999.txt", sep = "\t", header = T, quote = "", allowEscapes = T)
cba = cba[!is.na(cba$SAT_math),] # remove na rows
# In this example we examine the relationship between SAT_math score and SAT_verbal score in the cba dataset.
# 1: Fit a simple regression between SAT_verbal and SAT_math
res = lm(cba$SAT_verbal ~ cba$SAT_math)
# 2: Write out the regression equation
summary(res) # observe the fitted model: SAT_verbal = 290.08827 + 0.46625 * SAT_math
##
## Call:
## lm(formula = cba$SAT_verbal ~ cba$SAT_math)
##
## Residuals:
## Min 1Q Median 3Q Max
## -216.460 -42.014 -0.511 39.489 192.190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 290.08827 17.99666 16.12 <2e-16 ***
## cba$SAT_math 0.46625 0.03035 15.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 61.17 on 862 degrees of freedom
## Multiple R-squared: 0.2149, Adjusted R-squared: 0.214
## F-statistic: 236 on 1 and 862 DF, p-value: < 2.2e-16
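Because the model was fit with the cba$ names rather than a data = argument, the easiest way to use the equation for a new score is to plug into the coefficients directly (a sketch; 600 is just an example value):
coef(res)[1] + coef(res)[2] * 600 # predicted SAT_verbal for SAT_math = 600, i.e. 290.09 + 0.466*600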
# 3: Plot the regression line against the scatter plot, be sure to appropriately label your plot!
plot(cba$SAT_math, cba$SAT_verbal, main = "Linear Relationship Between SAT Math and SAT Verbal", xlab = "SAT Math Score", ylab = "SAT Verbal Score")
abline(res, col = "red")

# 4: Plot the residuals and the QQ plot, comment on trends you see in the data, and check the assumptions
par(mfrow = c(2,2))
plot(res)

# The residuals look fairly consistent across fitted values, so the constant-variance assumption and the independence of the error term from the predictor appear reasonable.
# The QQ plot is roughly straight and diagonal, supporting the normality assumption.
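As a numeric complement to the QQ plot (a sketch; not required here), a formal normality test can be run on the residuals:
shapiro.test(residuals(res)) # large p-value is consistent with normality; note the test becomes very sensitive with ~860 points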