When you buy a book off Amazon, you get a quote for how much it costs to ship. This is based on the weight of the book. If you didn’t know the weight of the book, what other characteristics of it could you measure to help predict the weight?
library(tidyverse)
#install.packages("DAAG")
library(DAAG)
data(allbacks)
books <- allbacks[, c(3, 1, 4)]
head(books)
## weight volume cover
## 1 800 885 hb
## 2 950 1016 hb
## 3 1050 1125 hb
## 4 350 239 hb
## 5 750 701 hb
## 6 600 641 hb
# Scatterplot of all books together
ggplot(data=books, aes(x=volume, y=weight))+
geom_point()+
ggtitle("Scatterplot of Volume vs Weight of Books")+
theme_bw()
m3<-lm(weight~volume, data=books)
summary(m3)
##
## Call:
## lm(formula = weight ~ volume, data = books)
##
## Residuals:
## Min 1Q Median 3Q Max
## -189.97 -109.86 38.08 109.73 145.57
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 107.67931 88.37758 1.218 0.245
## volume 0.70864 0.09746 7.271 6.26e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 123.9 on 13 degrees of freedom
## Multiple R-squared: 0.8026, Adjusted R-squared: 0.7875
## F-statistic: 52.87 on 1 and 13 DF, p-value: 6.262e-06
# Scatterplot of all books together and fitted model
ggplot(data=books, aes(x=volume, y=weight))+
geom_point()+
ggtitle("Scatterplot of Volume vs Weight of Books")+
theme_bw()+
geom_abline(slope=m3$coefficients[2], intercept = m3$coefficients[1])
# What about considering cover type
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
geom_point()+
ggtitle("Scatterplot of Volume vs Weight of Books")+
theme_bw()
# Include indicator to shift y-intercept
m4<-lm(weight~volume+cover, data=books)
summary(m4)
##
## Call:
## lm(formula = weight ~ volume + cover, data = books)
##
## Residuals:
## Min 1Q Median 3Q Max
## -110.10 -32.32 -16.10 28.93 210.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 197.96284 59.19274 3.344 0.005841 **
## volume 0.71795 0.06153 11.669 6.6e-08 ***
## coverpb -184.04727 40.49420 -4.545 0.000672 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.2 on 12 degrees of freedom
## Multiple R-squared: 0.9275, Adjusted R-squared: 0.9154
## F-statistic: 76.73 on 2 and 12 DF, p-value: 1.455e-07
m4$coefficients
## (Intercept) volume coverpb
## 197.9628436 0.7179537 -184.0472714
# plot with shift of y-intercept
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
geom_point()+
ggtitle("Scatterplot of Volume vs Weight of Books")+
theme_bw()+
geom_abline(slope=m4$coefficients[2], intercept = m4$coefficients[1], col=2)+
geom_abline(slope=m4$coefficients[2], intercept = m4$coefficients[1]+m4$coefficients[3], col=4)
# Include interaction to shift intercept and change slope
m5<-lm(weight~volume*cover, data=books)
summary(m5)
##
## Call:
## lm(formula = weight ~ volume * cover, data = books)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.67 -32.07 -21.82 17.94 215.91
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 161.58654 86.51918 1.868 0.0887 .
## volume 0.76159 0.09718 7.837 7.94e-06 ***
## coverpb -120.21407 115.65899 -1.039 0.3209
## volume:coverpb -0.07573 0.12802 -0.592 0.5661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 80.41 on 11 degrees of freedom
## Multiple R-squared: 0.9297, Adjusted R-squared: 0.9105
## F-statistic: 48.5 on 3 and 11 DF, p-value: 1.245e-06
m5$coefficients
## (Intercept) volume coverpb volume:coverpb
## 161.58654141 0.76159284 -120.21406570 -0.07573366
# plot with shift of y-intercept
ggplot(data=books, aes(x=volume, y=weight, color=cover))+
geom_point()+
ggtitle("Scatterplot of Volume vs Weight of Books")+
theme_bw()+
geom_abline(slope=m5$coefficients[2], intercept = m5$coefficients[1], col=2)+
geom_abline(slope=m5$coefficients[2]+m5$coefficients[4], intercept = m5$coefficients[1]+m5$coefficients[3], col=4)
## Multiple linear regression
nyc <- read.csv("http://andrewpbray.github.io/data/nyc.csv")
head(nyc)
## Case Restaurant Price Food Decor Service East
## 1 1 Daniella Ristorante 43 22 18 20 0
## 2 2 Tello's Ristorante 32 20 19 19 0
## 3 3 Biricchino 34 21 13 18 0
## 4 4 Bottino 41 20 20 17 0
## 5 5 Da Umberto 54 24 19 21 0
## 6 6 Le Madri 52 22 22 21 0
nyc%>%
select(Price, Food, Decor)%>%
pairs()
m1<-lm(Price~Food+Decor, data=nyc)
summary(m1)
##
## Call:
## lm(formula = Price ~ Food + Decor, data = nyc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.945 -3.766 -0.153 3.701 18.757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.5002 4.7230 -5.187 6.19e-07 ***
## Food 1.6461 0.2615 6.294 2.68e-09 ***
## Decor 1.8820 0.1919 9.810 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.788 on 165 degrees of freedom
## Multiple R-squared: 0.6167, Adjusted R-squared: 0.6121
## F-statistic: 132.7 on 2 and 165 DF, p-value: < 2.2e-16
anova(m1)
## Analysis of Variance Table
##
## Response: Price
## Df Sum Sq Mean Sq F value Pr(>F)
## Food 1 5670.3 5670.3 169.262 < 2.2e-16 ***
## Decor 1 3223.7 3223.7 96.228 < 2.2e-16 ***
## Residuals 165 5527.5 33.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
A way to quantify the impact of multicollinearity is to look at the VIF (variance inflation factor). A VIF close to 1 would imply no correlation. The larger the VIF the larger the amount of multicollinearity.
library(car)
vif(lm(Price~Food+Decor, data=nyc))
## Food Decor
## 1.340359 1.340359
As the climate grows warmer, we expect many animal species to move toward the poles in an attempt to maintain their preferred temperature range.
Do data on fish in the North Sea confirm this suspicion?
The data are 25 years of mean winter temperatures at the bottom of the North Sea (degrees Celsius) and the center of the distribution of anglerfish (sometimes called monkfish) in degrees of north latitude.
# EXAMPLE: Climate Change and Fish Habitats
# Data on anglerfish distribution
# Explanatory: Temp in C (mean winter temperature at bottom of North Sea)
# Response: Latitude of center for distribution of anglerfish
year<-c(1977:2001)
temp<-c(6.26, 6.26, 6.27, 6.31, 6.34, 6.32, 6.37, 6.39, 6.42,
6.52, 6.68, 6.76, 6.78, 6.89, 6.90, 6.93, 6.98,
7.02, 7.09, 7.13, 7.15, 7.29, 7.34, 7.57, 7.65)
lat<-c(57.20, 57.96, 57.65, 57.59, 58.01, 59.06, 56.85, 56.87, 57.43,
57.72, 57.83, 57.87, 57.48, 58.13, 58.52, 58.48, 57.89,
58.71, 58.07, 58.49, 58.28, 58.49, 58.01, 58.57, 58.90)
# Make a dataframe
fish<-data.frame(year, temp, lat)
library(tidyverse)
# Scatterplot of data looks linear
ggplot(data=fish, aes(temp, lat))+
geom_point()+
ggtitle("Scatterplot of Temp vs Fish Latitude")+
xlab("Temperature (C)")+
ylab("Fish Latitude")+
theme_bw()
# Fit a simple linear model
mod2<-lm(lat~temp)
summary(mod2)
##
## Call:
## lm(formula = lat ~ temp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81309 -0.27207 -0.02401 0.20523 1.43781
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.4524 1.5324 34.229 < 2e-16 ***
## temp 0.8180 0.2254 3.629 0.00141 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4734 on 23 degrees of freedom
## Multiple R-squared: 0.3641, Adjusted R-squared: 0.3364
## F-statistic: 13.17 on 1 and 23 DF, p-value: 0.001408
names(mod2)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
# Scatterplot with fitted line
ggplot(data=fish, aes(temp, lat))+
geom_point()+
ggtitle("Scatterplot of Temp vs Fish Latitude")+
xlab("Temperature (C)")+
ylab("Fish Latitude")+
theme_bw()+
geom_abline(slope=mod2$coefficients[2], intercept=mod2$coefficients[1],
color="blue", lty=2, lwd=1)
# QQ NORM Plot
hist(mod2$residuals)
qqnorm(mod2$residuals)
qqline(mod2$residuals)
fish<-cbind(fish,
fit=mod2$fitted.values,
residual=mod2$residuals)
ggplot(data=fish, aes(residual))+
geom_histogram(bins=8)+
ggtitle("Histogram of Residuals")+
theme_bw()
# Residual Plot
ggplot(data=fish, aes(temp, residual))+
geom_point()+
ggtitle("Residual Plot")+
xlab("Temperature (C)")+
ylab("Residuals")+
theme_bw()+
geom_hline(yintercept = 0,
color="blue", lty=2, lwd=1)