#Linear regession Homework [75pts] let’s analyze the correlation
between displacement and Highway miles per gallon but this time we will
only consider cars with 4 cylinder engines.
Setup block
#reset workspace
rm(list = ls())
#load libraries
library(dplyr)
library(ggplot2)
#load the data
mpg <- read.csv("https://tinyurl.com/didampg2022")
#0. [1pts] rename this file to include your name
#1. [5pts] p1:using dplyr functions create a new dataset called
mpg_4c that only includes cars with 4 cylinder engines. p2:we will use
this dataframe for the rest of the homework so double check using head
or View or table that only 4 cilinder cars are in the dataset mpg_4c
#p1
mpg_4c <- mpg %>% filter(Cyl == 4)
#p2
head(mpg_4c)
#2. [10pts] p1:using geom_jitter plot the displacement on the x and
the highway mpg on the y axis. p2: add appropriate axis labels. make
sure you are using the mpg_4c dataset
ggplot(mpg_4c) +
geom_jitter(aes(x=Displ, y= Hwy.MPG), width=0.5) +
xlab("Engine Displacement (L)") +
ylab("Highway MPG")
#3. [5pts] Describe this trend.
The plot suggests that cars with bigger engines don’t go as far on
gallon of gas. the Smaller the engine the more miles you can get per
gallon.
#4. [12pts]Split the dataset in a 1/5 ratio 4/5 to the training data,
1/5 to the validation data. follow the example from class, use the names
mpg4.train mpg4.valid for the two resulting dataframes make sure you are
using the 4 cylinder dataset everywhere
set.seed(13)
split <- 0.8
rows <- nrow(mpg_4c)
train.entries <- sample(rows, rows*split)
mpg_4c.train <- mpg_4c[train.entries, ]
mpg_4c.valid <- mpg_4c[-train.entries, ]
#5. [5pts]Generate the linear model and save it in the model
variable
model <- lm(Hwy.MPG~ Displ, data=mpg_4c.train)
#6. [2pts] using summary print the model
summary(model)
#7. [5pts] using the information from summary write the y hat
equation:
yhat =
#8. [10pts] calculate the residuals for both training and testing
data. use mutate to store yhat and the residual as columns in each
dataset
#training data
mpg_4c.train <- mpg_4c.train %>%
mutate(yhat = predict(model, newdata=mpg_4c.train)) %>%
mutate(residual = Hwy.MPG - yhat)
#validation data
mpg_4c.valid <- mpg_4c.valid %>%
mutate(yhat = predict(model, newdata=mpg_4c.valid)) %>%
mutate(residual = Hwy.MPG - yhat)
#9. [10pts] make a plot of y_hat on the x axis and the residuals on
the y axis using geom_jitter with a width of 1 for the validation data.
include a geom_hline on y=0 and a limit the y axis to the range
[-20,20]
ggplot(mpg_4c.valid, aes(x = yhat, y = residual)) +
geom_jitter(width = 1) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
ylim(-20, 20) +
xlab("Predicted Mileage (mpg)") +
ylab("Residual (mpg)")
#10. [5pts] looking at this graph what can you tell me about this set
of residuals: p1:does it show a trend? p2:how is the spread? Looking at
this set of residuals it seems as though there is a lack of a trend
because the line of best fit is smack dab in the middle of the graph. so
it’s equally divided.
#11. [5pts] p1:darw a histogram of the residuals , limit the x axis
to -20,20, p2:visually estimate the range where most of the observations
fall.
ggplot(mpg_4c.valid) +
geom_histogram(aes(x=residual), bins=50, color="black") +
xlim(-20, 20) +
xlab("Residual (miles per gallon)")
cat("the range is [A,B]")
#12. [5pts] are we overfitting? compare the standard deviation of the
residuals from training and validation data and write wether we are
overfitting or not and why.
sd(mpg_4c.train$residual)
sd(mpg_4c.valid$residual)
#overfitting or not overfitting?
based on how close the standard deviations and close they are I would
say that we are not overfitting
LS0tCnRpdGxlOiAiTGluZWFyIHJlZ3Jlc3Npb24iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiNMaW5lYXIgcmVnZXNzaW9uIEhvbWV3b3JrIFs3NXB0c10KbGV0J3MgYW5hbHl6ZSB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiBkaXNwbGFjZW1lbnQgYW5kIEhpZ2h3YXkgbWlsZXMgcGVyIGdhbGxvbiBidXQgdGhpcyB0aW1lIHdlIHdpbGwgb25seSBjb25zaWRlciBjYXJzIHdpdGggNCBjeWxpbmRlciBlbmdpbmVzLgoKU2V0dXAgYmxvY2sKYGBge3IsIHdhcm5pbmc9RkFMU0UsbWVzc2FnZT1GQUxTRX0KI3Jlc2V0IHdvcmtzcGFjZQpybShsaXN0ID0gbHMoKSkKI2xvYWQgbGlicmFyaWVzCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKI2xvYWQgdGhlIGRhdGEKbXBnIDwtIHJlYWQuY3N2KCJodHRwczovL3Rpbnl1cmwuY29tL2RpZGFtcGcyMDIyIikKYGBgCgojMC4gWzFwdHNdIHJlbmFtZSB0aGlzIGZpbGUgdG8gaW5jbHVkZSB5b3VyIG5hbWUKCiMxLiBbNXB0c10gcDE6dXNpbmcgZHBseXIgZnVuY3Rpb25zIGNyZWF0ZSBhIG5ldyBkYXRhc2V0IGNhbGxlZCBtcGdfNGMgdGhhdCBvbmx5IGluY2x1ZGVzIGNhcnMgd2l0aCA0IGN5bGluZGVyIGVuZ2luZXMuIHAyOndlIHdpbGwgdXNlIHRoaXMgZGF0YWZyYW1lIGZvciB0aGUgcmVzdCBvZiB0aGUgaG9tZXdvcmsgc28gZG91YmxlIGNoZWNrIHVzaW5nIGhlYWQgb3IgVmlldyBvciB0YWJsZSB0aGF0IG9ubHkgNCBjaWxpbmRlciBjYXJzIGFyZSBpbiB0aGUgZGF0YXNldCBtcGdfNGMKYGBge3J9CiNwMQptcGdfNGMgPC0gbXBnICU+JSBmaWx0ZXIoQ3lsID09IDQpCgojcDIKaGVhZChtcGdfNGMpCmBgYAoKIzIuIFsxMHB0c10gcDE6dXNpbmcgZ2VvbV9qaXR0ZXIgcGxvdCB0aGUgZGlzcGxhY2VtZW50IG9uIHRoZSB4IGFuZCB0aGUgaGlnaHdheSBtcGcgb24gdGhlIHkgYXhpcy4gcDI6IGFkZCBhcHByb3ByaWF0ZSBheGlzIGxhYmVscy4gbWFrZSBzdXJlIHlvdSBhcmUgdXNpbmcgdGhlIG1wZ180YyBkYXRhc2V0CmBgYHtyfQpnZ3Bsb3QobXBnXzRjKSArCiAgZ2VvbV9qaXR0ZXIoYWVzKHg9RGlzcGwsIHk9IEh3eS5NUEcpLCB3aWR0aD0wLjUpICsKICB4bGFiKCJFbmdpbmUgRGlzcGxhY2VtZW50IChMKSIpICsKICB5bGFiKCJIaWdod2F5IE1QRyIpCgpgYGAKIzMuIFs1cHRzXSBEZXNjcmliZSB0aGlzIHRyZW5kLgoKVGhlIHBsb3Qgc3VnZ2VzdHMgdGhhdCBjYXJzIHdpdGggYmlnZ2VyIGVuZ2luZXMgZG9uJ3QgZ28gYXMgZmFyIG9uIGdhbGxvbiBvZiBnYXMuIHRoZSBTbWFsbGVyIHRoZSBlbmdpbmUgdGhlIG1vcmUgbWlsZXMgeW91IGNhbiBnZXQgcGVyIGdhbGxvbi4gCgojNC4gWzEycHRzXVNwbGl0IHRoZSBkYXRhc2V0IGluIGEgMS81IHJhdGlvIDQvNSB0byB0aGUgdHJhaW5pbmcgZGF0YSwgMS81IHRvIHRoZSB2YWxpZGF0aW9uIGRhdGEuCmZvbGxvdyB0aGUgZXhhbXBsZSBmcm9tIGNsYXNzLCB1c2UgdGhlIG5hbWVzIG1wZzQudHJhaW4gbXBnNC52YWxpZCBmb3IgdGhlIHR3byByZXN1bHRpbmcgZGF0YWZyYW1lcwptYWtlIHN1cmUgeW91IGFyZSB1c2luZyB0aGUgNCBjeWxpbmRlciBkYXRhc2V0IGV2ZXJ5d2hlcmUKYGBge3J9CgpzZXQuc2VlZCgxMykKCgpzcGxpdCA8LSAwLjgKCnJvd3MgIDwtIG5yb3cobXBnXzRjKQoKdHJhaW4uZW50cmllcyA8LSBzYW1wbGUocm93cywgcm93cypzcGxpdCkKbXBnXzRjLnRyYWluIDwtIG1wZ180Y1t0cmFpbi5lbnRyaWVzLCBdCgptcGdfNGMudmFsaWQgIDwtIG1wZ180Y1stdHJhaW4uZW50cmllcywgIF0KYGBgCgoKIzUuIFs1cHRzXUdlbmVyYXRlIHRoZSBsaW5lYXIgbW9kZWwgYW5kIHNhdmUgaXQgaW4gdGhlIG1vZGVsIHZhcmlhYmxlCmBgYHtyfQptb2RlbCA8LSBsbShId3kuTVBHfiBEaXNwbCwgZGF0YT1tcGdfNGMudHJhaW4pCgpgYGAKCgojNi4gWzJwdHNdIHVzaW5nIHN1bW1hcnkgcHJpbnQgdGhlIG1vZGVsCmBgYHtyfQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiM3LiBbNXB0c10gdXNpbmcgdGhlIGluZm9ybWF0aW9uIGZyb20gc3VtbWFyeSB3cml0ZSB0aGUgeSBoYXQgZXF1YXRpb246Cgp5aGF0ID0KCiM4LiBbMTBwdHNdIGNhbGN1bGF0ZSB0aGUgcmVzaWR1YWxzIGZvciBib3RoIHRyYWluaW5nIGFuZCB0ZXN0aW5nIGRhdGEuIHVzZSBtdXRhdGUgdG8gc3RvcmUgeWhhdCBhbmQgdGhlIHJlc2lkdWFsIGFzIGNvbHVtbnMgaW4gZWFjaCBkYXRhc2V0CgpgYGB7cn0KI3RyYWluaW5nIGRhdGEKbXBnXzRjLnRyYWluIDwtIG1wZ180Yy50cmFpbiAlPiUKICAgIG11dGF0ZSh5aGF0ID0gcHJlZGljdChtb2RlbCwgbmV3ZGF0YT1tcGdfNGMudHJhaW4pKSAlPiUKICAgIG11dGF0ZShyZXNpZHVhbCA9IEh3eS5NUEcgLSB5aGF0KQogIAojdmFsaWRhdGlvbiBkYXRhCm1wZ180Yy52YWxpZCA8LSBtcGdfNGMudmFsaWQgJT4lCiAgICBtdXRhdGUoeWhhdCA9IHByZWRpY3QobW9kZWwsIG5ld2RhdGE9bXBnXzRjLnZhbGlkKSkgJT4lCiAgICBtdXRhdGUocmVzaWR1YWwgPSBId3kuTVBHIC0geWhhdCkKICAKICAKYGBgCgojOS4gWzEwcHRzXSBtYWtlIGEgcGxvdCBvZiB5X2hhdCBvbiB0aGUgeCBheGlzIGFuZCB0aGUgcmVzaWR1YWxzIG9uIHRoZSB5IGF4aXMgdXNpbmcgZ2VvbV9qaXR0ZXIgd2l0aCBhIHdpZHRoIG9mIDEgZm9yIHRoZSB2YWxpZGF0aW9uIGRhdGEuIGluY2x1ZGUgYSBnZW9tX2hsaW5lIG9uIHk9MCBhbmQgYSBsaW1pdCB0aGUgeSBheGlzIHRvIHRoZSByYW5nZSBbLTIwLDIwXQoKYGBge3J9CgpnZ3Bsb3QobXBnXzRjLnZhbGlkLCBhZXMoeCA9IHloYXQsIHkgPSByZXNpZHVhbCkpICsKICBnZW9tX2ppdHRlcih3aWR0aCA9IDEpICsgCiAgZ2VvbV9obGluZSh5aW50ZXJjZXB0ID0gMCwgbGluZXR5cGUgPSAiZGFzaGVkIiwgY29sb3IgPSAicmVkIikgKyAKICB5bGltKC0yMCwgMjApICsKICB4bGFiKCJQcmVkaWN0ZWQgTWlsZWFnZSAobXBnKSIpICsKICB5bGFiKCJSZXNpZHVhbCAobXBnKSIpCgpgYGAKCiMxMC4gWzVwdHNdIGxvb2tpbmcgYXQgdGhpcyBncmFwaCB3aGF0IGNhbiB5b3UgdGVsbCBtZSBhYm91dCB0aGlzIHNldCBvZiByZXNpZHVhbHM6IHAxOmRvZXMgaXQgc2hvdyBhIHRyZW5kPyBwMjpob3cgaXMgdGhlIHNwcmVhZD8KTG9va2luZyBhdCB0aGlzIHNldCBvZiByZXNpZHVhbHMgaXQgc2VlbXMgYXMgdGhvdWdoIHRoZXJlIGlzIGEgbGFjayBvZiBhIHRyZW5kIGJlY2F1c2UgdGhlIGxpbmUgb2YgYmVzdCBmaXQgaXMgc21hY2sgZGFiIGluIHRoZSBtaWRkbGUgb2YgdGhlIGdyYXBoLiBzbyBpdCdzIGVxdWFsbHkgZGl2aWRlZC4gCgojMTEuIFs1cHRzXSBwMTpkYXJ3IGEgaGlzdG9ncmFtIG9mIHRoZSByZXNpZHVhbHMgLCBsaW1pdCB0aGUgeCBheGlzIHRvIC0yMCwyMCwgcDI6dmlzdWFsbHkgZXN0aW1hdGUgdGhlIHJhbmdlIHdoZXJlIG1vc3Qgb2YgdGhlIG9ic2VydmF0aW9ucyBmYWxsLgoKYGBge3J9CiAgZ2dwbG90KG1wZ180Yy52YWxpZCkgKwogIGdlb21faGlzdG9ncmFtKGFlcyh4PXJlc2lkdWFsKSwgYmlucz01MCwgY29sb3I9ImJsYWNrIikgKwogIHhsaW0oLTIwLCAyMCkgKwogIHhsYWIoIlJlc2lkdWFsIChtaWxlcyBwZXIgZ2FsbG9uKSIpCiAgY2F0KCJ0aGUgcmFuZ2UgaXMgW0EsQl0iKQpgYGAKCiMxMi4gWzVwdHNdIGFyZSB3ZSBvdmVyZml0dGluZz8gY29tcGFyZSB0aGUgc3RhbmRhcmQgZGV2aWF0aW9uIG9mIHRoZSByZXNpZHVhbHMgZnJvbSB0cmFpbmluZyBhbmQgdmFsaWRhdGlvbiBkYXRhIGFuZCB3cml0ZSB3ZXRoZXIgd2UgYXJlIG92ZXJmaXR0aW5nIG9yIG5vdCBhbmQgd2h5LgoKYGBge3J9CgpzZChtcGdfNGMudHJhaW4kcmVzaWR1YWwpCnNkKG1wZ180Yy52YWxpZCRyZXNpZHVhbCkKI292ZXJmaXR0aW5nIG9yIG5vdCBvdmVyZml0dGluZz8KCmBgYApiYXNlZCBvbiBob3cgY2xvc2UgIHRoZSBzdGFuZGFyZCBkZXZpYXRpb25zIGFuZCBjbG9zZSB0aGV5IGFyZSBJIHdvdWxkIHNheSB0aGF0CndlIGFyZSBub3Qgb3ZlcmZpdHRpbmcg