#Linear regession Homework [75pts] let’s analyze the correlation between displacement and Highway miles per gallon but this time we will only consider cars with 4 cylinder engines.

Setup block

#reset workspace
rm(list = ls())
#load libraries
library(dplyr)
library(ggplot2)
#load the data
mpg <- read.csv("https://tinyurl.com/didampg2022")

#0. [1pts] rename this file to include your name

#1. [5pts] p1:using dplyr functions create a new dataset called mpg_4c that only includes cars with 4 cylinder engines. p2:we will use this dataframe for the rest of the homework so double check using head or View or table that only 4 cilinder cars are in the dataset mpg_4c

#p1
mpg_4c <- mpg %>% filter(Cyl == 4)

#p2
head(mpg_4c)

#2. [10pts] p1:using geom_jitter plot the displacement on the x and the highway mpg on the y axis. p2: add appropriate axis labels. make sure you are using the mpg_4c dataset

ggplot(mpg_4c) +
  geom_jitter(aes(x=Displ, y= Hwy.MPG), width=0.5) +
  xlab("Engine Displacement (L)") +
  ylab("Highway MPG")

#3. [5pts] Describe this trend.

The plot suggests that cars with bigger engines don’t go as far on gallon of gas. the Smaller the engine the more miles you can get per gallon.

#4. [12pts]Split the dataset in a 1/5 ratio 4/5 to the training data, 1/5 to the validation data. follow the example from class, use the names mpg4.train mpg4.valid for the two resulting dataframes make sure you are using the 4 cylinder dataset everywhere


set.seed(13)


split <- 0.8

rows  <- nrow(mpg_4c)

train.entries <- sample(rows, rows*split)
mpg_4c.train <- mpg_4c[train.entries, ]

mpg_4c.valid  <- mpg_4c[-train.entries,  ]

#5. [5pts]Generate the linear model and save it in the model variable

model <- lm(Hwy.MPG~ Displ, data=mpg_4c.train)

#6. [2pts] using summary print the model

summary(model)

#7. [5pts] using the information from summary write the y hat equation:

yhat =

#8. [10pts] calculate the residuals for both training and testing data. use mutate to store yhat and the residual as columns in each dataset

#training data
mpg_4c.train <- mpg_4c.train %>%
    mutate(yhat = predict(model, newdata=mpg_4c.train)) %>%
    mutate(residual = Hwy.MPG - yhat)
  
#validation data
mpg_4c.valid <- mpg_4c.valid %>%
    mutate(yhat = predict(model, newdata=mpg_4c.valid)) %>%
    mutate(residual = Hwy.MPG - yhat)
  
  

#9. [10pts] make a plot of y_hat on the x axis and the residuals on the y axis using geom_jitter with a width of 1 for the validation data. include a geom_hline on y=0 and a limit the y axis to the range [-20,20]


ggplot(mpg_4c.valid, aes(x = yhat, y = residual)) +
  geom_jitter(width = 1) + 
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") + 
  ylim(-20, 20) +
  xlab("Predicted Mileage (mpg)") +
  ylab("Residual (mpg)")

#10. [5pts] looking at this graph what can you tell me about this set of residuals: p1:does it show a trend? p2:how is the spread? Looking at this set of residuals it seems as though there is a lack of a trend because the line of best fit is smack dab in the middle of the graph. so it’s equally divided.

#11. [5pts] p1:darw a histogram of the residuals , limit the x axis to -20,20, p2:visually estimate the range where most of the observations fall.

  ggplot(mpg_4c.valid) +
  geom_histogram(aes(x=residual), bins=50, color="black") +
  xlim(-20, 20) +
  xlab("Residual (miles per gallon)")
  cat("the range is [A,B]")

#12. [5pts] are we overfitting? compare the standard deviation of the residuals from training and validation data and write wether we are overfitting or not and why.


sd(mpg_4c.train$residual)
sd(mpg_4c.valid$residual)
#overfitting or not overfitting?

based on how close the standard deviations and close they are I would say that we are not overfitting

LS0tCnRpdGxlOiAiTGluZWFyIHJlZ3Jlc3Npb24iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCiNMaW5lYXIgcmVnZXNzaW9uIEhvbWV3b3JrIFs3NXB0c10KbGV0J3MgYW5hbHl6ZSB0aGUgY29ycmVsYXRpb24gYmV0d2VlbiBkaXNwbGFjZW1lbnQgYW5kIEhpZ2h3YXkgbWlsZXMgcGVyIGdhbGxvbiBidXQgdGhpcyB0aW1lIHdlIHdpbGwgb25seSBjb25zaWRlciBjYXJzIHdpdGggNCBjeWxpbmRlciBlbmdpbmVzLgoKU2V0dXAgYmxvY2sKYGBge3IsIHdhcm5pbmc9RkFMU0UsbWVzc2FnZT1GQUxTRX0KI3Jlc2V0IHdvcmtzcGFjZQpybShsaXN0ID0gbHMoKSkKI2xvYWQgbGlicmFyaWVzCmxpYnJhcnkoZHBseXIpCmxpYnJhcnkoZ2dwbG90MikKI2xvYWQgdGhlIGRhdGEKbXBnIDwtIHJlYWQuY3N2KCJodHRwczovL3Rpbnl1cmwuY29tL2RpZGFtcGcyMDIyIikKYGBgCgojMC4gWzFwdHNdIHJlbmFtZSB0aGlzIGZpbGUgdG8gaW5jbHVkZSB5b3VyIG5hbWUKCiMxLiBbNXB0c10gcDE6dXNpbmcgZHBseXIgZnVuY3Rpb25zIGNyZWF0ZSBhIG5ldyBkYXRhc2V0IGNhbGxlZCBtcGdfNGMgdGhhdCBvbmx5IGluY2x1ZGVzIGNhcnMgd2l0aCA0IGN5bGluZGVyIGVuZ2luZXMuIHAyOndlIHdpbGwgdXNlIHRoaXMgZGF0YWZyYW1lIGZvciB0aGUgcmVzdCBvZiB0aGUgaG9tZXdvcmsgc28gZG91YmxlIGNoZWNrIHVzaW5nIGhlYWQgb3IgVmlldyBvciB0YWJsZSB0aGF0IG9ubHkgNCBjaWxpbmRlciBjYXJzIGFyZSBpbiB0aGUgZGF0YXNldCBtcGdfNGMKYGBge3J9CiNwMQptcGdfNGMgPC0gbXBnICU+JSBmaWx0ZXIoQ3lsID09IDQpCgojcDIKaGVhZChtcGdfNGMpCmBgYAoKIzIuIFsxMHB0c10gcDE6dXNpbmcgZ2VvbV9qaXR0ZXIgcGxvdCB0aGUgZGlzcGxhY2VtZW50IG9uIHRoZSB4IGFuZCB0aGUgaGlnaHdheSBtcGcgb24gdGhlIHkgYXhpcy4gcDI6IGFkZCBhcHByb3ByaWF0ZSBheGlzIGxhYmVscy4gbWFrZSBzdXJlIHlvdSBhcmUgdXNpbmcgdGhlIG1wZ180YyBkYXRhc2V0CmBgYHtyfQpnZ3Bsb3QobXBnXzRjKSArCiAgZ2VvbV9qaXR0ZXIoYWVzKHg9RGlzcGwsIHk9IEh3eS5NUEcpLCB3aWR0aD0wLjUpICsKICB4bGFiKCJFbmdpbmUgRGlzcGxhY2VtZW50IChMKSIpICsKICB5bGFiKCJIaWdod2F5IE1QRyIpCgpgYGAKIzMuIFs1cHRzXSBEZXNjcmliZSB0aGlzIHRyZW5kLgoKVGhlIHBsb3Qgc3VnZ2VzdHMgdGhhdCBjYXJzIHdpdGggYmlnZ2VyIGVuZ2luZXMgZG9uJ3QgZ28gYXMgZmFyIG9uIGdhbGxvbiBvZiBnYXMuIHRoZSBTbWFsbGVyIHRoZSBlbmdpbmUgdGhlIG1vcmUgbWlsZXMgeW91IGNhbiBnZXQgcGVyIGdhbGxvbi4gCgojNC4gWzEycHRzXVNwbGl0IHRoZSBkYXRhc2V0IGluIGEgMS81IHJhdGlvIDQvNSB0byB0aGUgdHJhaW5pbmcgZGF0YSwgMS81IHRvIHRoZSB2YWxpZGF0aW9uIGRhdGEuCmZvbGxvdyB0aGUgZXhhbXBsZSBmcm9tIGNsYXNzLCB1c2UgdGhlIG5hbWVzIG1wZzQudHJhaW4gbXBnNC52YWxpZCBmb3IgdGhlIHR3byByZXN1bHRpbmcgZGF0YWZyYW1lcwptYWtlIHN1cmUgeW91IGFyZSB1c2luZyB0aGUgNCBjeWxpbmRlciBkYXRhc2V0IGV2ZXJ5d2hlcmUKYGBge3J9CgpzZXQuc2VlZCgxMykKCgpzcGxpdCA8LSAwLjgKCnJvd3MgIDwtIG5yb3cobXBnXzRjKQoKdHJhaW4uZW50cmllcyA8LSBzYW1wbGUocm93cywgcm93cypzcGxpdCkKbXBnXzRjLnRyYWluIDwtIG1wZ180Y1t0cmFpbi5lbnRyaWVzLCBdCgptcGdfNGMudmFsaWQgIDwtIG1wZ180Y1stdHJhaW4uZW50cmllcywgIF0KYGBgCgoKIzUuIFs1cHRzXUdlbmVyYXRlIHRoZSBsaW5lYXIgbW9kZWwgYW5kIHNhdmUgaXQgaW4gdGhlIG1vZGVsIHZhcmlhYmxlCmBgYHtyfQptb2RlbCA8LSBsbShId3kuTVBHfiBEaXNwbCwgZGF0YT1tcGdfNGMudHJhaW4pCgpgYGAKCgojNi4gWzJwdHNdIHVzaW5nIHN1bW1hcnkgcHJpbnQgdGhlIG1vZGVsCmBgYHtyfQpzdW1tYXJ5KG1vZGVsKQpgYGAKCiM3LiBbNXB0c10gdXNpbmcgdGhlIGluZm9ybWF0aW9uIGZyb20gc3VtbWFyeSB3cml0ZSB0aGUgeSBoYXQgZXF1YXRpb246Cgp5aGF0ID0KCiM4LiBbMTBwdHNdIGNhbGN1bGF0ZSB0aGUgcmVzaWR1YWxzIGZvciBib3RoIHRyYWluaW5nIGFuZCB0ZXN0aW5nIGRhdGEuIHVzZSBtdXRhdGUgdG8gc3RvcmUgeWhhdCBhbmQgdGhlIHJlc2lkdWFsIGFzIGNvbHVtbnMgaW4gZWFjaCBkYXRhc2V0CgpgYGB7cn0KI3RyYWluaW5nIGRhdGEKbXBnXzRjLnRyYWluIDwtIG1wZ180Yy50cmFpbiAlPiUKICAgIG11dGF0ZSh5aGF0ID0gcHJlZGljdChtb2RlbCwgbmV3ZGF0YT1tcGdfNGMudHJhaW4pKSAlPiUKICAgIG11dGF0ZShyZXNpZHVhbCA9IEh3eS5NUEcgLSB5aGF0KQogIAojdmFsaWRhdGlvbiBkYXRhCm1wZ180Yy52YWxpZCA8LSBtcGdfNGMudmFsaWQgJT4lCiAgICBtdXRhdGUoeWhhdCA9IHByZWRpY3QobW9kZWwsIG5ld2RhdGE9bXBnXzRjLnZhbGlkKSkgJT4lCiAgICBtdXRhdGUocmVzaWR1YWwgPSBId3kuTVBHIC0geWhhdCkKICAKICAKYGBgCgojOS4gWzEwcHRzXSBtYWtlIGEgcGxvdCBvZiB5X2hhdCBvbiB0aGUgeCBheGlzIGFuZCB0aGUgcmVzaWR1YWxzIG9uIHRoZSB5IGF4aXMgdXNpbmcgZ2VvbV9qaXR0ZXIgd2l0aCBhIHdpZHRoIG9mIDEgZm9yIHRoZSB2YWxpZGF0aW9uIGRhdGEuIGluY2x1ZGUgYSBnZW9tX2hsaW5lIG9uIHk9MCBhbmQgYSBsaW1pdCB0aGUgeSBheGlzIHRvIHRoZSByYW5nZSBbLTIwLDIwXQoKYGBge3J9CgpnZ3Bsb3QobXBnXzRjLnZhbGlkLCBhZXMoeCA9IHloYXQsIHkgPSByZXNpZHVhbCkpICsKICBnZW9tX2ppdHRlcih3aWR0aCA9IDEpICsgCiAgZ2VvbV9obGluZSh5aW50ZXJjZXB0ID0gMCwgbGluZXR5cGUgPSAiZGFzaGVkIiwgY29sb3IgPSAicmVkIikgKyAKICB5bGltKC0yMCwgMjApICsKICB4bGFiKCJQcmVkaWN0ZWQgTWlsZWFnZSAobXBnKSIpICsKICB5bGFiKCJSZXNpZHVhbCAobXBnKSIpCgpgYGAKCiMxMC4gWzVwdHNdIGxvb2tpbmcgYXQgdGhpcyBncmFwaCB3aGF0IGNhbiB5b3UgdGVsbCBtZSBhYm91dCB0aGlzIHNldCBvZiByZXNpZHVhbHM6IHAxOmRvZXMgaXQgc2hvdyBhIHRyZW5kPyBwMjpob3cgaXMgdGhlIHNwcmVhZD8KTG9va2luZyBhdCB0aGlzIHNldCBvZiByZXNpZHVhbHMgaXQgc2VlbXMgYXMgdGhvdWdoIHRoZXJlIGlzIGEgbGFjayBvZiBhIHRyZW5kIGJlY2F1c2UgdGhlIGxpbmUgb2YgYmVzdCBmaXQgaXMgc21hY2sgZGFiIGluIHRoZSBtaWRkbGUgb2YgdGhlIGdyYXBoLiBzbyBpdCdzIGVxdWFsbHkgZGl2aWRlZC4gCgojMTEuIFs1cHRzXSBwMTpkYXJ3IGEgaGlzdG9ncmFtIG9mIHRoZSByZXNpZHVhbHMgLCBsaW1pdCB0aGUgeCBheGlzIHRvIC0yMCwyMCwgcDI6dmlzdWFsbHkgZXN0aW1hdGUgdGhlIHJhbmdlIHdoZXJlIG1vc3Qgb2YgdGhlIG9ic2VydmF0aW9ucyBmYWxsLgoKYGBge3J9CiAgZ2dwbG90KG1wZ180Yy52YWxpZCkgKwogIGdlb21faGlzdG9ncmFtKGFlcyh4PXJlc2lkdWFsKSwgYmlucz01MCwgY29sb3I9ImJsYWNrIikgKwogIHhsaW0oLTIwLCAyMCkgKwogIHhsYWIoIlJlc2lkdWFsIChtaWxlcyBwZXIgZ2FsbG9uKSIpCiAgY2F0KCJ0aGUgcmFuZ2UgaXMgW0EsQl0iKQpgYGAKCiMxMi4gWzVwdHNdIGFyZSB3ZSBvdmVyZml0dGluZz8gY29tcGFyZSB0aGUgc3RhbmRhcmQgZGV2aWF0aW9uIG9mIHRoZSByZXNpZHVhbHMgZnJvbSB0cmFpbmluZyBhbmQgdmFsaWRhdGlvbiBkYXRhIGFuZCB3cml0ZSB3ZXRoZXIgd2UgYXJlIG92ZXJmaXR0aW5nIG9yIG5vdCBhbmQgd2h5LgoKYGBge3J9CgpzZChtcGdfNGMudHJhaW4kcmVzaWR1YWwpCnNkKG1wZ180Yy52YWxpZCRyZXNpZHVhbCkKI292ZXJmaXR0aW5nIG9yIG5vdCBvdmVyZml0dGluZz8KCmBgYApiYXNlZCBvbiBob3cgY2xvc2UgIHRoZSBzdGFuZGFyZCBkZXZpYXRpb25zIGFuZCBjbG9zZSB0aGV5IGFyZSBJIHdvdWxkIHNheSB0aGF0CndlIGFyZSBub3Qgb3ZlcmZpdHRpbmcg