https://archive.ics.uci.edu/ml/datasets/Auto+MPG
1. mpg: continuous
2. cylinders: multi-valued discrete
3. displacement: continuous
4. horsepower: continuous
5. weight: continuous
6. acceleration: continuous
7. model year: multi-valued discrete
8. origin: multi-valued discrete
9. car name: string (unique for each instance)
library(tidyr)
# Retrieve the Auto MPG data from the UCI website. The file is read as
# tab-separated, so the space-padded numeric fields arrive together as one
# string in column V1.
car_mpg <- read.csv(
  file = "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data",
  header = FALSE,
  sep = "\t"
)

# Collapse runs of whitespace in V1 into single spaces and strip leading or
# trailing padding, so the split below yields exactly one piece per field.
car_mpg$V1 <- trimws(gsub("[[:space:]]+", " ", car_mpg$V1))

# Separate V1 into one named column per attribute.
numeric_cols <- c(
  "mpg", "cylinders", "displacement", "horsepower",
  "weight", "acceleration", "model_year", "origin"
)
car_mpg <- car_mpg %>%
  separate(V1, into = numeric_cols, sep = " ")

# separate() returns character columns, which lm() and plot() would treat as
# categories rather than numbers. Convert every field to numeric, mapping the
# dataset's "?" missing-value marker to NA first so the coercion is silent.
car_mpg[numeric_cols] <- lapply(car_mpg[numeric_cols], function(field) {
  field[field %in% "?"] <- NA_character_
  as.numeric(field)
})
There appears to be a negative correlation between mpg and horsepower.
# Model mpg as a linear function of horsepower. na.exclude leaves rows with
# missing values out of the fit but pads residuals() and fitted() with NA so
# they stay aligned with the rows of car_mpg.
car_horsepower_vs_mpg <- lm(
  mpg ~ horsepower,
  data = car_mpg,
  na.action = na.exclude
)

# Scatter plot of mpg against horsepower. Warnings are silenced for this call
# only, rather than through a global options(warn = -1) that would keep hiding
# warnings from every later statement in the session.
suppressWarnings(plot(x = car_mpg$horsepower, y = car_mpg$mpg))
From the Q-Q plot and the residuals-vs-fitted plot, the residuals appear to be approximately normally and randomly distributed.
# Draw the standard diagnostic plots for the fitted linear model.
plot(x = car_horsepower_vs_mpg)