Is Being Eco Friendly Really Expensive?

Using Linear Regression Model To Predict Fuel Economy using Price

Andrew Chen s3488195 Yonn April s3727210

Last updated: 01 August, 2019

Introduction

Problem Statement

Data

Descriptive Statistics and Visualisation

cars.all <- read.csv('cars.csv', header=FALSE, 
                 dec=".", 
                 na.strings='?', 
                 col.names = c("symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors", "body-styles","drive-wheels","engine-location","wheel-base","length","width", "height","curb-weight","engine-type","num-of-cylinders","engine-size","fuel-system","bore","stroke","compression-ratio","horsepower","peak-rpm","city-mpg","highway-mpg","price"))

cars <- cars.all[,c("city.mpg", "price")]
cars <- cars %>% na.omit()

stats <- c( min(cars$price,na.rm = TRUE), quantile(cars$price,probs = .25,na.rm = TRUE), median(cars$price, na.rm = TRUE), quantile(cars$price,probs = .75,na.rm = TRUE), max(cars$price,na.rm = TRUE), mean(cars$price, na.rm = TRUE), sd(cars$price, na.rm = TRUE), length(cars$price), sum(is.na(cars$price)), min(cars$city.mpg,na.rm = TRUE), quantile(cars$city.mpg,probs = .25,na.rm = TRUE), median(cars$city.mpg, na.rm = TRUE), quantile(cars$city.mpg,probs = .75,na.rm = TRUE), max(cars$city.mpg,na.rm = TRUE), mean(cars$city.mpg, na.rm = TRUE), sd(cars$city.mpg, na.rm = TRUE), length(cars$city.mpg), sum(is.na(cars$city.mpg))
)
summary <- matrix(stats, ncol = 9, byrow = TRUE)
colnames(summary) <- c("Min", "Q1", "Median",   "Q3",   "Max",  "Mean", "SD",   "n",    "Missing")
rownames(summary) <- c("Price", "City MPG")
knitr::kable(summary)
Min Q1 Median Q3 Max Mean SD n Missing
Price 5118 7775 10295 16500 45400 13207.1294 7947.066342 201 0
City MPG 13 19 24 30 49 25.1791 6.423221 201 0

Descriptive Statistics and Visualisation cont.

ggplot(cars, aes(x=price, y=city.mpg)) + geom_point()

Descriptive Statistics and Visualisation cont.

a <- ggplot(cars, aes(x=price)) + geom_histogram(aes(y=..density..)) + geom_density()
b <- ggplot(cars, aes(x=city.mpg)) + geom_histogram(aes(y=..density..)) + geom_density()
grid.arrange(a, b, nrow = 1)

Descriptive Statistics and Visualisation cont.

a <- ggplot(cars, aes(x=log(price))) + geom_histogram(aes(y=..density..)) + geom_density()
b <- ggplot(cars, aes(x=log(city.mpg))) + geom_histogram(aes(y=..density..)) + geom_density()
grid.arrange(a, b, nrow = 1)

Descriptive Statistics and Visualisation cont.

ggplot(cars, aes(x=log(price), y=log(city.mpg))) + geom_point()

Descriptive Statistics and Visualisation cont.

model1 <- lm(log(city.mpg) ~ log(price), data = cars)
model1 %>% summary()
## 
## Call:
## lm(formula = log(city.mpg) ~ log(price), data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38139 -0.09863 -0.01045  0.06525  0.46221 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.02840    0.19350   36.32   <2e-16 ***
## log(price)  -0.41006    0.02067  -19.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1468 on 199 degrees of freedom
## Multiple R-squared:  0.6643, Adjusted R-squared:  0.6626 
## F-statistic: 393.7 on 1 and 199 DF,  p-value: < 2.2e-16

Descriptive Statistics and Visualisation cont.

alpha <- model1$coefficients[["(Intercept)"]]
beta <- model1$coefficients[["log(price)"]]
model.function <- function(x) alpha + beta*x
ggplot(cars, aes(x=log(price), y=log(city.mpg))) + geom_point() + stat_function(fun = model.function) + xlim(8, 11)

Hypothesis Testing

model1 %>% summary
## 
## Call:
## lm(formula = log(city.mpg) ~ log(price), data = cars)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38139 -0.09863 -0.01045  0.06525  0.46221 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.02840    0.19350   36.32   <2e-16 ***
## log(price)  -0.41006    0.02067  -19.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1468 on 199 degrees of freedom
## Multiple R-squared:  0.6643, Adjusted R-squared:  0.6626 
## F-statistic: 393.7 on 1 and 199 DF,  p-value: < 2.2e-16

Hypothesis Testing Cont.

model1 %>% summary() %>% coef()
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  7.0283979 0.19350366  36.32178 9.413514e-90
## log(price)  -0.4100568 0.02066568 -19.84240 4.753214e-49
model1 %>% confint()
##                  2.5 %    97.5 %
## (Intercept)  6.6468170  7.409979
## log(price)  -0.4508086 -0.369305

Testing Assumptions - Normality of Residuals

cars$predicted <- predict(model1)
cars$residuals <- residuals(model1)

a <- ggplot(cars, aes(x = log(price), y = log(city.mpg))) +
  geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
  geom_segment(aes(xend = log(price), yend = predicted), alpha = .2) +
  geom_point(aes(color = residuals)) +  
  scale_color_gradient2(low = "blue", mid = "white", high = "red") +  
  guides(color = FALSE) +
  geom_point(aes(y = predicted), shape = 1) +
  theme_bw()
b <- ggplot(cars, aes(x = predicted, y = residuals)) + geom_point() +  geom_smooth(aes(colour = "red"), se= FALSE) + 
 geom_hline(yintercept=0) + theme(legend.position = "none")
grid.arrange(a, b, nrow = 1)

Testing Assumptions - Normality of Residuals, Homoscedasticity and Influential Cases

grid.arrange(a, b, c, nrow = 1)

Correlation coefficient

r.squared <- model1 %>% summary()
r.squared <- r.squared$r.squared

#Correlation Coefficient r
r <- cor(log(cars$city.mpg), log(cars$price), use = "complete.obs")

#r 95%CI
library(psychometric)
n <- cars$city.mpg %>% length()
print(CIr(r, n = n, level = .95))
## [1] -0.8567765 -0.7626498
detach("package:psychometric", unload = TRUE)

Results

Discussion

References