Andrew Chen s3488195 Yonn April s3727210
Last updated: 01 August, 2019
cars.all <- read.csv('cars.csv', header=FALSE,
dec=".",
na.strings='?',
col.names = c("symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors", "body-styles","drive-wheels","engine-location","wheel-base","length","width", "height","curb-weight","engine-type","num-of-cylinders","engine-size","fuel-system","bore","stroke","compression-ratio","horsepower","peak-rpm","city-mpg","highway-mpg","price"))
cars <- cars.all[,c("city.mpg", "price")]
cars <- cars %>% na.omit()
stats <- c( min(cars$price,na.rm = TRUE), quantile(cars$price,probs = .25,na.rm = TRUE), median(cars$price, na.rm = TRUE), quantile(cars$price,probs = .75,na.rm = TRUE), max(cars$price,na.rm = TRUE), mean(cars$price, na.rm = TRUE), sd(cars$price, na.rm = TRUE), length(cars$price), sum(is.na(cars$price)), min(cars$city.mpg,na.rm = TRUE), quantile(cars$city.mpg,probs = .25,na.rm = TRUE), median(cars$city.mpg, na.rm = TRUE), quantile(cars$city.mpg,probs = .75,na.rm = TRUE), max(cars$city.mpg,na.rm = TRUE), mean(cars$city.mpg, na.rm = TRUE), sd(cars$city.mpg, na.rm = TRUE), length(cars$city.mpg), sum(is.na(cars$city.mpg))
)
summary <- matrix(stats, ncol = 9, byrow = TRUE)
colnames(summary) <- c("Min", "Q1", "Median", "Q3", "Max", "Mean", "SD", "n", "Missing")
rownames(summary) <- c("Price", "City MPG")knitr::kable(summary)| Min | Q1 | Median | Q3 | Max | Mean | SD | n | Missing | |
|---|---|---|---|---|---|---|---|---|---|
| Price | 5118 | 7775 | 10295 | 16500 | 45400 | 13207.1294 | 7947.066342 | 201 | 0 |
| City MPG | 13 | 19 | 24 | 30 | 49 | 25.1791 | 6.423221 | 201 | 0 |
ggplot(cars, aes(x=price, y=city.mpg)) + geom_point()a <- ggplot(cars, aes(x=price)) + geom_histogram(aes(y=..density..)) + geom_density()
b <- ggplot(cars, aes(x=city.mpg)) + geom_histogram(aes(y=..density..)) + geom_density()
grid.arrange(a, b, nrow = 1)a <- ggplot(cars, aes(x=log(price))) + geom_histogram(aes(y=..density..)) + geom_density()
b <- ggplot(cars, aes(x=log(city.mpg))) + geom_histogram(aes(y=..density..)) + geom_density()
grid.arrange(a, b, nrow = 1)ggplot(cars, aes(x=log(price), y=log(city.mpg))) + geom_point()model1 <- lm(log(city.mpg) ~ log(price), data = cars)model1 %>% summary()##
## Call:
## lm(formula = log(city.mpg) ~ log(price), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38139 -0.09863 -0.01045 0.06525 0.46221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.02840 0.19350 36.32 <2e-16 ***
## log(price) -0.41006 0.02067 -19.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1468 on 199 degrees of freedom
## Multiple R-squared: 0.6643, Adjusted R-squared: 0.6626
## F-statistic: 393.7 on 1 and 199 DF, p-value: < 2.2e-16
alpha <- model1$coefficients[["(Intercept)"]]
beta <- model1$coefficients[["log(price)"]]
model.function <- function(x) alpha + beta*xggplot(cars, aes(x=log(price), y=log(city.mpg))) + geom_point() + stat_function(fun = model.function) + xlim(8, 11)model1 %>% summary##
## Call:
## lm(formula = log(city.mpg) ~ log(price), data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38139 -0.09863 -0.01045 0.06525 0.46221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.02840 0.19350 36.32 <2e-16 ***
## log(price) -0.41006 0.02067 -19.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1468 on 199 degrees of freedom
## Multiple R-squared: 0.6643, Adjusted R-squared: 0.6626
## F-statistic: 393.7 on 1 and 199 DF, p-value: < 2.2e-16
model1 %>% summary() %>% coef()## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0283979 0.19350366 36.32178 9.413514e-90
## log(price) -0.4100568 0.02066568 -19.84240 4.753214e-49
model1 %>% confint()## 2.5 % 97.5 %
## (Intercept) 6.6468170 7.409979
## log(price) -0.4508086 -0.369305
cars$predicted <- predict(model1)
cars$residuals <- residuals(model1)
a <- ggplot(cars, aes(x = log(price), y = log(city.mpg))) +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
geom_segment(aes(xend = log(price), yend = predicted), alpha = .2) +
geom_point(aes(color = residuals)) +
scale_color_gradient2(low = "blue", mid = "white", high = "red") +
guides(color = FALSE) +
geom_point(aes(y = predicted), shape = 1) +
theme_bw()
b <- ggplot(cars, aes(x = predicted, y = residuals)) + geom_point() + geom_smooth(aes(colour = "red"), se= FALSE) +
geom_hline(yintercept=0) + theme(legend.position = "none")grid.arrange(a, b, nrow = 1)grid.arrange(a, b, c, nrow = 1)r.squared <- model1 %>% summary()
r.squared <- r.squared$r.squared
#Correlation Coefficient r
r <- cor(log(cars$city.mpg), log(cars$price), use = "complete.obs")
#r 95%CI
library(psychometric)
n <- cars$city.mpg %>% length()
print(CIr(r, n = n, level = .95))## [1] -0.8567765 -0.7626498
detach("package:psychometric", unload = TRUE)Schlimmer, J. (1987). Automobile data set. Retrieved from https://archive.ics.uci.edu/ml/datasets/automobile
Borger, B. D., & Rouwendal, J. (2014). Car user taxes, quality characteristics, and fuel efficiency: Household behaviour and market adjustment. Journal of Transport Economics and Policy. 48(3), 345-366
Alberini, A., Bareit, M., & FIlippini M., (2016). What is the effect of fuel efficiency information on car prices? Evidence from Switzerland. The Energy Journal, 37(3). 2016.