# Reset environment
rm(list=ls())

# Read data
library(readxl)
ToyotaPrices <- read_excel("C:/Users/nefst/Downloads/ToyotaPrices.xlsx")

# Subset data
dt <- subset(ToyotaPrices, select = c(Price, 
                                      Mfg_Year, 
                                      KM, 
                                      Automatic, 
                                      Weight, 
                                      ABS, 
                                      Tow_Bar))

# Convert categorical variables

dt$Automatic <- factor(dt$Automatic)
levels(levels(dt$Automatic) <- c("NO", "YES"))
## NULL
dt$ABS <- factor(dt$ABS)
levels(dt$ABS) <- c("NO", "YES")

dt$Tow_Bar <- factor(dt$Tow_Bar)
levels(dt$Tow_Bar) <- c("No", "Yes")

# NA some values
dt$KM[dt$KM < 500] <- NA

# Omit NA
dt <- na.omit(dt)

1 Scatterplot Matrix

library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.1
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
pairs(~Automatic + ABS + Tow_Bar + Mfg_Year + KM + Weight + Price, data = dt, main = "Scatterplot Matrix")

Scatterplots do not include regression lines or smoothers to show the direction of relationships.

2 Scatterplot matrix with factor information

ggplot(dt, aes(x = KM, y = Price, color = Tow_Bar)) +
  geom_point() +
  labs(title = "Price vs KM by Tow Bar")

ggplot(dt, aes(x = Mfg_Year, y = Price, color = Tow_Bar)) +
  geom_point() +
  labs(title = "Price vs Mfg_Year by Tow Bar")

ggplot(dt, aes(x = Weight, y = Price, color = Tow_Bar)) +
  geom_point() +
  labs(title = "Price vs Weight by Tow Bar")

The relationship between Price and KM is not exactly the same for both groups. Cars without a tow bar tend to show a more depreciation with KM, whereas with a tow bar, the relationship is more varied.

3 Scatterplot matrix with factor information and fitted line

ggplot(dt, aes(x = KM, y = Price, color = Tow_Bar)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Smooth: Price vs KM by Tow Bar")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

ggplot(dt, aes(x = Weight, y = Price, color = Tow_Bar)) +
  geom_point() +
  geom_smooth() +
  labs(title = "Smooth: Price vs Weight by Tow Bar")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

Smoothers help identify interactions or nonlinear effects that a simple linear model may not show as well.

4 Multiple regression model

g1 <- lm(Price ~ Mfg_Year + KM + Weight + ABS + Automatic + Tow_Bar, data = dt)
summary(g1)
## 
## Call:
## lm(formula = Price ~ Mfg_Year + KM + Weight + ABS + Automatic + 
##     Tow_Bar, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10495.6   -726.1     -3.7    722.9   6624.5 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -3.013e+06  6.975e+04 -43.201   <2e-16 ***
## Mfg_Year      1.503e+03  3.512e+01  42.808   <2e-16 ***
## KM           -2.357e-02  1.159e-03 -20.333   <2e-16 ***
## Weight        1.858e+01  8.303e-01  22.384   <2e-16 ***
## ABSYES       -2.187e+02  9.976e+01  -2.192   0.0285 *  
## AutomaticYES  2.954e+02  1.559e+02   1.895   0.0583 .  
## Tow_BarYes   -1.156e+02  8.002e+01  -1.445   0.1487    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1334 on 1418 degrees of freedom
## Multiple R-squared:  0.8585, Adjusted R-squared:  0.8579 
## F-statistic:  1434 on 6 and 1418 DF,  p-value: < 2.2e-16
yhat <- predict(g1)
cor(dt$Price, yhat)^2
## [1] 0.8584889

5 Residual Plot

plot(g1$fitted.values, g1$residuals,
     xlab = "Fitted values", ylab = "Residuals",
     main = "Residuals vs Fitted")
abline(h = 0)

6 Residuals vs explanatory variables

par(mfrow = c(2, 3))
plot(dt$Mfg_Year, g1$residuals, main = "Residuals vs Mfg_Year")
plot(dt$KM, g1$residuals, main = "Residuals vs KM")
plot(dt$Weight, g1$residuals, main = "Residuals vs Weight")
plot(dt$ABS, g1$residuals, main = "Residuals vs ABS")
plot(dt$Automatic, g1$residuals, main = "Residuals vs Automatic")
plot(dt$Tow_Bar, g1$residuals, main = "Residuals vs Tow_Bar")

par(mfrow = c(1, 1))

There may be patterns in KM and Weight, indicating those variables are not fully expressed by the linear model.

7 Comparing a big and a small model

g2 <- lm(Price ~ Mfg_Year + KM + Weight, data = dt)
summary(g2)
## 
## Call:
## lm(formula = Price ~ Mfg_Year + KM + Weight, data = dt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10479.3   -738.1    -27.6    741.6   6588.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.953e+06  6.368e+04  -46.38   <2e-16 ***
## Mfg_Year     1.473e+03  3.206e+01   45.95   <2e-16 ***
## KM          -2.403e-02  1.146e-03  -20.96   <2e-16 ***
## Weight       1.899e+01  8.190e-01   23.19   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1337 on 1421 degrees of freedom
## Multiple R-squared:  0.8575, Adjusted R-squared:  0.8572 
## F-statistic:  2849 on 3 and 1421 DF,  p-value: < 2.2e-16
cat("Adjusted R^2 for g1:", summary(g1)$adj.r.squared, "\n")
## Adjusted R^2 for g1: 0.8578902
cat("Adjusted R^2 for g2:", summary(g2)$adj.r.squared, "\n")
## Adjusted R^2 for g2: 0.8571551