# Reset environment
rm(list=ls())
# Read data
library(readxl)
ToyotaPrices <- read_excel("C:/Users/nefst/Downloads/ToyotaPrices.xlsx")
# Subset data
dt <- subset(ToyotaPrices, select = c(Price,
Mfg_Year,
KM,
Automatic,
Weight,
ABS,
Tow_Bar))
# Convert categorical variables
dt$Automatic <- factor(dt$Automatic)
levels(levels(dt$Automatic) <- c("NO", "YES"))
## NULL
dt$ABS <- factor(dt$ABS)
levels(dt$ABS) <- c("NO", "YES")
dt$Tow_Bar <- factor(dt$Tow_Bar)
levels(dt$Tow_Bar) <- c("No", "Yes")
# NA some values
dt$KM[dt$KM < 500] <- NA
# Omit NA
dt <- na.omit(dt)
1 Scatterplot Matrix
library(ggplot2)
library(GGally)
## Warning: package 'GGally' was built under R version 4.5.1
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
pairs(~Automatic + ABS + Tow_Bar + Mfg_Year + KM + Weight + Price, data = dt, main = "Scatterplot Matrix")

Scatterplots do not include regression lines or smoothers to show
the direction of relationships.
4 Multiple regression model
g1 <- lm(Price ~ Mfg_Year + KM + Weight + ABS + Automatic + Tow_Bar, data = dt)
summary(g1)
##
## Call:
## lm(formula = Price ~ Mfg_Year + KM + Weight + ABS + Automatic +
## Tow_Bar, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10495.6 -726.1 -3.7 722.9 6624.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.013e+06 6.975e+04 -43.201 <2e-16 ***
## Mfg_Year 1.503e+03 3.512e+01 42.808 <2e-16 ***
## KM -2.357e-02 1.159e-03 -20.333 <2e-16 ***
## Weight 1.858e+01 8.303e-01 22.384 <2e-16 ***
## ABSYES -2.187e+02 9.976e+01 -2.192 0.0285 *
## AutomaticYES 2.954e+02 1.559e+02 1.895 0.0583 .
## Tow_BarYes -1.156e+02 8.002e+01 -1.445 0.1487
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1334 on 1418 degrees of freedom
## Multiple R-squared: 0.8585, Adjusted R-squared: 0.8579
## F-statistic: 1434 on 6 and 1418 DF, p-value: < 2.2e-16
yhat <- predict(g1)
cor(dt$Price, yhat)^2
## [1] 0.8584889
5 Residual Plot
plot(g1$fitted.values, g1$residuals,
xlab = "Fitted values", ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0)

6 Residuals vs explanatory variables
par(mfrow = c(2, 3))
plot(dt$Mfg_Year, g1$residuals, main = "Residuals vs Mfg_Year")
plot(dt$KM, g1$residuals, main = "Residuals vs KM")
plot(dt$Weight, g1$residuals, main = "Residuals vs Weight")
plot(dt$ABS, g1$residuals, main = "Residuals vs ABS")
plot(dt$Automatic, g1$residuals, main = "Residuals vs Automatic")
plot(dt$Tow_Bar, g1$residuals, main = "Residuals vs Tow_Bar")

par(mfrow = c(1, 1))
There may be patterns in KM and Weight, indicating those variables
are not fully expressed by the linear model.
7 Comparing a big and a small model
g2 <- lm(Price ~ Mfg_Year + KM + Weight, data = dt)
summary(g2)
##
## Call:
## lm(formula = Price ~ Mfg_Year + KM + Weight, data = dt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10479.3 -738.1 -27.6 741.6 6588.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.953e+06 6.368e+04 -46.38 <2e-16 ***
## Mfg_Year 1.473e+03 3.206e+01 45.95 <2e-16 ***
## KM -2.403e-02 1.146e-03 -20.96 <2e-16 ***
## Weight 1.899e+01 8.190e-01 23.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1337 on 1421 degrees of freedom
## Multiple R-squared: 0.8575, Adjusted R-squared: 0.8572
## F-statistic: 2849 on 3 and 1421 DF, p-value: < 2.2e-16
cat("Adjusted R^2 for g1:", summary(g1)$adj.r.squared, "\n")
## Adjusted R^2 for g1: 0.8578902
cat("Adjusted R^2 for g2:", summary(g2)$adj.r.squared, "\n")
## Adjusted R^2 for g2: 0.8571551