Automobile Price Prediction Model - Simple Linear Regression

What to Do First?

Notes:

  1. Check the working directory and check the list of files
  2. Load necessary libraries
  3. The diamonds data set will be listed as a ‘Promise’ in the workspace. This is a special object in R, and you need to run a command on the data to fully load the data set.
# Point the session at the folder that holds the data files, then print the
# working directory to confirm the change.
# NOTE(review): the path is machine-specific; setwd() errors on any machine
# where this folder does not exist.
setwd("C:/Users/labadmin/OneDrive/Learning/MySlideRule - Intro to DS/R/R Datasource")
getwd()
## [1] "C:/Users/labadmin/OneDrive/Learning/MySlideRule - Intro to DS/R/R Datasource"
library(ggplot2)

#install.packages("caTools")

library(caTools)

Load Dataset and view summary

# Load the pre-processed automobile price data.
# stringsAsFactors = TRUE is spelled out because R >= 4.0 reads text columns
# as character by default, whereas the str()/summary() output recorded in this
# document shows make, fuel.type, body.style, etc. as factors.
autos <- read.csv(
  "C:/Users/labadmin/OneDrive/Learning/MySlideRule - Intro to DS/R/R Datasource/Automobile price data _Raw_transformed_v2.csv",
  stringsAsFactors = TRUE
)

# Reproducible 85/15 partition of the rows, driven by the price column.
set.seed(85)
split <- sample.split(autos[["price"]], SplitRatio = 0.85)

# TRUE entries of `split` go to the training set, FALSE entries to the test set.
autosTrain <- autos[split, ]
autosTest <- autos[!split, ]

# Number of rows in each partition.
nrow(autosTrain)
## [1] 174
nrow(autosTest)
## [1] 31

Examine the dependent variable price

# List the column names of the data set.
names(autos)
##  [1] "symboling"           "normalized.losses"   "make"               
##  [4] "fuel.type"           "aspiration"          "num.of.doors"       
##  [7] "body.style"          "drive.wheels"        "engine.location"    
## [10] "wheel.base"          "length"              "width"              
## [13] "height"              "curb.weight"         "engine.type"        
## [16] "num.of.cylinders"    "no.of.cylinders.num" "engine.size"        
## [19] "fuel.system"         "bore"                "stroke"             
## [22] "compression.ratio"   "horsepower"          "peak.rpm"           
## [25] "city.mpg"            "highway.mpg"         "price"
# Per-column overview: quantiles and mean for numeric columns, level counts
# for factor columns.
summary(autos)
##    symboling       normalized.losses         make      fuel.type  
##  Min.   :-2.0000   Min.   :  0.0     toyota    : 32   diesel: 20  
##  1st Qu.: 0.0000   1st Qu.: 77.0     nissan    : 18   gas   :185  
##  Median : 1.0000   Median :103.0     mazda     : 17               
##  Mean   : 0.8341   Mean   : 97.6     honda     : 13               
##  3rd Qu.: 2.0000   3rd Qu.:137.0     mitsubishi: 13               
##  Max.   : 3.0000   Max.   :256.0     subaru    : 12               
##                                      (Other)   :100               
##  aspiration  num.of.doors       body.style drive.wheels engine.location
##  std  :168   ?   :  2     convertible: 6   4wd:  9      front:202      
##  turbo: 37   four:114     hardtop    : 8   fwd:120      rear :  3      
##              two : 89     hatchback  :70   rwd: 76                     
##                           sedan      :96                               
##                           wagon      :25                               
##                                                                        
##                                                                        
##    wheel.base         length          width           height     
##  Min.   : 86.60   Min.   :141.1   Min.   :60.30   Min.   :47.80  
##  1st Qu.: 94.50   1st Qu.:166.3   1st Qu.:64.10   1st Qu.:52.00  
##  Median : 97.00   Median :173.2   Median :65.50   Median :54.10  
##  Mean   : 98.76   Mean   :174.0   Mean   :65.91   Mean   :53.72  
##  3rd Qu.:102.40   3rd Qu.:183.1   3rd Qu.:66.90   3rd Qu.:55.50  
##  Max.   :120.90   Max.   :208.1   Max.   :72.30   Max.   :59.80  
##                                                                  
##   curb.weight   engine.type num.of.cylinders no.of.cylinders.num
##  Min.   :1488   dohc : 12   eight :  5       Min.   : 2.00      
##  1st Qu.:2145   dohcv:  1   five  : 11       1st Qu.: 4.00      
##  Median :2414   l    : 12   four  :159       Median : 4.00      
##  Mean   :2556   ohc  :148   six   : 24       Mean   : 4.38      
##  3rd Qu.:2935   ohcf : 15   three :  1       3rd Qu.: 4.00      
##  Max.   :4066   ohcv : 13   twelve:  1       Max.   :12.00      
##                 rotor:  4   two   :  4                          
##   engine.size     fuel.system      bore           stroke     
##  Min.   : 61.0   mpfi   :94   Min.   :0.000   Min.   :0.000  
##  1st Qu.: 97.0   2bbl   :66   1st Qu.:3.130   1st Qu.:3.100  
##  Median :120.0   idi    :20   Median :3.310   Median :3.290  
##  Mean   :126.9   1bbl   :11   Mean   :3.265   Mean   :3.192  
##  3rd Qu.:141.0   spdi   : 9   3rd Qu.:3.580   3rd Qu.:3.410  
##  Max.   :326.0   4bbl   : 3   Max.   :3.940   Max.   :4.170  
##                  (Other): 2                                  
##  compression.ratio   horsepower       peak.rpm       city.mpg    
##  Min.   : 7.00     Min.   :  0.0   Min.   :   0   Min.   :13.00  
##  1st Qu.: 8.60     1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00  
##  Median : 9.00     Median : 95.0   Median :5200   Median :24.00  
##  Mean   :10.14     Mean   :103.2   Mean   :5075   Mean   :25.22  
##  3rd Qu.: 9.40     3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00  
##  Max.   :23.00     Max.   :288.0   Max.   :6600   Max.   :49.00  
##                                                                  
##   highway.mpg        price      
##  Min.   :16.00   Min.   :    0  
##  1st Qu.:25.00   1st Qu.: 7689  
##  Median :30.00   Median :10198  
##  Mean   :30.75   Mean   :12949  
##  3rd Qu.:34.00   3rd Qu.:16500  
##  Max.   :54.00   Max.   :45400  
## 
# Show each column's type together with its first few values.
str(autos)
## 'data.frame':    205 obs. of  27 variables:
##  $ symboling          : int  3 3 1 2 2 2 1 1 1 0 ...
##  $ normalized.losses  : int  0 0 0 164 164 0 158 0 158 0 ...
##  $ make               : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...
##  $ fuel.type          : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ aspiration         : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
##  $ num.of.doors       : Factor w/ 3 levels "?","four","two": 3 3 3 2 2 3 2 2 2 3 ...
##  $ body.style         : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
##  $ drive.wheels       : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
##  $ engine.location    : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wheel.base         : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ length             : num  169 169 171 177 177 ...
##  $ width              : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ height             : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curb.weight        : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ engine.type        : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
##  $ num.of.cylinders   : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
##  $ no.of.cylinders.num: int  4 4 6 4 5 5 5 5 5 5 ...
##  $ engine.size        : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuel.system        : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ bore               : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke             : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compression.ratio  : num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower         : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peak.rpm           : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ city.mpg           : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highway.mpg        : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price              : int  13495 16500 16500 13950 17450 15250 17710 18920 23875 0 ...
# Count how often each distinct price occurs.
# NOTE(review): watch the count for price 0 - presumably missing prices stored
# as zero; verify against the raw data.
table(autos$price)
## 
##     0  5118  5151  5195  5348  5389  5399  5499  5572  6095  6189  6229 
##     4     1     1     1     1     1     1     1     2     1     1     2 
##  6295  6338  6377  6479  6488  6529  6575  6649  6669  6692  6695  6785 
##     1     1     1     1     1     1     1     1     1     2     1     1 
##  6795  6849  6855  6918  6938  6989  7053  7099  7126  7129  7198  7295 
##     1     1     1     1     1     1     1     1     1     1     1     2 
##  7299  7349  7395  7463  7499  7603  7609  7689  7738  7775  7788  7799 
##     1     1     1     1     1     1     2     1     1     2     1     1 
##  7895  7898  7957  7975  7995  7999  8013  8058  8189  8195  8238  8249 
##     1     2     2     1     1     1     1     1     1     1     1     1 
##  8358  8449  8495  8499  8558  8778  8845  8921  8948  8949  9095  9233 
##     1     1     2     1     1     1     2     2     1     1     1     1 
##  9258  9279  9295  9298  9495  9538  9549  9639  9895  9959  9960  9980 
##     1     2     1     1     1     1     1     1     1     1     1     1 
##  9988  9989  9995 10198 10245 10295 10345 10595 10698 10795 10898 10945 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 11048 11199 11245 11248 11259 11549 11595 11694 11845 11850 11900 12170 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 12290 12440 12629 12764 12940 12945 12964 13200 13295 13415 13495 13499 
##     1     1     1     1     1     1     1     1     1     1     1     2 
## 13645 13845 13860 13950 14399 14489 14869 15040 15250 15510 15580 15645 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 15690 15750 15985 15998 16430 16500 16503 16515 16558 16630 16695 16845 
##     1     1     1     1     1     2     1     1     1     1     1     1 
## 16900 16925 17075 17199 17450 17669 17710 17950 18150 18280 18344 18399 
##     1     1     1     1     1     1     1     1     2     1     1     1 
## 18420 18620 18920 18950 19045 19699 20970 21105 21485 22018 22470 22625 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 23875 24565 25552 28176 28248 30760 31600 32250 32528 34028 34184 35056 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## 35550 36000 36880 37028 40960 41315 45400 
##     1     1     1     1     1     1     1
# Distribution of the dependent variable.
summary(autos$price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    7689   10200   12950   16500   45400
# Confirm that price is not stored as a factor.
is.factor(autos$price)
## [1] FALSE
# NOTE(review): the minimum of 0 looks like a placeholder for a missing price
# rather than a real price - confirm; no filtering of these rows is visible
# before the models are fitted.
range(autos$price)
## [1]     0 45400
# Price histogram with USD 1,000 bins and an x-axis tick every USD 5,000.
# binwidth is an argument of geom_histogram(); the earlier call passed it to
# ggplot(), where it was ignored - that is what produced the recorded
# "binwidth defaulted to range/30" message that follows.
ggplot(autos, aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 50000, 5000))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

# Zoom in on the USD 9,900-10,000 range with USD 25 bins.
# binwidth is set on geom_histogram(); passed to ggplot() (as before) it was
# ignored, which is why the recorded output reports the default bin width.
# Breaks are generated only inside the plotted limits; breaks outside the
# limits are never drawn, so the visible ticks are unchanged.
ggplot(autos, aes(x = price)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(9900, 10000), breaks = seq(9900, 10000, 25))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect

# Price histograms per manufacturer, laid out four panels per row.
# binwidth is set on geom_histogram(); the earlier call passed it to ggplot(),
# where it was ignored (one "binwidth defaulted" message per panel was
# recorded as a result).
ggplot(autos, aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 50000, 5000)) +
  facet_wrap(~make, ncol = 4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

NOTES

On average, most cars are priced around the USD 10k mark. Most manufacturers compete at similar price points on the lower end of the price scale. Jaguar makes few but expensive models. BMW and Porsche offer a wide price range of models. Toyota has the largest number of cars in the data set. ***

EDA

# Price of every car, placed along the x-axis by manufacturer.
ggplot(data = autos, mapping = aes(x = make, y = price)) +
  geom_point()

###FINDINGS Make of the car does not seem to influence the price of the car, but intuitively we know that it does.

# Price against number of cylinders.
# no.of.cylinders.num is an integer column, so the x-axis needs a continuous
# scale; scale_x_discrete() is intended for factor/character data and does not
# place the requested numeric breaks reliably.
ggplot(autos, aes(x = no.of.cylinders.num, y = price)) +
  geom_point() +
  scale_x_continuous(breaks = seq(2, 12, 1))

# Same plot with translucent points and a fitted linear trend line.
ggplot(autos, aes(x = no.of.cylinders.num, y = price)) +
  geom_point(alpha = 1/10) +
  scale_x_continuous(breaks = seq(2, 12, 1)) +
  geom_smooth(method = "lm", color = "red")

###FINDINGS The number of cylinders seems to influence the price. More cylinders == more expensive

# Price against horsepower: raw scatter first.
ggplot(data = autos, mapping = aes(x = horsepower, y = price)) +
  geom_point()

# Then translucent points with a linear trend line on top.
ggplot(data = autos, mapping = aes(x = horsepower, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")

# Pearson correlation between horsepower and price.
cor.test(autos$horsepower, autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$horsepower and autos$price
## t = 13.631, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6122519 0.7566454
## sample estimates:
##       cor 
## 0.6912879

FINDINGS

There is a positive correlation between price and HP

# Price against city fuel economy: raw scatter first.
ggplot(data = autos, mapping = aes(x = city.mpg, y = price)) +
  geom_point()

# Then translucent points with a linear trend line on top.
ggplot(data = autos, mapping = aes(x = city.mpg, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")

# Pearson correlation between city mpg and price.
cor.test(autos$city.mpg, autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7309491 -0.5749975
## sample estimates:
##        cor 
## -0.6600259

FINDINGS

There seems to be a negative correlation between price and city mpg

# Price against highway fuel economy.
ggplot(autos, aes(x = highway.mpg, y = price)) +
  geom_point()

# The trend plot and the correlation test previously used city.mpg (copied
# from the city mpg section); both now use highway.mpg so they match the
# scatter plot above and the finding stated for this section.
# NOTE: the cor.test output recorded after this chunk was produced by the old
# city.mpg call and has to be regenerated.
ggplot(autos, aes(x = highway.mpg, y = price)) +
  geom_point(alpha = 1/5) +
  geom_smooth(method = "lm", color = "red")

cor.test(x = autos$highway.mpg, y = autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7309491 -0.5749975
## sample estimates:
##        cor 
## -0.6600259

FINDINGS

There seems to be a negative correlation between price and highway mpg

# Price against curb weight: raw scatter, then translucent points with a
# linear trend line.
ggplot(autos, aes(x = curb.weight, y = price)) +
  geom_point()

ggplot(autos, aes(x = curb.weight, y = price)) +
  geom_point(alpha = 1/5) +
  geom_smooth(method = "lm", color = "red")

# The correlation test previously used city.mpg (copy-paste slip); it now
# tests curb.weight, the variable this section is about.
# NOTE: the cor.test output recorded after this chunk was produced by the old
# city.mpg call and has to be regenerated.
cor.test(x = autos$curb.weight, y = autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7309491 -0.5749975
## sample estimates:
##        cor 
## -0.6600259

FINDINGS

There is a positive correlation between price and curb weight

# Price of every car, placed along the x-axis by number of doors.
ggplot(data = autos, mapping = aes(x = num.of.doors, y = price)) +
  geom_point()

###FINDINGS number of doors does not seem to influence price

# Price against engine size: raw scatter first.
ggplot(data = autos, mapping = aes(x = engine.size, y = price)) +
  geom_point()

# Then translucent points with a linear trend line on top.
ggplot(data = autos, mapping = aes(x = engine.size, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")

# Pearson correlation between engine size and price.
cor.test(autos$engine.size, autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$engine.size and autos$price
## t = 21.889, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7920253 0.8746768
## sample estimates:
##       cor 
## 0.8380973

FINDINGS

Strong positive correlation between price and engine size

# Price against peak rpm over the full range of the data.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point()

# Same scatter with the x-axis limited to 4000-6100 rpm; points outside the
# limits are dropped (see the recorded warning).
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point() +
  scale_x_continuous(limits = c(4000, 6100))
## Warning: Removed 4 rows containing missing values (geom_point).

###FINDINGS Unable to determine a relationship

# Price against vehicle height.
ggplot(data = autos, mapping = aes(x = height, y = price)) +
  geom_point()

###FINDINGS Unable to determine a relationship. Doesn't seem to influence price

# Price against wheel base: raw scatter first.
ggplot(data = autos, mapping = aes(x = wheel.base, y = price)) +
  geom_point()

# Then translucent points with a linear trend line on top.
ggplot(data = autos, mapping = aes(x = wheel.base, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")

# Pearson correlation between wheel base and price.
cor.test(autos$wheel.base, autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$wheel.base and autos$price
## t = 10.113, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4798271 0.6632337
## sample estimates:
##      cor 
## 0.578804

FINDINGS

wheel base seems to positively influence price

FINDINGS

Unable to determine a relationship. Doesn't seem to influence price

# Price against peak rpm: raw scatter first.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point()

# Then translucent points with a linear trend line on top.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")

# Pearson correlation between peak rpm and price.
cor.test(autos$peak.rpm, autos$price)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$peak.rpm and autos$price
## t = -0.7888, df = 203, p-value = 0.4312
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1908671  0.0823811
## sample estimates:
##         cor 
## -0.05527795

FINDINGS

peak rpm does not influence price


Model Building - Linear Regression

# Model 1: simple linear regression of price on engine size, fitted on the
# training partition only.
model1 <- lm(price ~ engine.size, data=autosTrain)
summary(model1)
## 
## Call:
## lm(formula = price ~ engine.size, data = autosTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24968.0  -2046.1   -234.5   1614.2  13505.3 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7633.403   1024.759  -7.449 4.36e-12 ***
## engine.size   160.598      7.678  20.918  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4155 on 172 degrees of freedom
## Multiple R-squared:  0.7178, Adjusted R-squared:  0.7162 
## F-statistic: 437.6 on 1 and 172 DF,  p-value: < 2.2e-16
# Training error of model 1: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model1)^2)
print(SSE)
## [1] 2970049568
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 4131.495
# Model 2: adds the numeric cylinder count to model 1's engine size predictor.
model2 <- lm(price ~ engine.size + no.of.cylinders.num, data = autosTrain)
summary(model2)
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num, data = autosTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22384.5  -2088.8    -62.9   1569.5  13615.6 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -5489.52    1320.20  -4.158 5.07e-05 ***
## engine.size           191.53      14.42  13.281  < 2e-16 ***
## no.of.cylinders.num -1375.81     546.21  -2.519   0.0127 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4092 on 171 degrees of freedom
## Multiple R-squared:  0.7279, Adjusted R-squared:  0.7247 
## F-statistic: 228.8 on 2 and 171 DF,  p-value: < 2.2e-16
# Training error of model 2: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model2)^2)
print(SSE)
## [1] 2863796332
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 4056.92
# Model 3: adds the manufacturer (make) to model 2; the summary reports one
# coefficient per make relative to the omitted reference level.
model3 <- lm(price ~ engine.size + no.of.cylinders.num + make , data=autosTrain)
summary(model3)
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make, 
##     data = autosTrain)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24381.1  -1314.7   -116.4   1257.3  10627.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          4473.30    2522.82   1.773 0.078236 .  
## engine.size           135.28      14.53   9.313  < 2e-16 ***
## no.of.cylinders.num -1618.70     496.31  -3.261 0.001372 ** 
## makeaudi             3532.51    2276.55   1.552 0.122843    
## makebmw              5796.30    2471.19   2.346 0.020309 *  
## makechevrolet       -3158.77    3002.00  -1.052 0.294390    
## makedodge           -4101.30    2202.92  -1.862 0.064594 .  
## makehonda           -3192.93    2109.80  -1.513 0.132287    
## makeisuzu           -8859.36    2656.56  -3.335 0.001075 ** 
## makejaguar           6367.15    3399.30   1.873 0.063004 .  
## makemazda           -2723.87    2079.51  -1.310 0.192245    
## makemercedes-benz    9330.94    2430.42   3.839 0.000182 ***
## makemercury          -435.22    3725.10  -0.117 0.907147    
## makemitsubishi      -4863.55    2086.46  -2.331 0.021085 *  
## makenissan          -4036.69    2047.51  -1.972 0.050504 .  
## makepeugot           -738.48    2139.60  -0.345 0.730465    
## makeplymouth        -4491.93    2290.22  -1.961 0.051690 .  
## makeporsche          5394.82    2548.24   2.117 0.035903 *  
## makerenault         -6260.95    2944.85  -2.126 0.035135 *  
## makesaab              798.17    2357.07   0.339 0.735364    
## makesubaru          -3943.88    2096.85  -1.881 0.061929 .  
## maketoyota          -3445.12    1961.21  -1.757 0.081021 .  
## makevolkswagen      -2356.13    2113.57  -1.115 0.266734    
## makevolvo            2150.47    2147.57   1.001 0.318270    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3211 on 150 degrees of freedom
## Multiple R-squared:  0.8531, Adjusted R-squared:  0.8305 
## F-statistic: 37.86 on 23 and 150 DF,  p-value: < 2.2e-16
# Training error of model 3: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model3)^2)
print(SSE)
## [1] 1546605167
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 2981.365
# Model 4: extends model 3 with drive.wheels, body.style and engine.location.
# horsepower is deliberately left out: adding it did not improve the model,
# and it is correlated with engine.size.
model4 <- lm(
  price ~ engine.size + no.of.cylinders.num + make + drive.wheels +
    body.style + engine.location,
  data = autosTrain
)
summary(model4)
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make + 
##     drive.wheels + body.style + engine.location, data = autosTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5750.0  -976.6   -63.9   843.4  6257.5 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           2436.92    1882.82   1.294 0.197651    
## engine.size            100.90      10.23   9.865  < 2e-16 ***
## no.of.cylinders.num   -336.35     336.64  -0.999 0.319416    
## makeaudi              6438.73    1707.52   3.771 0.000237 ***
## makebmw               6424.48    1741.27   3.690 0.000319 ***
## makechevrolet           25.38    2072.10   0.012 0.990243    
## makedodge             -613.95    1638.19  -0.375 0.708386    
## makehonda              183.08    1582.68   0.116 0.908073    
## makeisuzu            -6720.96    1856.38  -3.620 0.000407 ***
## makejaguar            7003.05    2319.58   3.019 0.003003 ** 
## makemazda              652.84    1547.07   0.422 0.673673    
## makemercedes-benz    10963.88    1674.75   6.547 9.83e-10 ***
## makemercury           1877.57    2490.92   0.754 0.452230    
## makemitsubishi        -708.44    1604.80  -0.441 0.659556    
## makenissan           -1063.18    1547.49  -0.687 0.493176    
## makepeugot             961.63    1568.50   0.613 0.540791    
## makeplymouth         -1259.13    1665.79  -0.756 0.450965    
## makeporsche         -19636.59    2579.95  -7.611 3.35e-12 ***
## makerenault          -1814.53    2104.49  -0.862 0.390012    
## makesaab              4954.88    1750.87   2.830 0.005326 ** 
## makesubaru            -979.26    1613.78  -0.607 0.544938    
## maketoyota           -1072.70    1430.12  -0.750 0.454441    
## makevolkswagen         844.13    1563.88   0.540 0.590198    
## makevolvo             3471.93    1562.75   2.222 0.027876 *  
## drive.wheelsfwd      -1236.47     885.27  -1.397 0.164663    
## drive.wheelsrwd       1457.50    1063.02   1.371 0.172495    
## body.stylehardtop    -2702.14    1244.51  -2.171 0.031562 *  
## body.stylehatchback  -2049.29    1083.97  -1.891 0.060709 .  
## body.stylesedan      -1557.51    1069.10  -1.457 0.147355    
## body.stylewagon      -1478.80    1156.26  -1.279 0.202987    
## engine.locationrear  34515.51    2548.74  13.542  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2047 on 143 degrees of freedom
## Multiple R-squared:  0.9431, Adjusted R-squared:  0.9311 
## F-statistic: 78.97 on 30 and 143 DF,  p-value: < 2.2e-16
# Training error of model 4: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model4)^2)
print(SSE)
## [1] 599182064
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 1855.687
# Check how strongly engine size and horsepower are correlated (full data set).
cor.test(autos$engine.size,autos$horsepower)
## 
##  Pearson's product-moment correlation
## 
## data:  autos$engine.size and autos$horsepower
## t = 17.851, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.7218551 0.8297436
## sample estimates:
##      cor 
## 0.781577
# Model 5: model 4 plus vehicle length.
model5 <- lm(
  price ~ engine.size + no.of.cylinders.num + make + drive.wheels +
    body.style + engine.location + length,
  data = autosTrain
)
summary(model5)
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make + 
##     drive.wheels + body.style + engine.location + length, data = autosTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5744.6 -1176.6   -34.8   847.6  5880.4 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -12482.12    5449.60  -2.290 0.023467 *  
## engine.size             74.71      13.44   5.560 1.30e-07 ***
## no.of.cylinders.num     10.44     349.20   0.030 0.976193    
## makeaudi              5482.46    1696.86   3.231 0.001534 ** 
## makebmw               6417.76    1697.59   3.781 0.000230 ***
## makechevrolet         1340.80    2070.16   0.648 0.518239    
## makedodge              122.50    1617.06   0.076 0.939718    
## makehonda              789.51    1557.01   0.507 0.612893    
## makeisuzu            -5747.37    1840.53  -3.123 0.002172 ** 
## makejaguar            7884.21    2281.61   3.456 0.000725 ***
## makemazda              578.93    1508.48   0.384 0.701714    
## makemercedes-benz    10631.29    1636.74   6.495 1.30e-09 ***
## makemercury           1652.65    2429.66   0.680 0.497488    
## makemitsubishi        -477.61    1566.56  -0.305 0.760905    
## makenissan            -711.12    1513.52  -0.470 0.639187    
## makepeugot             144.24    1554.78   0.093 0.926217    
## makeplymouth          -804.57    1631.51  -0.493 0.622672    
## makeporsche         -19318.58    2517.60  -7.673 2.45e-12 ***
## makerenault          -1979.25    2052.48  -0.964 0.336526    
## makesaab              3421.46    1786.57   1.915 0.057489 .  
## makesubaru            -766.12    1575.01  -0.486 0.627417    
## maketoyota           -1127.21    1394.37  -0.808 0.420213    
## makevolkswagen         657.60    1526.00   0.431 0.667170    
## makevolvo             2837.50    1539.09   1.844 0.067324 .  
## drive.wheelsfwd      -1520.85     868.58  -1.751 0.082114 .  
## drive.wheelsrwd        953.46    1050.76   0.907 0.365730    
## body.stylehardtop    -3043.54    1218.96  -2.497 0.013673 *  
## body.stylehatchback  -2629.79    1075.47  -2.445 0.015700 *  
## body.stylesedan      -2786.70    1124.76  -2.478 0.014400 *  
## body.stylewagon      -3193.67    1272.23  -2.510 0.013185 *  
## engine.locationrear  35325.95    2500.39  14.128  < 2e-16 ***
## length                 103.74      35.68   2.908 0.004228 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1996 on 142 degrees of freedom
## Multiple R-squared:  0.9463, Adjusted R-squared:  0.9345 
## F-statistic: 80.68 on 31 and 142 DF,  p-value: < 2.2e-16
# Training error of model 5: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model5)^2)
print(SSE)
## [1] 565514780
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 1802.799
# FINDINGS: compared with model4, R^2 rose from 0.9431 to 0.9463 and adjusted
# R^2 from 0.9311 to 0.9345, while the training RMSE dropped from 1855.687 to
# 1802.799.
# Test for multicollinearity: pairwise correlations between candidate
# predictors, computed on the training rows.
cor(autosTrain$engine.size,autosTrain$city.mpg)
## [1] -0.6857895
cor(autosTrain$engine.size, autosTrain$highway.mpg)
## [1] -0.7040139
cor(autosTrain$engine.size, autosTrain$horsepower)
## [1] 0.7800674
cor(autosTrain$engine.size,autosTrain$curb.weight)
## [1] 0.8453962
cor(autosTrain$engine.size,autosTrain$length)
## [1] 0.6681608
cor(autosTrain$curb.weight,autosTrain$city.mpg)
## [1] -0.767168
cor(autosTrain$curb.weight,autosTrain$highway.mpg)
## [1] -0.803512
# Engine size vs. cylinder count, with a significance test.
cor.test(autosTrain$engine.size,autosTrain$no.of.cylinders.num)
## 
##  Pearson's product-moment correlation
## 
## data:  autosTrain$engine.size and autosTrain$no.of.cylinders.num
## t = 21.3, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8047081 0.8878321
## sample estimates:
##       cor 
## 0.8515353
# Correlation between curb weight and length on the training rows.
cor.test(autosTrain$curb.weight,autosTrain$length)
## 
##  Pearson's product-moment correlation
## 
## data:  autosTrain$curb.weight and autosTrain$length
## t = 23.985, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8380192 0.9076899
## sample estimates:
##       cor 
## 0.8774016
# NOTE(review): this repeats the engine.size / no.of.cylinders.num test that
# was already run earlier in this section with identical output; one of the
# two calls can be dropped.
cor.test(autosTrain$engine.size,autosTrain$no.of.cylinders.num)
## 
##  Pearson's product-moment correlation
## 
## data:  autosTrain$engine.size and autosTrain$no.of.cylinders.num
## t = 21.3, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8047081 0.8878321
## sample estimates:
##       cor 
## 0.8515353
# Collinearity findings from the correlations above:
#   - horsepower and engine size are highly correlated
#   - length and curb.weight are correlated
#   - curb weight and mileage (city/highway mpg) are highly correlated
#   - engine size and no.of.cylinders are highly correlated
# horsepower therefore stays out of the model; model 6 is model 5 plus
# city.mpg.

model6 <- lm(
  price ~ engine.size + no.of.cylinders.num + make + drive.wheels +
    body.style + engine.location + length + city.mpg,
  data = autosTrain
)
summary(model6)
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make + 
##     drive.wheels + body.style + engine.location + length + city.mpg, 
##     data = autosTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5524.9 -1074.8     0.0   821.9  6007.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4597.59    6260.41  -0.734 0.463929    
## engine.size             68.19      13.48   5.059 1.29e-06 ***
## no.of.cylinders.num     20.01     343.32   0.058 0.953596    
## makeaudi              5153.18    1673.65   3.079 0.002497 ** 
## makebmw               6614.87    1670.85   3.959 0.000119 ***
## makechevrolet         2103.17    2059.12   1.021 0.308817    
## makedodge              -88.86    1592.09  -0.056 0.955571    
## makehonda              765.18    1530.72   0.500 0.617937    
## makeisuzu            -5445.55    1813.65  -3.003 0.003168 ** 
## makejaguar            8634.54    2264.11   3.814 0.000204 ***
## makemazda              682.76    1483.59   0.460 0.646074    
## makemercedes-benz    11530.38    1650.92   6.984 1.04e-10 ***
## makemercury           1626.73    2388.61   0.681 0.496965    
## makemitsubishi        -716.90    1543.20  -0.465 0.642968    
## makenissan            -589.22    1488.78  -0.396 0.692870    
## makepeugot             775.05    1550.30   0.500 0.617899    
## makeplymouth          -860.24    1604.09  -0.536 0.592612    
## makeporsche         -19264.81    2475.14  -7.783 1.37e-12 ***
## makerenault          -2050.49    2017.99  -1.016 0.311320    
## makesaab              3186.34    1759.01   1.811 0.072202 .  
## makesubaru            -808.90    1548.48  -0.522 0.602224    
## maketoyota            -836.43    1375.99  -0.608 0.544245    
## makevolkswagen         708.88    1500.35   0.472 0.637316    
## makevolvo             3190.83    1520.02   2.099 0.037581 *  
## drive.wheelsfwd      -1085.73     872.41  -1.245 0.215373    
## drive.wheelsrwd        990.29    1033.10   0.959 0.339420    
## body.stylehardtop    -2738.50    1204.88  -2.273 0.024548 *  
## body.stylehatchback  -2380.89    1062.22  -2.241 0.026561 *  
## body.stylesedan      -2397.67    1117.23  -2.146 0.033579 *  
## body.stylewagon      -2857.04    1258.34  -2.270 0.024694 *  
## engine.locationrear  35132.15    2459.40  14.285  < 2e-16 ***
## length                  74.36      37.10   2.005 0.046923 *  
## city.mpg              -106.74      43.85  -2.434 0.016169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1962 on 141 degrees of freedom
## Multiple R-squared:  0.9484, Adjusted R-squared:  0.9367 
## F-statistic: 81.05 on 32 and 141 DF,  p-value: < 2.2e-16
# Training error of model 6: sum of squared residuals, then the root mean
# squared error over the training rows.
SSE <- sum(residuals(model6)^2)
print(SSE)
## [1] 542705641
RMSE <- sqrt(SSE / nrow(autosTrain))
print(RMSE)
## [1] 1766.068
# Findings: select model6 as the baseline.
# Print the full coefficient table and fit statistics for model6.
print(summary(model6))
## 
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make + 
##     drive.wheels + body.style + engine.location + length + city.mpg, 
##     data = autosTrain)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5524.9 -1074.8     0.0   821.9  6007.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4597.59    6260.41  -0.734 0.463929    
## engine.size             68.19      13.48   5.059 1.29e-06 ***
## no.of.cylinders.num     20.01     343.32   0.058 0.953596    
## makeaudi              5153.18    1673.65   3.079 0.002497 ** 
## makebmw               6614.87    1670.85   3.959 0.000119 ***
## makechevrolet         2103.17    2059.12   1.021 0.308817    
## makedodge              -88.86    1592.09  -0.056 0.955571    
## makehonda              765.18    1530.72   0.500 0.617937    
## makeisuzu            -5445.55    1813.65  -3.003 0.003168 ** 
## makejaguar            8634.54    2264.11   3.814 0.000204 ***
## makemazda              682.76    1483.59   0.460 0.646074    
## makemercedes-benz    11530.38    1650.92   6.984 1.04e-10 ***
## makemercury           1626.73    2388.61   0.681 0.496965    
## makemitsubishi        -716.90    1543.20  -0.465 0.642968    
## makenissan            -589.22    1488.78  -0.396 0.692870    
## makepeugot             775.05    1550.30   0.500 0.617899    
## makeplymouth          -860.24    1604.09  -0.536 0.592612    
## makeporsche         -19264.81    2475.14  -7.783 1.37e-12 ***
## makerenault          -2050.49    2017.99  -1.016 0.311320    
## makesaab              3186.34    1759.01   1.811 0.072202 .  
## makesubaru            -808.90    1548.48  -0.522 0.602224    
## maketoyota            -836.43    1375.99  -0.608 0.544245    
## makevolkswagen         708.88    1500.35   0.472 0.637316    
## makevolvo             3190.83    1520.02   2.099 0.037581 *  
## drive.wheelsfwd      -1085.73     872.41  -1.245 0.215373    
## drive.wheelsrwd        990.29    1033.10   0.959 0.339420    
## body.stylehardtop    -2738.50    1204.88  -2.273 0.024548 *  
## body.stylehatchback  -2380.89    1062.22  -2.241 0.026561 *  
## body.stylesedan      -2397.67    1117.23  -2.146 0.033579 *  
## body.stylewagon      -2857.04    1258.34  -2.270 0.024694 *  
## engine.locationrear  35132.15    2459.40  14.285  < 2e-16 ***
## length                  74.36      37.10   2.005 0.046923 *  
## city.mpg              -106.74      43.85  -2.434 0.016169 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1962 on 141 degrees of freedom
## Multiple R-squared:  0.9484, Adjusted R-squared:  0.9367 
## F-statistic: 81.05 on 32 and 141 DF,  p-value: < 2.2e-16
# Average price in the training set, used as a yardstick for the RMSE.
print(mean(autosTrain[["price"]]))
## [1] 12764.43
# model6 is still the best model on the training data. On average it is off by about USD 1,766 (the training RMSE computed above), where the average training price is USD 12,764.

Model testing

# Score the held-out test rows with model1 and print the predicted prices
# (the names of the printed vector are the row labels of autosTest).
predictTest <- predict(object = model1, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 13404.972  9711.212 18704.716 25931.639 25931.639  6820.443  8105.229 
##        34        47        48        54        56        57        58 
##  7141.639 11477.793 33800.955  6981.041  3608.477  3608.477  3608.477 
##        69        80        95        96       100       106       108 
## 21756.083  8105.229  7944.631  7944.631 11638.391 21434.887 11638.391 
##       121       126       136       158       159       168       172 
##  6820.443 16616.938 11798.990  8105.229 10032.408 15813.947 15813.947 
##       188       196       201 
##  7944.631 15010.955 15010.955
# Out-of-sample R-squared for model1: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.6355735
# Score the held-out test rows with model2 and print the predicted prices.
predictTest <- predict(object = model2, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 12721.834  9692.458 17666.507 26285.347 26285.347  6244.922  7777.161 
##        34        47        48        54        56        57        58 
##  6627.982 11799.286 35670.306  6436.452  5165.946  5165.946  5165.946 
##        69        80        95        96       100       106       108 
## 22681.382  7777.161  7585.631  7585.631 11990.816 20922.513 11990.816 
##       121       126       136       158       159       168       172 
##  6244.922 17928.239 12182.346  7777.161 10075.518 16970.590 16970.590 
##       188       196       201 
##  7585.631 16012.941 16012.941
# Out-of-sample R-squared for model2: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.6675685
# Score the held-out test rows with model3 and print the predicted prices.
predictTest <- predict(object = model3, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 17634.478 18405.441 22743.931 28831.696 28831.696  7015.264  7155.011 
##        34        47        48        54        56        57        58 
##  7251.671  5237.907 36031.448  7585.451  7981.889  7981.889  7981.889 
##        69        80        95        96       100       106       108 
## 30467.665  6392.760  7084.329  7084.329 10195.853 15210.762 13494.064 
##       121       126       136       158       159       168       172 
##  5682.111 23821.158 15166.000  7811.189  9434.593 14304.805 14304.805 
##       188       196       201 
##  8764.892 19223.980 19223.980
# Out-of-sample R-squared for model3: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.7227817
# Score the held-out test rows with model4 and print the predicted prices.
# NOTE(review): the recorded output shows a negative predicted price for test
# row 126 -- worth investigating before trusting this model out of sample.
predictTest <- predict(object = model4, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 18362.226 18312.963 23290.543 27830.952 27830.952  6911.951  7079.804 
##        34        47        48        54        56        57        58 
##  7271.439  5785.615 33353.530  8132.087  8888.119  8888.119  8888.119 
##        69        80        95        96       100       106       108 
## 30162.065  6985.316  7021.464  6529.678  8850.331 17026.369 14060.892 
##       121       126       136       158       159       168       172 
##  5627.433 -3901.280 15461.072  6621.056  8323.617 13505.279 14158.127 
##       188       196       201 
##  8928.770 18768.755 18690.048
# Out-of-sample R-squared for model4: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.4796795
# Score the held-out test rows with model5 and print the predicted prices.
# NOTE(review): the recorded output shows a negative predicted price for test
# row 126 -- worth investigating before trusting this model out of sample.
predictTest <- predict(object = model5, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 18696.667 18554.570 22759.076 27884.571 28216.553  7647.213  7171.823 
##        34        47        48        54        56        57        58 
##  6633.251  6932.407 33613.413  7933.952  9203.671  9203.671  9203.671 
##        69        80        95        96       100       106       108 
## 29437.425  6571.708  6936.534  7124.566  9652.046 16424.292 14204.605 
##       121       126       136       158       159       168       172 
##  5647.088 -4632.014 15071.855  6855.810  7595.393 13529.401 13943.155 
##       188       196       201 
##  8969.223 18277.612 18684.588
# Out-of-sample R-squared for model5: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.4718531
# Score the held-out test rows with model6 and print the predicted prices.
# NOTE(review): the recorded output shows a negative predicted price for test
# row 126 -- worth investigating before trusting this model out of sample.
predictTest <- predict(object = model6, newdata = autosTest)
print(predictTest)
##        10        12        14        17        18        20        24 
## 18750.821 18746.437 22818.424 27684.696 28029.394  7792.885  7744.794 
##        34        47        48        54        56        57        58 
##  7006.434  7033.790 33583.587  7981.605 10260.371 10260.371 10260.371 
##        69        80        95        96       100       106       108 
## 29491.878  7116.750  7007.202  7046.284  9621.567 16763.638 14888.010 
##       121       126       136       158       159       168       172 
##  5680.750 -4344.929 15070.546  7026.047  7400.568 13394.064 13751.668 
##       188       196       201 
##  8140.793 18005.548 18464.924
# Out-of-sample R-squared for model6: 1 - SSE/SST, where SST is measured
# around the mean price of the test set itself.
SSE <- sum((autosTest[["price"]] - predictTest)^2)
SST <- sum((autosTest[["price"]] - mean(autosTest[["price"]]))^2)
print(1 - SSE / SST)
## [1] 0.4801799