Notes:
# Directory that holds the raw data file.
# NOTE(review): machine-specific absolute path; it is now defined once so that
# anyone else running the script has a single line to edit.
data_dir <- "C:/Users/labadmin/OneDrive/Learning/MySlideRule - Intro to DS/R/R Datasource"
setwd(data_dir)
getwd()
## [1] "C:/Users/labadmin/OneDrive/Learning/MySlideRule - Intro to DS/R/R Datasource"
library(ggplot2)
#install.packages("caTools")
library(caTools)
# stringsAsFactors = TRUE keeps text columns as factors on every R version;
# the str()/summary() output recorded further down shows Factor columns, which
# is only the read.csv() default before R 4.0.0.
autos <- read.csv(
  file.path(data_dir, "Automobile price data _Raw_transformed_v2.csv"),
  stringsAsFactors = TRUE
)
# Fixed seed so the train/test partition is repeatable.
set.seed(85)
# Logical vector over the rows of autos: TRUE marks the 85% training share.
split <- sample.split(autos$price, SplitRatio = 0.85)
autosTrain <- subset(autos, split == TRUE)
autosTest <- subset(autos, split == FALSE)
# Row counts of the training and test partitions.
dim(autosTrain)[1L]
## [1] 174
dim(autosTest)[1L]
## [1] 31
# Column names of the full data set.
colnames(autos)
## [1] "symboling" "normalized.losses" "make"
## [4] "fuel.type" "aspiration" "num.of.doors"
## [7] "body.style" "drive.wheels" "engine.location"
## [10] "wheel.base" "length" "width"
## [13] "height" "curb.weight" "engine.type"
## [16] "num.of.cylinders" "no.of.cylinders.num" "engine.size"
## [19] "fuel.system" "bore" "stroke"
## [22] "compression.ratio" "horsepower" "peak.rpm"
## [25] "city.mpg" "highway.mpg" "price"
# Per-column summary of the full data set.
# NOTE(review): the recorded output shows a minimum of 0 for price, horsepower,
# bore, stroke, peak.rpm and normalized.losses, and a "?" level in
# num.of.doors - these look like placeholders for missing values; confirm.
summary(object = autos)
## symboling normalized.losses make fuel.type
## Min. :-2.0000 Min. : 0.0 toyota : 32 diesel: 20
## 1st Qu.: 0.0000 1st Qu.: 77.0 nissan : 18 gas :185
## Median : 1.0000 Median :103.0 mazda : 17
## Mean : 0.8341 Mean : 97.6 honda : 13
## 3rd Qu.: 2.0000 3rd Qu.:137.0 mitsubishi: 13
## Max. : 3.0000 Max. :256.0 subaru : 12
## (Other) :100
## aspiration num.of.doors body.style drive.wheels engine.location
## std :168 ? : 2 convertible: 6 4wd: 9 front:202
## turbo: 37 four:114 hardtop : 8 fwd:120 rear : 3
## two : 89 hatchback :70 rwd: 76
## sedan :96
## wagon :25
##
##
## wheel.base length width height
## Min. : 86.60 Min. :141.1 Min. :60.30 Min. :47.80
## 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.10 1st Qu.:52.00
## Median : 97.00 Median :173.2 Median :65.50 Median :54.10
## Mean : 98.76 Mean :174.0 Mean :65.91 Mean :53.72
## 3rd Qu.:102.40 3rd Qu.:183.1 3rd Qu.:66.90 3rd Qu.:55.50
## Max. :120.90 Max. :208.1 Max. :72.30 Max. :59.80
##
## curb.weight engine.type num.of.cylinders no.of.cylinders.num
## Min. :1488 dohc : 12 eight : 5 Min. : 2.00
## 1st Qu.:2145 dohcv: 1 five : 11 1st Qu.: 4.00
## Median :2414 l : 12 four :159 Median : 4.00
## Mean :2556 ohc :148 six : 24 Mean : 4.38
## 3rd Qu.:2935 ohcf : 15 three : 1 3rd Qu.: 4.00
## Max. :4066 ohcv : 13 twelve: 1 Max. :12.00
## rotor: 4 two : 4
## engine.size fuel.system bore stroke
## Min. : 61.0 mpfi :94 Min. :0.000 Min. :0.000
## 1st Qu.: 97.0 2bbl :66 1st Qu.:3.130 1st Qu.:3.100
## Median :120.0 idi :20 Median :3.310 Median :3.290
## Mean :126.9 1bbl :11 Mean :3.265 Mean :3.192
## 3rd Qu.:141.0 spdi : 9 3rd Qu.:3.580 3rd Qu.:3.410
## Max. :326.0 4bbl : 3 Max. :3.940 Max. :4.170
## (Other): 2
## compression.ratio horsepower peak.rpm city.mpg
## Min. : 7.00 Min. : 0.0 Min. : 0 Min. :13.00
## 1st Qu.: 8.60 1st Qu.: 70.0 1st Qu.:4800 1st Qu.:19.00
## Median : 9.00 Median : 95.0 Median :5200 Median :24.00
## Mean :10.14 Mean :103.2 Mean :5075 Mean :25.22
## 3rd Qu.: 9.40 3rd Qu.:116.0 3rd Qu.:5500 3rd Qu.:30.00
## Max. :23.00 Max. :288.0 Max. :6600 Max. :49.00
##
## highway.mpg price
## Min. :16.00 Min. : 0
## 1st Qu.:25.00 1st Qu.: 7689
## Median :30.00 Median :10198
## Mean :30.75 Mean :12949
## 3rd Qu.:34.00 3rd Qu.:16500
## Max. :54.00 Max. :45400
##
# Compact view of column types and the first few values of each column.
str(object = autos)
## 'data.frame': 205 obs. of 27 variables:
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ normalized.losses : int 0 0 0 164 164 0 158 0 158 0 ...
## $ make : Factor w/ 22 levels "alfa-romero",..: 1 1 1 2 2 2 2 2 2 2 ...
## $ fuel.type : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ aspiration : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
## $ num.of.doors : Factor w/ 3 levels "?","four","two": 3 3 3 2 2 3 2 2 2 3 ...
## $ body.style : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
## $ drive.wheels : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
## $ engine.location : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
## $ wheel.base : num 88.6 88.6 94.5 99.8 99.4 ...
## $ length : num 169 169 171 177 177 ...
## $ width : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ height : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curb.weight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ engine.type : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
## $ num.of.cylinders : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
## $ no.of.cylinders.num: int 4 4 6 4 5 5 5 5 5 5 ...
## $ engine.size : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuel.system : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ bore : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compression.ratio : num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
## $ peak.rpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ city.mpg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highway.mpg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : int 13495 16500 16500 13950 17450 15250 17710 18920 23875 0 ...
# Frequency of every distinct price value.
table(autos[["price"]])
##
## 0 5118 5151 5195 5348 5389 5399 5499 5572 6095 6189 6229
## 4 1 1 1 1 1 1 1 2 1 1 2
## 6295 6338 6377 6479 6488 6529 6575 6649 6669 6692 6695 6785
## 1 1 1 1 1 1 1 1 1 2 1 1
## 6795 6849 6855 6918 6938 6989 7053 7099 7126 7129 7198 7295
## 1 1 1 1 1 1 1 1 1 1 1 2
## 7299 7349 7395 7463 7499 7603 7609 7689 7738 7775 7788 7799
## 1 1 1 1 1 1 2 1 1 2 1 1
## 7895 7898 7957 7975 7995 7999 8013 8058 8189 8195 8238 8249
## 1 2 2 1 1 1 1 1 1 1 1 1
## 8358 8449 8495 8499 8558 8778 8845 8921 8948 8949 9095 9233
## 1 1 2 1 1 1 2 2 1 1 1 1
## 9258 9279 9295 9298 9495 9538 9549 9639 9895 9959 9960 9980
## 1 2 1 1 1 1 1 1 1 1 1 1
## 9988 9989 9995 10198 10245 10295 10345 10595 10698 10795 10898 10945
## 1 1 1 1 1 1 1 1 1 1 1 1
## 11048 11199 11245 11248 11259 11549 11595 11694 11845 11850 11900 12170
## 1 1 1 1 1 1 1 1 1 1 1 1
## 12290 12440 12629 12764 12940 12945 12964 13200 13295 13415 13495 13499
## 1 1 1 1 1 1 1 1 1 1 1 2
## 13645 13845 13860 13950 14399 14489 14869 15040 15250 15510 15580 15645
## 1 1 1 1 1 1 1 1 1 1 1 1
## 15690 15750 15985 15998 16430 16500 16503 16515 16558 16630 16695 16845
## 1 1 1 1 1 2 1 1 1 1 1 1
## 16900 16925 17075 17199 17450 17669 17710 17950 18150 18280 18344 18399
## 1 1 1 1 1 1 1 1 2 1 1 1
## 18420 18620 18920 18950 19045 19699 20970 21105 21485 22018 22470 22625
## 1 1 1 1 1 1 1 1 1 1 1 1
## 23875 24565 25552 28176 28248 30760 31600 32250 32528 34028 34184 35056
## 1 1 1 1 1 1 1 1 1 1 1 1
## 35550 36000 36880 37028 40960 41315 45400
## 1 1 1 1 1 1 1
# Quartiles and mean of price.
summary(autos[["price"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 7689 10200 12950 16500 45400
# Check whether price was read as a factor rather than a number.
is.factor(autos[["price"]])
## [1] FALSE
# Smallest and largest price.
range(autos[["price"]])
## [1] 0 45400
#plot and adjust price histogram
# Bins are USD 1,000 wide; x-axis ticks every USD 5,000.
# binwidth belongs to geom_histogram(). Passed to ggplot() it was ignored,
# which is why the recorded run reports "binwidth defaulted to range/30".
ggplot(data = autos, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 50000, 5000))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
#examine the USD 10,000 mark
# Zoom to USD 9,900-10,000 with USD 25 bins and ticks every USD 25.
# binwidth is set on geom_histogram(); on ggplot() it had no effect.
ggplot(data = autos, mapping = aes(x = price)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(9900, 10000), breaks = seq(0, 10000, 25))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## Warning: position_stack requires constant width: output may be incorrect
#facet wrap by make
# One USD 1,000-bin price histogram per make, laid out in 4 columns.
# binwidth is set on geom_histogram(); on ggplot() it had no effect, which is
# why the recorded run printed the "binwidth defaulted" message once per panel.
ggplot(data = autos, mapping = aes(x = price)) +
  geom_histogram(binwidth = 1000) +
  scale_x_continuous(breaks = seq(0, 50000, 5000)) +
  facet_wrap(~make, ncol = 4)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
On average most cars are priced around the USD 10k mark. Most manufacturers compete at similar price points on the lower end of the price scale. Jaguar makes few but expensive models. BMW and Porsche offer a wide price range of models. Toyota makes the largest number of cars. ***
# Price of every car, grouped along the x-axis by make.
ggplot(data = autos, mapping = aes(x = make, y = price)) +
  geom_point()
###FINDINGS Make of the car does not seem to influence the price of the car, but intuitively we know that it does.
# Cylinder count vs price.
# no.of.cylinders.num is an integer column, so the axis is a continuous scale
# with a tick at every whole number from 2 to 12. The previous
# scale_x_discrete() call does not suit a numeric variable with numeric breaks.
ggplot(data = autos, mapping = aes(x = no.of.cylinders.num, y = price)) +
  geom_point() +
  scale_x_continuous(breaks = seq(2, 12, 1))
# Same plot with faded points and a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = no.of.cylinders.num, y = price)) +
  geom_point(alpha = 1/10) +
  scale_x_continuous(breaks = seq(2, 12, 1)) +
  geom_smooth(method = 'lm', color = 'red')
###FINDINGS The number of cylinders seems to influence the price. More cylinders == more expensive
# Horsepower vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = horsepower, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = horsepower, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")
# Pearson correlation test between horsepower and price over all rows.
cor.test(x = autos$horsepower, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$horsepower and autos$price
## t = 13.631, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6122519 0.7566454
## sample estimates:
## cor
## 0.6912879
There is a positive correlation between price and HP
# City mpg vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = city.mpg, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = city.mpg, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")
# Pearson correlation test between city mpg and price over all rows.
cor.test(x = autos$city.mpg, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7309491 -0.5749975
## sample estimates:
## cor
## -0.6600259
Seems to be a negative correlation between price and city mpg
# Highway mpg vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = highway.mpg, y = price)) +
  geom_point()
# Faded points with a straight-line fit drawn in red.
# This plot and the correlation test below previously used city.mpg (copied
# from the city-mpg section), so highway mpg was never actually tested.
ggplot(data = autos, mapping = aes(x = highway.mpg, y = price)) +
  geom_point(alpha = 1/5) +
  geom_smooth(method = 'lm', color = 'red')
# NOTE(review): the recorded output that follows was produced by the old
# city.mpg call and must be regenerated.
cor.test(x = autos$highway.mpg, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7309491 -0.5749975
## sample estimates:
## cor
## -0.6600259
Seems to be a negative correlation between price and highway mpg
# Curb weight vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = curb.weight, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = curb.weight, y = price)) +
  geom_point(alpha = 1/5) +
  geom_smooth(method = 'lm', color = 'red')
# Test curb weight itself. The call previously tested city.mpg (copy-paste),
# so the recorded output that follows shows the city-mpg correlation.
# NOTE(review): regenerate that output.
cor.test(x = autos$curb.weight, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$city.mpg and autos$price
## t = -12.518, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7309491 -0.5749975
## sample estimates:
## cor
## -0.6600259
There is a positive correlation between price and curb weight
# Number of doors (categorical) vs price.
ggplot(data = autos, mapping = aes(x = num.of.doors, y = price)) +
  geom_point()
###FINDINGS number of doors does not seem to influence price
# Engine size vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = engine.size, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = engine.size, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")
# Pearson correlation test between engine size and price over all rows.
cor.test(x = autos$engine.size, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$engine.size and autos$price
## t = 21.889, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7920253 0.8746768
## sample estimates:
## cor
## 0.8380973
Strong positive correlation between price and engine size
# Peak rpm vs price over the full x range.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point()
# Same scatter restricted to 4000-6100 rpm; points outside the limits are
# dropped, which is what the recorded warning below reports.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point() +
  scale_x_continuous(limits = c(4000, 6100))
## Warning: Removed 4 rows containing missing values (geom_point).
###FINDINGS Unable to determine a relationship
# Height vs price.
ggplot(data = autos, mapping = aes(x = height, y = price)) +
  geom_point()
###FINDINGS Unable to determine a relationship. Doesn't seem to influence price
# Wheel base vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = wheel.base, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = wheel.base, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")
# Pearson correlation test between wheel base and price over all rows.
cor.test(x = autos$wheel.base, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$wheel.base and autos$price
## t = 10.113, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4798271 0.6632337
## sample estimates:
## cor
## 0.578804
wheel base seems to positively influence price
Unable to determine a relationship. Doesn't seem to influence price
# Peak rpm vs price: plain scatter first.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point()
# Then faded points with a straight-line fit drawn in red.
ggplot(data = autos, mapping = aes(x = peak.rpm, y = price)) +
  geom_point(alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red")
# Pearson correlation test between peak rpm and price over all rows.
cor.test(x = autos$peak.rpm, y = autos$price)
##
## Pearson's product-moment correlation
##
## data: autos$peak.rpm and autos$price
## t = -0.7888, df = 203, p-value = 0.4312
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1908671 0.0823811
## sample estimates:
## cor
## -0.05527795
peak rpm does not influence price
# Model 1: price regressed on engine size alone, fitted on the training rows.
model1 <- lm(formula = price ~ engine.size, data = autosTrain)
summary(object = model1)
##
## Call:
## lm(formula = price ~ engine.size, data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24968.0 -2046.1 -234.5 1614.2 13505.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7633.403 1024.759 -7.449 4.36e-12 ***
## engine.size 160.598 7.678 20.918 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4155 on 172 degrees of freedom
## Multiple R-squared: 0.7178, Adjusted R-squared: 0.7162
## F-statistic: 437.6 on 1 and 172 DF, p-value: < 2.2e-16
# Training error of model1: sum of squared residuals ...
SSE <- sum(residuals(model1)^2)
SSE
## [1] 2970049568
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 4131.495
# Model 2: model1 plus the numeric cylinder count.
model2 <- lm(
  formula = price ~ engine.size + no.of.cylinders.num,
  data = autosTrain
)
summary(object = model2)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num, data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22384.5 -2088.8 -62.9 1569.5 13615.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5489.52 1320.20 -4.158 5.07e-05 ***
## engine.size 191.53 14.42 13.281 < 2e-16 ***
## no.of.cylinders.num -1375.81 546.21 -2.519 0.0127 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4092 on 171 degrees of freedom
## Multiple R-squared: 0.7279, Adjusted R-squared: 0.7247
## F-statistic: 228.8 on 2 and 171 DF, p-value: < 2.2e-16
# Training error of model2: sum of squared residuals ...
SSE <- sum(residuals(model2)^2)
SSE
## [1] 2863796332
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 4056.92
# Model 3: model2 plus make (a factor, so one coefficient per non-baseline make).
model3 <- lm(
  formula = price ~ engine.size + no.of.cylinders.num + make,
  data = autosTrain
)
summary(object = model3)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make,
## data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24381.1 -1314.7 -116.4 1257.3 10627.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4473.30 2522.82 1.773 0.078236 .
## engine.size 135.28 14.53 9.313 < 2e-16 ***
## no.of.cylinders.num -1618.70 496.31 -3.261 0.001372 **
## makeaudi 3532.51 2276.55 1.552 0.122843
## makebmw 5796.30 2471.19 2.346 0.020309 *
## makechevrolet -3158.77 3002.00 -1.052 0.294390
## makedodge -4101.30 2202.92 -1.862 0.064594 .
## makehonda -3192.93 2109.80 -1.513 0.132287
## makeisuzu -8859.36 2656.56 -3.335 0.001075 **
## makejaguar 6367.15 3399.30 1.873 0.063004 .
## makemazda -2723.87 2079.51 -1.310 0.192245
## makemercedes-benz 9330.94 2430.42 3.839 0.000182 ***
## makemercury -435.22 3725.10 -0.117 0.907147
## makemitsubishi -4863.55 2086.46 -2.331 0.021085 *
## makenissan -4036.69 2047.51 -1.972 0.050504 .
## makepeugot -738.48 2139.60 -0.345 0.730465
## makeplymouth -4491.93 2290.22 -1.961 0.051690 .
## makeporsche 5394.82 2548.24 2.117 0.035903 *
## makerenault -6260.95 2944.85 -2.126 0.035135 *
## makesaab 798.17 2357.07 0.339 0.735364
## makesubaru -3943.88 2096.85 -1.881 0.061929 .
## maketoyota -3445.12 1961.21 -1.757 0.081021 .
## makevolkswagen -2356.13 2113.57 -1.115 0.266734
## makevolvo 2150.47 2147.57 1.001 0.318270
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3211 on 150 degrees of freedom
## Multiple R-squared: 0.8531, Adjusted R-squared: 0.8305
## F-statistic: 37.86 on 23 and 150 DF, p-value: < 2.2e-16
# Training error of model3: sum of squared residuals ...
SSE <- sum(residuals(model3)^2)
SSE
## [1] 1546605167
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 2981.365
# Model 4: model3 plus drive.wheels, body.style and engine.location.
# Horsepower is deliberately not in the formula: per the author's note, adding
# it does not improve the model and it is correlated with engine size.
model4 <- lm(
  formula = price ~ engine.size + no.of.cylinders.num + make +
    drive.wheels + body.style + engine.location,
  data = autosTrain
)
summary(object = model4)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make +
## drive.wheels + body.style + engine.location, data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5750.0 -976.6 -63.9 843.4 6257.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2436.92 1882.82 1.294 0.197651
## engine.size 100.90 10.23 9.865 < 2e-16 ***
## no.of.cylinders.num -336.35 336.64 -0.999 0.319416
## makeaudi 6438.73 1707.52 3.771 0.000237 ***
## makebmw 6424.48 1741.27 3.690 0.000319 ***
## makechevrolet 25.38 2072.10 0.012 0.990243
## makedodge -613.95 1638.19 -0.375 0.708386
## makehonda 183.08 1582.68 0.116 0.908073
## makeisuzu -6720.96 1856.38 -3.620 0.000407 ***
## makejaguar 7003.05 2319.58 3.019 0.003003 **
## makemazda 652.84 1547.07 0.422 0.673673
## makemercedes-benz 10963.88 1674.75 6.547 9.83e-10 ***
## makemercury 1877.57 2490.92 0.754 0.452230
## makemitsubishi -708.44 1604.80 -0.441 0.659556
## makenissan -1063.18 1547.49 -0.687 0.493176
## makepeugot 961.63 1568.50 0.613 0.540791
## makeplymouth -1259.13 1665.79 -0.756 0.450965
## makeporsche -19636.59 2579.95 -7.611 3.35e-12 ***
## makerenault -1814.53 2104.49 -0.862 0.390012
## makesaab 4954.88 1750.87 2.830 0.005326 **
## makesubaru -979.26 1613.78 -0.607 0.544938
## maketoyota -1072.70 1430.12 -0.750 0.454441
## makevolkswagen 844.13 1563.88 0.540 0.590198
## makevolvo 3471.93 1562.75 2.222 0.027876 *
## drive.wheelsfwd -1236.47 885.27 -1.397 0.164663
## drive.wheelsrwd 1457.50 1063.02 1.371 0.172495
## body.stylehardtop -2702.14 1244.51 -2.171 0.031562 *
## body.stylehatchback -2049.29 1083.97 -1.891 0.060709 .
## body.stylesedan -1557.51 1069.10 -1.457 0.147355
## body.stylewagon -1478.80 1156.26 -1.279 0.202987
## engine.locationrear 34515.51 2548.74 13.542 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2047 on 143 degrees of freedom
## Multiple R-squared: 0.9431, Adjusted R-squared: 0.9311
## F-statistic: 78.97 on 30 and 143 DF, p-value: < 2.2e-16
# Training error of model4: sum of squared residuals ...
SSE <- sum(residuals(model4)^2)
SSE
## [1] 599182064
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 1855.687
# Correlation between engine size and horsepower over all rows.
cor.test(x = autos$engine.size, y = autos$horsepower)
##
## Pearson's product-moment correlation
##
## data: autos$engine.size and autos$horsepower
## t = 17.851, df = 203, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7218551 0.8297436
## sample estimates:
## cor
## 0.781577
# Model 5: model4 plus vehicle length.
model5 <- lm(
  formula = price ~ engine.size + no.of.cylinders.num + make +
    drive.wheels + body.style + engine.location + length,
  data = autosTrain
)
summary(object = model5)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make +
## drive.wheels + body.style + engine.location + length, data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5744.6 -1176.6 -34.8 847.6 5880.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12482.12 5449.60 -2.290 0.023467 *
## engine.size 74.71 13.44 5.560 1.30e-07 ***
## no.of.cylinders.num 10.44 349.20 0.030 0.976193
## makeaudi 5482.46 1696.86 3.231 0.001534 **
## makebmw 6417.76 1697.59 3.781 0.000230 ***
## makechevrolet 1340.80 2070.16 0.648 0.518239
## makedodge 122.50 1617.06 0.076 0.939718
## makehonda 789.51 1557.01 0.507 0.612893
## makeisuzu -5747.37 1840.53 -3.123 0.002172 **
## makejaguar 7884.21 2281.61 3.456 0.000725 ***
## makemazda 578.93 1508.48 0.384 0.701714
## makemercedes-benz 10631.29 1636.74 6.495 1.30e-09 ***
## makemercury 1652.65 2429.66 0.680 0.497488
## makemitsubishi -477.61 1566.56 -0.305 0.760905
## makenissan -711.12 1513.52 -0.470 0.639187
## makepeugot 144.24 1554.78 0.093 0.926217
## makeplymouth -804.57 1631.51 -0.493 0.622672
## makeporsche -19318.58 2517.60 -7.673 2.45e-12 ***
## makerenault -1979.25 2052.48 -0.964 0.336526
## makesaab 3421.46 1786.57 1.915 0.057489 .
## makesubaru -766.12 1575.01 -0.486 0.627417
## maketoyota -1127.21 1394.37 -0.808 0.420213
## makevolkswagen 657.60 1526.00 0.431 0.667170
## makevolvo 2837.50 1539.09 1.844 0.067324 .
## drive.wheelsfwd -1520.85 868.58 -1.751 0.082114 .
## drive.wheelsrwd 953.46 1050.76 0.907 0.365730
## body.stylehardtop -3043.54 1218.96 -2.497 0.013673 *
## body.stylehatchback -2629.79 1075.47 -2.445 0.015700 *
## body.stylesedan -2786.70 1124.76 -2.478 0.014400 *
## body.stylewagon -3193.67 1272.23 -2.510 0.013185 *
## engine.locationrear 35325.95 2500.39 14.128 < 2e-16 ***
## length 103.74 35.68 2.908 0.004228 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1996 on 142 degrees of freedom
## Multiple R-squared: 0.9463, Adjusted R-squared: 0.9345
## F-statistic: 80.68 on 31 and 142 DF, p-value: < 2.2e-16
# Training error of model5: sum of squared residuals ...
SSE <- sum(residuals(model5)^2)
SSE
## [1] 565514780
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 1802.799
#FINDINGS: the R^2 and Adjusted R^2 values have risen slightly from model4 to model5 (0.9431 -> 0.9463 and 0.9311 -> 0.9345); they have not dropped
# Multi-collinearity check: pairwise Pearson correlations between candidate
# predictors, computed on the training rows only.
with(autosTrain, cor(engine.size, city.mpg))
## [1] -0.6857895
with(autosTrain, cor(engine.size, highway.mpg))
## [1] -0.7040139
with(autosTrain, cor(engine.size, horsepower))
## [1] 0.7800674
with(autosTrain, cor(engine.size, curb.weight))
## [1] 0.8453962
with(autosTrain, cor(engine.size, length))
## [1] 0.6681608
with(autosTrain, cor(curb.weight, city.mpg))
## [1] -0.767168
with(autosTrain, cor(curb.weight, highway.mpg))
## [1] -0.803512
# Full correlation test for engine size against cylinder count.
cor.test(x = autosTrain$engine.size, y = autosTrain$no.of.cylinders.num)
##
## Pearson's product-moment correlation
##
## data: autosTrain$engine.size and autosTrain$no.of.cylinders.num
## t = 21.3, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8047081 0.8878321
## sample estimates:
## cor
## 0.8515353
# Correlation test between curb weight and length on the training rows.
cor.test(x = autosTrain$curb.weight, y = autosTrain$length)
##
## Pearson's product-moment correlation
##
## data: autosTrain$curb.weight and autosTrain$length
## t = 23.985, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8380192 0.9076899
## sample estimates:
## cor
## 0.8774016
# Correlation test between engine size and cylinder count on the training rows.
# NOTE(review): the same test is already run a few statements earlier, and the
# recorded output is identical.
cor.test(x = autosTrain$engine.size, y = autosTrain$no.of.cylinders.num)
##
## Pearson's product-moment correlation
##
## data: autosTrain$engine.size and autosTrain$no.of.cylinders.num
## t = 21.3, df = 172, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8047081 0.8878321
## sample estimates:
## cor
## 0.8515353
#horsepower and engine size are highly correlated
#length and curb.weight are highly correlated
#curb weight and mileage (city/highway mpg) are highly correlated
#engine size and no.of.cylinders are highly correlated
# Model 6: model5's terms plus city.mpg.
# (The earlier note here said "test by removing horsepower", but horsepower is
# not a term in model5 or in this formula; the only change is adding city.mpg.)
model6 <- lm(
  formula = price ~ engine.size + no.of.cylinders.num + make +
    drive.wheels + body.style + engine.location + length + city.mpg,
  data = autosTrain
)
summary(object = model6)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make +
## drive.wheels + body.style + engine.location + length + city.mpg,
## data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5524.9 -1074.8 0.0 821.9 6007.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4597.59 6260.41 -0.734 0.463929
## engine.size 68.19 13.48 5.059 1.29e-06 ***
## no.of.cylinders.num 20.01 343.32 0.058 0.953596
## makeaudi 5153.18 1673.65 3.079 0.002497 **
## makebmw 6614.87 1670.85 3.959 0.000119 ***
## makechevrolet 2103.17 2059.12 1.021 0.308817
## makedodge -88.86 1592.09 -0.056 0.955571
## makehonda 765.18 1530.72 0.500 0.617937
## makeisuzu -5445.55 1813.65 -3.003 0.003168 **
## makejaguar 8634.54 2264.11 3.814 0.000204 ***
## makemazda 682.76 1483.59 0.460 0.646074
## makemercedes-benz 11530.38 1650.92 6.984 1.04e-10 ***
## makemercury 1626.73 2388.61 0.681 0.496965
## makemitsubishi -716.90 1543.20 -0.465 0.642968
## makenissan -589.22 1488.78 -0.396 0.692870
## makepeugot 775.05 1550.30 0.500 0.617899
## makeplymouth -860.24 1604.09 -0.536 0.592612
## makeporsche -19264.81 2475.14 -7.783 1.37e-12 ***
## makerenault -2050.49 2017.99 -1.016 0.311320
## makesaab 3186.34 1759.01 1.811 0.072202 .
## makesubaru -808.90 1548.48 -0.522 0.602224
## maketoyota -836.43 1375.99 -0.608 0.544245
## makevolkswagen 708.88 1500.35 0.472 0.637316
## makevolvo 3190.83 1520.02 2.099 0.037581 *
## drive.wheelsfwd -1085.73 872.41 -1.245 0.215373
## drive.wheelsrwd 990.29 1033.10 0.959 0.339420
## body.stylehardtop -2738.50 1204.88 -2.273 0.024548 *
## body.stylehatchback -2380.89 1062.22 -2.241 0.026561 *
## body.stylesedan -2397.67 1117.23 -2.146 0.033579 *
## body.stylewagon -2857.04 1258.34 -2.270 0.024694 *
## engine.locationrear 35132.15 2459.40 14.285 < 2e-16 ***
## length 74.36 37.10 2.005 0.046923 *
## city.mpg -106.74 43.85 -2.434 0.016169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1962 on 141 degrees of freedom
## Multiple R-squared: 0.9484, Adjusted R-squared: 0.9367
## F-statistic: 81.05 on 32 and 141 DF, p-value: < 2.2e-16
# Training error of model6: sum of squared residuals ...
SSE <- sum(residuals(model6)^2)
SSE
## [1] 542705641
# ... and its root mean square over the training rows.
RMSE <- sqrt(SSE / nrow(autosTrain))
RMSE
## [1] 1766.068
#Findings: select model6 as the baseline
# Print the chosen model's coefficient table again for reference.
summary(object = model6)
##
## Call:
## lm(formula = price ~ engine.size + no.of.cylinders.num + make +
## drive.wheels + body.style + engine.location + length + city.mpg,
## data = autosTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5524.9 -1074.8 0.0 821.9 6007.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4597.59 6260.41 -0.734 0.463929
## engine.size 68.19 13.48 5.059 1.29e-06 ***
## no.of.cylinders.num 20.01 343.32 0.058 0.953596
## makeaudi 5153.18 1673.65 3.079 0.002497 **
## makebmw 6614.87 1670.85 3.959 0.000119 ***
## makechevrolet 2103.17 2059.12 1.021 0.308817
## makedodge -88.86 1592.09 -0.056 0.955571
## makehonda 765.18 1530.72 0.500 0.617937
## makeisuzu -5445.55 1813.65 -3.003 0.003168 **
## makejaguar 8634.54 2264.11 3.814 0.000204 ***
## makemazda 682.76 1483.59 0.460 0.646074
## makemercedes-benz 11530.38 1650.92 6.984 1.04e-10 ***
## makemercury 1626.73 2388.61 0.681 0.496965
## makemitsubishi -716.90 1543.20 -0.465 0.642968
## makenissan -589.22 1488.78 -0.396 0.692870
## makepeugot 775.05 1550.30 0.500 0.617899
## makeplymouth -860.24 1604.09 -0.536 0.592612
## makeporsche -19264.81 2475.14 -7.783 1.37e-12 ***
## makerenault -2050.49 2017.99 -1.016 0.311320
## makesaab 3186.34 1759.01 1.811 0.072202 .
## makesubaru -808.90 1548.48 -0.522 0.602224
## maketoyota -836.43 1375.99 -0.608 0.544245
## makevolkswagen 708.88 1500.35 0.472 0.637316
## makevolvo 3190.83 1520.02 2.099 0.037581 *
## drive.wheelsfwd -1085.73 872.41 -1.245 0.215373
## drive.wheelsrwd 990.29 1033.10 0.959 0.339420
## body.stylehardtop -2738.50 1204.88 -2.273 0.024548 *
## body.stylehatchback -2380.89 1062.22 -2.241 0.026561 *
## body.stylesedan -2397.67 1117.23 -2.146 0.033579 *
## body.stylewagon -2857.04 1258.34 -2.270 0.024694 *
## engine.locationrear 35132.15 2459.40 14.285 < 2e-16 ***
## length 74.36 37.10 2.005 0.046923 *
## city.mpg -106.74 43.85 -2.434 0.016169 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1962 on 141 degrees of freedom
## Multiple R-squared: 0.9484, Adjusted R-squared: 0.9367
## F-statistic: 81.05 on 32 and 141 DF, p-value: < 2.2e-16
# Average price in the training rows, for scale against the RMSE values.
mean(autosTrain[["price"]])
## [1] 12764.43
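#model6 is still the best model on the training data. On average it will be off by about USD 1,766 (the training RMSE printed above), where the average training price is USD 12,764.
#NOTE(review): this note previously quoted USD 2,646.582 and USD 12,949. The first figure matches no output above; the second is the mean price over all rows, not the training mean - confirm which was intended.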
# Score the held-out test rows with model1 and show the predicted prices.
predictTest <- predict(object = model1, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 13404.972 9711.212 18704.716 25931.639 25931.639 6820.443 8105.229
## 34 47 48 54 56 57 58
## 7141.639 11477.793 33800.955 6981.041 3608.477 3608.477 3608.477
## 69 80 95 96 100 106 108
## 21756.083 8105.229 7944.631 7944.631 11638.391 21434.887 11638.391
## 121 126 136 158 159 168 172
## 6820.443 16616.938 11798.990 8105.229 10032.408 15813.947 15813.947
## 188 196 201
## 7944.631 15010.955 15010.955
# Test-set R^2 for model1: 1 - SSE/SST, where SST is the squared deviation of
# the test prices from their own mean.
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.6355735
# Score the held-out test rows with model2.
predictTest <- predict(object = model2, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 12721.834 9692.458 17666.507 26285.347 26285.347 6244.922 7777.161
## 34 47 48 54 56 57 58
## 6627.982 11799.286 35670.306 6436.452 5165.946 5165.946 5165.946
## 69 80 95 96 100 106 108
## 22681.382 7777.161 7585.631 7585.631 11990.816 20922.513 11990.816
## 121 126 136 158 159 168 172
## 6244.922 17928.239 12182.346 7777.161 10075.518 16970.590 16970.590
## 188 196 201
## 7585.631 16012.941 16012.941
# Test-set R^2 for model2 (same 1 - SSE/SST calculation).
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.6675685
# Score the held-out test rows with model3.
predictTest <- predict(object = model3, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 17634.478 18405.441 22743.931 28831.696 28831.696 7015.264 7155.011
## 34 47 48 54 56 57 58
## 7251.671 5237.907 36031.448 7585.451 7981.889 7981.889 7981.889
## 69 80 95 96 100 106 108
## 30467.665 6392.760 7084.329 7084.329 10195.853 15210.762 13494.064
## 121 126 136 158 159 168 172
## 5682.111 23821.158 15166.000 7811.189 9434.593 14304.805 14304.805
## 188 196 201
## 8764.892 19223.980 19223.980
# Test-set R^2 for model3 (same 1 - SSE/SST calculation).
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.7227817
# Score the held-out test rows with model4.
# NOTE(review): the recorded predictions below include a negative price for
# row 126.
predictTest <- predict(object = model4, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 18362.226 18312.963 23290.543 27830.952 27830.952 6911.951 7079.804
## 34 47 48 54 56 57 58
## 7271.439 5785.615 33353.530 8132.087 8888.119 8888.119 8888.119
## 69 80 95 96 100 106 108
## 30162.065 6985.316 7021.464 6529.678 8850.331 17026.369 14060.892
## 121 126 136 158 159 168 172
## 5627.433 -3901.280 15461.072 6621.056 8323.617 13505.279 14158.127
## 188 196 201
## 8928.770 18768.755 18690.048
# Test-set R^2 for model4 (same 1 - SSE/SST calculation).
# NOTE(review): the recorded value below (0.48) is lower than model3's 0.72,
# although model4 fits the training rows better.
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.4796795
# Score the held-out test rows with model5.
predictTest <- predict(object = model5, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 18696.667 18554.570 22759.076 27884.571 28216.553 7647.213 7171.823
## 34 47 48 54 56 57 58
## 6633.251 6932.407 33613.413 7933.952 9203.671 9203.671 9203.671
## 69 80 95 96 100 106 108
## 29437.425 6571.708 6936.534 7124.566 9652.046 16424.292 14204.605
## 121 126 136 158 159 168 172
## 5647.088 -4632.014 15071.855 6855.810 7595.393 13529.401 13943.155
## 188 196 201
## 8969.223 18277.612 18684.588
# Test-set R^2 for model5 (same 1 - SSE/SST calculation).
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.4718531
# Score the held-out test rows with model6.
predictTest <- predict(object = model6, newdata = autosTest)
predictTest
## 10 12 14 17 18 20 24
## 18750.821 18746.437 22818.424 27684.696 28029.394 7792.885 7744.794
## 34 47 48 54 56 57 58
## 7006.434 7033.790 33583.587 7981.605 10260.371 10260.371 10260.371
## 69 80 95 96 100 106 108
## 29491.878 7116.750 7007.202 7046.284 9621.567 16763.638 14888.010
## 121 126 136 158 159 168 172
## 5680.750 -4344.929 15070.546 7026.047 7400.568 13394.064 13751.668
## 188 196 201
## 8140.793 18005.548 18464.924
# Test-set R^2 for model6: 1 - SSE/SST, where SST is the squared deviation of
# the test prices from their own mean.
SSE <- sum((predictTest - autosTest$price)^2)
SST <- sum((mean(autosTest$price) - autosTest$price)^2)
1 - SSE / SST
## [1] 0.4801799