# Read the Automobile data set into R
library(readxl)
# Load the Excel workbook into `automobile_data`.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
automobile_data <- read_excel(
  path = "/Users/pallavisaitu/Desktop/Spring_ANYL510-50/Automobile_project_dataset.xlsx"
)
# Echo the data so its dimensions and first rows appear in the output
automobile_data
## # A tibble: 159 x 26
## symboling `normalized-los… make `fuel-type` aspiration no_of_doors
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 2 164 audi gas std four
## 2 2 164 audi gas std four
## 3 1 158 audi gas std four
## 4 1 158 audi gas turbo four
## 5 2 192 bmw gas std two
## 6 0 192 bmw gas std four
## 7 0 188 bmw gas std two
## 8 0 188 bmw gas std four
## 9 2 121 chev… gas std two
## 10 1 98 chev… gas std two
## # … with 149 more rows, and 20 more variables: body_style <chr>,
## # drive_wheels <chr>, engine_location <chr>, wheel_base <dbl>,
## # length <dbl>, width <dbl>, height <dbl>, curb_weight <dbl>,
## # engine_type <chr>, no_of_cylinders <chr>, engine_size <dbl>,
## # fuel_system <chr>, bore <dbl>, stroke <dbl>, compression_ratio <dbl>,
## # horse_power <dbl>, peak_rpm <dbl>, city_mpg <dbl>, highway_mpg <dbl>,
## # price <dbl>
# Structure of the dataset: its class, dimensions, and each column's type
# together with its first few values
str(automobile_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 159 obs. of 26 variables:
## $ symboling : num 2 2 1 1 2 0 0 0 2 1 ...
## $ normalized-losses: num 164 164 158 158 192 192 188 188 121 98 ...
## $ make : chr "audi" "audi" "audi" "audi" ...
## $ fuel-type : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "turbo" ...
## $ no_of_doors : chr "four" "four" "four" "four" ...
## $ body_style : chr "sedan" "sedan" "sedan" "sedan" ...
## $ drive_wheels : chr "fwd" "4wd" "fwd" "fwd" ...
## $ engine_location : chr "front" "front" "front" "front" ...
## $ wheel_base : num 99.8 99.4 105.8 105.8 101.2 ...
## $ length : num 177 177 193 193 177 ...
## $ width : num 66.2 66.4 71.4 71.4 64.8 64.8 64.8 64.8 60.3 63.6 ...
## $ height : num 54.3 54.3 55.7 55.9 54.3 54.3 54.3 54.3 53.2 52 ...
## $ curb_weight : num 2337 2824 2844 3086 2395 ...
## $ engine_type : chr "ohc" "ohc" "ohc" "ohc" ...
## $ no_of_cylinders : chr "four" "five" "five" "five" ...
## $ engine_size : num 109 136 136 131 108 108 164 164 61 90 ...
## $ fuel_system : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ bore : num 3.19 3.19 3.19 3.13 3.5 3.5 3.31 3.31 2.91 3.03 ...
## $ stroke : num 3.4 3.4 3.4 3.4 2.8 2.8 3.19 3.19 3.03 3.11 ...
## $ compression_ratio: num 10 8 8.5 8.3 8.8 8.8 9 9 9.5 9.6 ...
## $ horse_power : num 102 115 110 140 101 101 121 121 48 70 ...
## $ peak_rpm : num 5500 5500 5500 5500 5800 5800 4250 4250 5100 5400 ...
## $ city_mpg : num 24 18 19 17 23 23 21 21 47 38 ...
## $ highway_mpg : num 30 22 25 20 29 29 28 28 53 43 ...
## $ price : num 13950 17450 17710 23875 16430 ...
# Per-column summary: min/quartiles/mean/max for numeric columns,
# length/class/mode for character columns
summary(automobile_data)
## symboling normalized-losses make fuel-type
## Min. :-2.0000 Min. : 65.0 Length:159 Length:159
## 1st Qu.: 0.0000 1st Qu.: 94.0 Class :character Class :character
## Median : 1.0000 Median :113.0 Mode :character Mode :character
## Mean : 0.7358 Mean :121.1
## 3rd Qu.: 2.0000 3rd Qu.:148.0
## Max. : 3.0000 Max. :256.0
## aspiration no_of_doors body_style
## Length:159 Length:159 Length:159
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## drive_wheels engine_location wheel_base length
## Length:159 Length:159 Min. : 86.60 Min. :141.1
## Class :character Class :character 1st Qu.: 94.50 1st Qu.:165.7
## Mode :character Mode :character Median : 96.90 Median :172.4
## Mean : 98.26 Mean :172.4
## 3rd Qu.:100.80 3rd Qu.:177.8
## Max. :115.60 Max. :202.6
## width height curb_weight engine_type
## Min. :60.30 Min. :49.40 Min. :1488 Length:159
## 1st Qu.:64.00 1st Qu.:52.25 1st Qu.:2066 Class :character
## Median :65.40 Median :54.10 Median :2340 Mode :character
## Mean :65.61 Mean :53.90 Mean :2461
## 3rd Qu.:66.50 3rd Qu.:55.50 3rd Qu.:2810
## Max. :71.70 Max. :59.80 Max. :4066
## no_of_cylinders engine_size fuel_system bore
## Length:159 Min. : 61.0 Length:159 Min. :2.54
## Class :character 1st Qu.: 97.0 Class :character 1st Qu.:3.05
## Mode :character Median :110.0 Mode :character Median :3.27
## Mean :119.2 Mean :3.30
## 3rd Qu.:135.0 3rd Qu.:3.56
## Max. :258.0 Max. :3.94
## stroke compression_ratio horse_power peak_rpm
## Min. :2.070 Min. : 7.00 Min. : 48.00 Min. :4150
## 1st Qu.:3.105 1st Qu.: 8.70 1st Qu.: 69.00 1st Qu.:4800
## Median :3.270 Median : 9.00 Median : 88.00 Median :5200
## Mean :3.236 Mean :10.16 Mean : 95.84 Mean :5114
## 3rd Qu.:3.410 3rd Qu.: 9.40 3rd Qu.:114.00 3rd Qu.:5500
## Max. :4.170 Max. :23.00 Max. :200.00 Max. :6600
## city_mpg highway_mpg price
## Min. :15.00 Min. :18.00 Min. : 5118
## 1st Qu.:23.00 1st Qu.:28.00 1st Qu.: 7372
## Median :26.00 Median :32.00 Median : 9233
## Mean :26.52 Mean :32.08 Mean :11446
## 3rd Qu.:31.00 3rd Qu.:37.00 3rd Qu.:14720
## Max. :49.00 Max. :54.00 Max. :35056
# Exploratory Data Analysis
# Coerce price to numeric, then draw a histogram of its distribution
automobile_data$price <- as.numeric(automobile_data$price)
hist(
  automobile_data$price,
  main = "Automobile Price",
  xlab = "Price",
  ylab = "No. of Cars",
  xlim = c(0, 40000),
  ylim = c(0, 100),
  col = rainbow(10)
)

# Provides more insight into makes (car brands), their associated price ranges, and the price distribution.
# Check covariance
# Keep only the continuous numeric columns used for the spread, covariance
# and correlation computations
automobile_numeric <- automobile_data[, c(
  "wheel_base", "length", "width", "height", "curb_weight",
  "engine_size", "bore", "stroke", "compression_ratio",
  "horse_power", "peak_rpm", "city_mpg", "highway_mpg", "price"
)]
# Standard deviation of every selected column, returned as a named list
standard_deviation <- lapply(automobile_numeric, FUN = sd)
standard_deviation
## $wheel_base
## [1] 5.167416
##
## $length
## [1] 11.52318
##
## $width
## [1] 1.947883
##
## $height
## [1] 2.268761
##
## $curb_weight
## [1] 481.9413
##
## $engine_size
## [1] 30.46079
##
## $bore
## [1] 0.2673356
##
## $stroke
## [1] 0.2948877
##
## $compression_ratio
## [1] 3.889475
##
## $horse_power
## [1] 30.71858
##
## $peak_rpm
## [1] 465.7549
##
## $city_mpg
## [1] 6.097142
##
## $highway_mpg
## [1] 6.459189
##
## $price
## [1] 5877.856
# Pairwise covariance matrix of the numeric columns
covarriance <- cov(x = automobile_numeric, method = "pearson")
covarriance
## wheel_base length width height
## wheel_base 26.7021877 51.8955625 8.2033102 6.51561022
## length 51.8955625 132.7836048 18.8171734 13.05209736
## width 8.2033102 18.8171734 3.7942465 1.29354908
## height 6.5156102 13.0520974 1.2935491 5.14727808
## curb_weight 2017.6689157 4838.7107316 817.2837593 401.33806226
## engine_size 102.1872821 254.8133031 46.2362551 7.67672558
## bore 0.7986881 1.9910172 0.2981509 0.15456337
## stroke 0.2551595 0.4114116 0.1129391 -0.06109092
## compression_ratio 5.8573446 8.2832058 1.9603648 2.05877920
## horse_power 82.0580248 237.8940490 40.8006090 2.39166866
## peak_rpm -696.1147600 -1256.2686092 -210.6747074 -259.80136932
## city_mpg -18.2944590 -50.9053698 -7.9178887 -2.76296075
## highway_mpg -20.4185694 -53.9321511 -8.7234058 -3.31387230
## price 22306.7472056 51540.4904745 9656.0640793 3265.00236048
## curb_weight engine_size bore stroke
## wheel_base 2017.66892 102.187282 0.79868808 0.25515954
## length 4838.71073 254.813303 1.99101724 0.41141155
## width 817.28376 46.236255 0.29815094 0.11293910
## height 401.33806 7.676726 0.15456337 -0.06109092
## curb_weight 232267.43643 13045.310246 83.20384324 24.70652058
## engine_size 13045.31025 927.859804 4.85123716 2.69190709
## bore 83.20384 4.851237 0.07146834 -0.00808688
## stroke 24.70652 2.691907 -0.00808688 0.08695876
## compression_ratio 421.24478 16.716641 0.01572074 0.27938390
## horse_power 11697.01011 759.866372 4.60078019 1.34794403
## peak_rpm -58358.57217 -4038.912109 -38.88149829 -1.55363825
## city_mpg -2239.56636 -129.846788 -0.96240785 -0.03605843
## highway_mpg -2457.16961 -140.499642 -1.02026351 -0.02463657
## price 2531484.42373 150664.865417 838.93414816 278.48071611
## compression_ratio horse_power peak_rpm
## wheel_base 5.85734464 82.058025 -6.961148e+02
## length 8.28320576 237.894049 -1.256269e+03
## width 1.96036482 40.800609 -2.106747e+02
## height 2.05877920 2.391669 -2.598014e+02
## curb_weight 421.24477908 11697.010111 -5.835857e+04
## engine_size 16.71664079 759.866372 -4.038912e+03
## bore 0.01572074 4.600780 -3.888150e+01
## stroke 0.27938390 1.347944 -1.553638e+00
## compression_ratio 15.12801263 -19.392092 -7.549936e+02
## horse_power -19.39209219 943.631319 1.059555e+03
## peak_rpm -754.99361118 1059.555370 2.169276e+05
## city_mpg 6.60054454 -156.806504 -1.503065e+02
## highway_mpg 5.56427394 -164.277685 -9.860680e+01
## price 4786.37423215 137202.392206 -4.706443e+05
## city_mpg highway_mpg price
## wheel_base -1.829446e+01 -2.041857e+01 22306.7472
## length -5.090537e+01 -5.393215e+01 51540.4905
## width -7.917889e+00 -8.723406e+00 9656.0641
## height -2.762961e+00 -3.313872e+00 3265.0024
## curb_weight -2.239566e+03 -2.457170e+03 2531484.4237
## engine_size -1.298468e+02 -1.404996e+02 150664.8654
## bore -9.624078e-01 -1.020264e+00 838.9341
## stroke -3.605843e-02 -2.463657e-02 278.4807
## compression_ratio 6.600545e+00 5.564274e+00 4786.3742
## horse_power -1.568065e+02 -1.642777e+02 137202.3922
## peak_rpm -1.503065e+02 -9.860680e+01 -470644.3356
## city_mpg 3.717515e+01 3.827983e+01 -24809.7693
## highway_mpg 3.827983e+01 4.172112e+01 -27339.0727
## price -2.480977e+04 -2.733907e+04 34549193.4517
# Check correlation
#install.packages("Hmisc")
library("psych")
## Warning: package 'psych' was built under R version 3.5.2
# Pairwise Pearson correlation matrix of the numeric columns
# (cor() converts a data frame to a matrix itself)
correlation_matrix <- cor(automobile_numeric, method = "pearson")
correlation_matrix
## wheel_base length width height curb_weight
## wheel_base 1.0000000 0.8715345 0.8149912 0.55576713 0.8101815
## length 0.8715345 1.0000000 0.8383385 0.49925137 0.8712911
## width 0.8149912 0.8383385 1.0000000 0.29270580 0.8705945
## height 0.5557671 0.4992514 0.2927058 1.00000000 0.3670518
## curb_weight 0.8101815 0.8712911 0.8705945 0.36705181 1.0000000
## engine_size 0.6492056 0.7259533 0.7792534 0.11108260 0.8886261
## bore 0.5781585 0.6463176 0.5725542 0.25483608 0.6457916
## stroke 0.1674487 0.1210731 0.1966187 -0.09131269 0.1738444
## compression_ratio 0.2914314 0.1848142 0.2587517 0.23330821 0.2247240
## horse_power 0.5169475 0.6720633 0.6818718 0.03431713 0.7900954
## peak_rpm -0.2892345 -0.2340738 -0.2322160 -0.24586416 -0.2599879
## city_mpg -0.5806572 -0.7245444 -0.6666844 -0.19973748 -0.7621552
## highway_mpg -0.6117499 -0.7245987 -0.6933385 -0.22613562 -0.7893380
## price 0.7344189 0.7609522 0.8433705 0.24483625 0.8936391
## engine_size bore stroke compression_ratio
## wheel_base 0.6492056 0.57815853 0.16744868 0.29143145
## length 0.7259533 0.64631755 0.12107308 0.18481418
## width 0.7792534 0.57255416 0.19661872 0.25875169
## height 0.1110826 0.25483608 -0.09131269 0.23330821
## curb_weight 0.8886261 0.64579158 0.17384442 0.22472399
## engine_size 1.0000000 0.59573688 0.29968307 0.14109671
## bore 0.5957369 1.00000000 -0.10258113 0.01511908
## stroke 0.2996831 -0.10258113 1.00000000 0.24358681
## compression_ratio 0.1410967 0.01511908 0.24358681 1.00000000
## horse_power 0.8120726 0.56023917 0.14880380 -0.16230524
## peak_rpm -0.2846858 -0.31226891 -0.01131191 -0.41676855
## city_mpg -0.6991393 -0.59044028 -0.02005506 0.27833158
## highway_mpg -0.7140951 -0.59085039 -0.01293438 0.22148258
## price 0.8414956 0.53389035 0.16066434 0.20936147
## horse_power peak_rpm city_mpg highway_mpg
## wheel_base 0.51694753 -0.28923445 -0.58065720 -0.61174990
## length 0.67206330 -0.23407384 -0.72454445 -0.72459867
## width 0.68187176 -0.23221605 -0.66668439 -0.69333851
## height 0.03431713 -0.24586416 -0.19973748 -0.22613562
## curb_weight 0.79009539 -0.25998788 -0.76215523 -0.78933796
## engine_size 0.81207263 -0.28468581 -0.69913926 -0.71409510
## bore 0.56023917 -0.31226891 -0.59044028 -0.59085039
## stroke 0.14880380 -0.01131191 -0.02005506 -0.01293438
## compression_ratio -0.16230524 -0.41676855 0.27833158 0.22148258
## horse_power 1.00000000 0.07405682 -0.83721415 -0.82794105
## peak_rpm 0.07405682 1.00000000 -0.05292904 -0.03277717
## city_mpg -0.83721415 -0.05292904 1.00000000 0.97199880
## highway_mpg -0.82794105 -0.03277717 0.97199880 1.00000000
## price 0.75987395 -0.17191607 -0.69227306 -0.72009010
## price
## wheel_base 0.7344189
## length 0.7609522
## width 0.8433705
## height 0.2448363
## curb_weight 0.8936391
## engine_size 0.8414956
## bore 0.5338904
## stroke 0.1606643
## compression_ratio 0.2093615
## horse_power 0.7598739
## peak_rpm -0.1719161
## city_mpg -0.6922731
## highway_mpg -0.7200901
## price 1.0000000
# Regression Analysis
# Select the candidate predictors (four numeric columns plus the character
# column no_of_doors) together with the response, price
auto_regression <- subset(
  automobile_data,
  select = c(horse_power, city_mpg, peak_rpm, curb_weight, no_of_doors, price)
)
library(caTools)
# Fixed seed so the random train/validation split is reproducible
set.seed(2017)
# Ratio handed to sample.split(); rows flagged TRUE form the training set
SplitRatio <- 0.8
# Split the data into training and validation sets.
# The ratio is passed through the variable so it is defined in one place only.
split <- sample.split(auto_regression$price, SplitRatio = SplitRatio)
train.sample <- subset(auto_regression, split == TRUE)
valid.sample <- subset(auto_regression, split == FALSE)
# Fit the full linear model on the training set
fit <- lm(
  formula = price ~ horse_power + city_mpg + curb_weight + peak_rpm + no_of_doors,
  data = train.sample
)
summary(fit)
##
## Call:
## lm(formula = price ~ horse_power + city_mpg + curb_weight + peak_rpm +
## no_of_doors, data = train.sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6658.7 -1487.8 -152.7 987.7 9029.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.357e+04 5.482e+03 -4.299 3.49e-05 ***
## horse_power 5.652e+00 1.844e+01 0.307 0.760
## city_mpg 5.123e+01 7.657e+01 0.669 0.505
## curb_weight 1.187e+01 1.058e+00 11.221 < 2e-16 ***
## peak_rpm 7.273e-01 5.886e-01 1.236 0.219
## no_of_doorstwo 7.147e+02 5.117e+02 1.397 0.165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2570 on 121 degrees of freedom
## Multiple R-squared: 0.8278, Adjusted R-squared: 0.8207
## F-statistic: 116.3 on 5 and 121 DF, p-value: < 2.2e-16
# Backward elimination, step 1: refit without no_of_doors, which was not
# significant in the previous fit (p = 0.165)
fit <- lm(
  price ~ horse_power + city_mpg + peak_rpm + curb_weight,
  data = train.sample
)
summary(fit)
##
## Call:
## lm(formula = price ~ horse_power + city_mpg + peak_rpm + curb_weight,
## data = train.sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6900.2 -1329.0 -12.2 1022.5 9662.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.366e+04 5.503e+03 -4.300 3.46e-05 ***
## horse_power 1.417e+01 1.747e+01 0.811 0.419
## city_mpg 7.007e+01 7.566e+01 0.926 0.356
## peak_rpm 7.426e-01 5.908e-01 1.257 0.211
## curb_weight 1.145e+01 1.020e+00 11.235 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2580 on 122 degrees of freedom
## Multiple R-squared: 0.825, Adjusted R-squared: 0.8193
## F-statistic: 143.8 on 4 and 122 DF, p-value: < 2.2e-16
# Backward elimination, step 2: refit without city_mpg, which was not
# significant in the previous fit (p = 0.356)
fit <- lm(
  price ~ horse_power + peak_rpm + curb_weight,
  data = train.sample
)
summary(fit)
##
## Call:
## lm(formula = price ~ horse_power + peak_rpm + curb_weight, data = train.sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6882.4 -1310.7 -68.7 1185.8 9701.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.002e+04 3.842e+03 -5.209 7.74e-07 ***
## horse_power 5.739e+00 1.490e+01 0.385 0.701
## peak_rpm 6.720e-01 5.855e-01 1.148 0.253
## curb_weight 1.120e+01 9.826e-01 11.403 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2578 on 123 degrees of freedom
## Multiple R-squared: 0.8238, Adjusted R-squared: 0.8195
## F-statistic: 191.7 on 3 and 123 DF, p-value: < 2.2e-16
# Backward elimination, step 3: refit without peak_rpm, which was not
# significant in the previous fit (p = 0.253); this is the final model
fit <- lm(
  price ~ horse_power + curb_weight,
  data = train.sample
)
summary(fit)
##
## Call:
## lm(formula = price ~ horse_power + curb_weight, data = train.sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6765 -1260 -160 1223 9738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.584e+04 1.244e+03 -12.738 <2e-16 ***
## horse_power 1.431e+01 1.291e+01 1.109 0.27
## curb_weight 1.057e+01 8.119e-01 13.016 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2582 on 124 degrees of freedom
## Multiple R-squared: 0.8219, Adjusted R-squared: 0.819
## F-statistic: 286.1 on 2 and 124 DF, p-value: < 2.2e-16
#Residual Plot Analysis
# Lay the plotting device out as a 2 x 2 grid so the diagnostic plots drawn
# by plot(fit) are shown together. The par() setting is not restored afterwards.
par(mfrow = c(2,2))
plot(fit)

# Evaluate the final linear model
# Store the model's predicted price next to the actual price in both samples.
# predict() only reads the predictors named in the model formula, so each
# sample can be passed as-is.
train.sample$pred.price <- predict(fit, newdata = train.sample)
valid.sample$pred.price <- predict(fit, newdata = valid.sample)
# Re-print the final model for reference
summary(fit)
##
## Call:
## lm(formula = price ~ horse_power + curb_weight, data = train.sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6765 -1260 -160 1223 9738
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.584e+04 1.244e+03 -12.738 <2e-16 ***
## horse_power 1.431e+01 1.291e+01 1.109 0.27
## curb_weight 1.057e+01 8.119e-01 13.016 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2582 on 124 degrees of freedom
## Multiple R-squared: 0.8219, Adjusted R-squared: 0.819
## F-statistic: 286.1 on 2 and 124 DF, p-value: < 2.2e-16
# Training set: check how well the final model fits the data it was trained on
# Correlation between actual and predicted price (rounded to 2 decimals)
train.corr <- round(cor(train.sample$price, train.sample$pred.price), 2)
# Root mean square error: square each error, average, then take the root.
# (sqrt(mean(err)^2) would only give |mean error|, which is ~0 on the
# training data of a least-squares fit with an intercept.)
train.RMSE <- round(sqrt(mean((train.sample$pred.price - train.sample$price)^2)), 2)
# Mean absolute error
train.MAE <- round(mean(abs(train.sample$pred.price - train.sample$price)), 2)
# Squared (rounded) correlation, RMSE, MAE
c(train.corr^2, train.RMSE, train.MAE)
## NOTE: the output below was recorded before the RMSE fix; its second value
## (0.0000) is |mean error|, not RMSE -- re-run to refresh it.
## [1] 0.8281 0.0000 1809.2900
# Validation set: check how well the final model predicts unseen data
# Correlation between actual and predicted price (rounded to 2 decimals)
valid.corr <- round(cor(valid.sample$price, valid.sample$pred.price), 2)
# Root mean square error (squared errors are averaged before the root)
valid.RMSE <- round(sqrt(mean((valid.sample$pred.price - valid.sample$price)^2)), 2)
# Mean absolute error
valid.MAE <- round(mean(abs(valid.sample$pred.price - valid.sample$price)), 2)
# Squared (rounded) correlation, RMSE, MAE
c(valid.corr^2, valid.RMSE, valid.MAE)
## NOTE: the output below was recorded before the RMSE fix; its second value
## (457.4800) is |mean error|, not RMSE -- re-run to refresh it.
## [1] 0.7396 457.4800 2013.6400
# These results could be improved by removing extreme values and normalising the variables