The given dataset is about cars and their prices. OBJECTIVE: To create a predictive model that prices a car based on its features.
# Load the car listings; "?" entries are read as NA.
# NOTE(review): the absolute path is specific to one machine - adjust it (or
# switch to a project-relative path) before running elsewhere.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,  # spelled out: `T` is an ordinary variable and can be reassigned
  na.strings = "?"
)
# Number of rows and columns in the loaded data.
dim(car)
## [1] 29466 17
We looked at all the variables in the dataset to identify the significant ones and to set aside those that are not helpful in designing the model.
# Put the columns of `car` on the search path so they can be referred to by
# bare name (e.g. `price` instead of `car$price`).
# NOTE(review): attach() works on a copy of the data frame as it is at this
# point; later changes to `car` are not reflected in the attached columns.
attach(car)
# Per-column overview: quantiles for the numeric columns and level counts for
# the categorical ones (see the output below).
summary(car)
## X trim subTrim condition isOneOwner
## Min. : 2 550 :21836 Hybrid: 190 CPO : 3586 f:25340
## 1st Qu.:13231 430 : 2071 unsp :29276 New :10317 t: 4126
## Median :26254 500 : 2002 Used:15563
## Mean :26269 63 AMG : 1413
## 3rd Qu.:39293 600 : 527
## Max. :52572 350 : 416
## (Other): 1201
## mileage year color displacement
## Min. : 1 Min. :1988 Black :12838 4.6 L :13599
## 1st Qu.: 14 1st Qu.:2007 Silver : 6095 5.5 L : 9154
## Median : 26120 Median :2012 White : 4418 4.3 L : 2071
## Mean : 40387 Mean :2010 Gray : 2007 5.0 L : 2002
## 3rd Qu.: 68234 3rd Qu.:2015 Blue : 1599 6.0 L : 403
## Max. :488525 Max. :2015 unsp : 1467 6.3 L : 391
## (Other): 1042 (Other): 1846
## fuel state region soundSystem
## Diesel : 312 CA : 5262 SoA :7805 Alpine : 2
## Gasoline:28628 FL : 3559 Pac :5844 Bang Olufsen : 177
## Hybrid : 189 NY : 2754 Mid :5824 Bose : 943
## unsp : 337 TX : 2458 WSC :2865 Boston Acoustic: 1
## NJ : 2266 ENC :2496 Harman Kardon : 4120
## GA : 1408 New :1421 Premium : 9694
## (Other):11759 (Other):3211 unsp :14529
## wheelType wheelSize featureCount price
## Alloy :14565 unsp :25293 Min. : 0.00 Min. : 599
## Chrome : 80 18 : 1774 1st Qu.: 18.00 1st Qu.: 28995
## Premium: 424 19 : 1297 Median : 53.00 Median : 56991
## Steel : 49 20 : 813 Mean : 46.48 Mean : 67001
## unsp :14348 17 : 149 3rd Qu.: 70.00 3rd Qu.:108815
## 16 : 107 Max. :132.00 Max. :299000
## (Other): 33
Looking at the variables and their nature, we now know the CATEGORICAL and the QUANTITATIVE variables. VARIABLES:
# Is the value 488525 really an outlier? An outlier should only be dropped when it is obviously invalid or creates a spurious relationship.
# Show every listing whose recorded mileage exceeds 400,000.
is_extreme_mileage <- car$mileage > 400000
car[is_extreme_mileage, ]
## X trim subTrim condition isOneOwner mileage year color
## 9011 16086 550 unsp Used f 488525 2012 White
## 12567 22462 500 unsp Used f 407725 2000 Gold
## 13073 23304 550 unsp CPO t 411103 2012 Black
## 26285 46886 500 unsp Used f 467834 2006 Black
## displacement fuel state region soundSystem wheelType wheelSize
## 9011 4.6 L Gasoline NJ Mid unsp Alloy unsp
## 12567 5.0 L Gasoline CA Pac unsp Alloy unsp
## 13073 4.6 L Gasoline FL SoA Premium Alloy unsp
## 26285 5.0 L Gasoline TX WSC unsp unsp unsp
## featureCount price
## 9011 54 46995
## 12567 20 8995
## 13073 64 59892
## 26285 54 21995
It turns out that this one outlier changes the estimates a little, but not much. We did not drop the data.
# Reload the full data set for the modelling below.
# `na.strings = "?"` matches the first load of this file, so both reads treat
# "?" entries the same way (as NA) instead of as an ordinary value.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,
  na.strings = "?"
)
# Re-attach so the bare column names used below resolve to this copy; this
# masks the columns attached earlier (hence the message in the output).
attach(car)
## The following objects are masked from car (pos = 3):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
library(leaps)
# we would have 129 variables if we do the full model.. (50 states!!)
############### initial manual subset selection ###############
# start from doing linear regression on the intuitively most possible variables
# Baseline model: price explained by trim, condition, mileage and year.
# `data = car` makes the columns resolve from the data frame itself rather
# than from whatever attach() has placed on the search path.
summary(lm(price ~ trim + condition + mileage + year, data = car))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69342 -6041 -1072 3762 272938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.433e+06 9.136e+04 -103.246 < 2e-16 ***
## trim350 -3.830e+04 1.148e+03 -33.346 < 2e-16 ***
## trim400 -4.202e+04 1.323e+03 -31.760 < 2e-16 ***
## trim420 1.364e+03 1.261e+03 1.082 0.279449
## trim430 -2.678e+04 9.226e+02 -29.028 < 2e-16 ***
## trim450 -4.131e+04 1.213e+04 -3.405 0.000662 ***
## trim500 -2.367e+04 9.170e+02 -25.807 < 2e-16 ***
## trim55 AMG -2.560e+04 1.159e+03 -22.092 < 2e-16 ***
## trim550 -3.580e+04 9.900e+02 -36.164 < 2e-16 ***
## trim600 -1.524e+04 1.059e+03 -14.387 < 2e-16 ***
## trim63 AMG 1.227e+03 1.040e+03 1.180 0.237837
## trim65 AMG 2.031e+04 1.196e+03 16.976 < 2e-16 ***
## trimunsp 1.816e+04 1.644e+03 11.045 < 2e-16 ***
## conditionNew 3.741e+04 2.612e+02 143.232 < 2e-16 ***
## conditionUsed -5.484e+03 2.523e+02 -21.738 < 2e-16 ***
## mileage -1.351e-01 3.236e-03 -41.733 < 2e-16 ***
## year 4.738e+03 4.561e+01 103.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12090 on 29449 degrees of freedom
## Multiple R-squared: 0.9273, Adjusted R-squared: 0.9273
## F-statistic: 2.349e+04 on 16 and 29449 DF, p-value: < 2.2e-16
############### stepwise selection, trial and error ###############
# adjusted R square is 0.9273, pretty good!!
# Same model with trim left out, to see how much trim contributes.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(price ~ condition + mileage + year, data = car))
##
## Call:
## lm(formula = price ~ condition + mileage + year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57657 -8171 -3915 2095 266422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.785e+06 8.817e+04 -76.96 <2e-16 ***
## conditionNew 4.235e+04 3.214e+02 131.77 <2e-16 ***
## conditionUsed -5.938e+03 3.287e+02 -18.07 <2e-16 ***
## mileage -1.646e-01 4.222e-03 -38.99 <2e-16 ***
## year 3.406e+03 4.379e+01 77.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15970 on 29461 degrees of freedom
## Multiple R-squared: 0.8732, Adjusted R-squared: 0.8732
## F-statistic: 5.073e+04 on 4 and 29461 DF, p-value: < 2.2e-16
# adjusted R: 0.8732 so we better keep trim even though it's cumbersome (too many dummies!)
# Baseline model plus fuel.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(price ~ trim + condition + mileage + year + fuel, data = car))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75103 -5823 -972 3749 272745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.321e+06 9.083e+04 -102.621 < 2e-16 ***
## trim350 -3.943e+04 1.444e+03 -27.314 < 2e-16 ***
## trim400 -3.210e+04 1.197e+04 -2.681 0.007339 **
## trim420 1.447e+03 1.245e+03 1.162 0.245110
## trim430 -2.638e+04 9.109e+02 -28.957 < 2e-16 ***
## trim450 -4.062e+04 1.197e+04 -3.393 0.000692 ***
## trim500 -2.330e+04 9.053e+02 -25.733 < 2e-16 ***
## trim55 AMG -2.519e+04 1.144e+03 -22.020 < 2e-16 ***
## trim550 -3.503e+04 9.796e+02 -35.757 < 2e-16 ***
## trim600 -1.554e+04 1.046e+03 -14.851 < 2e-16 ***
## trim63 AMG 1.592e+03 1.028e+03 1.548 0.121728
## trim65 AMG 2.024e+04 1.182e+03 17.126 < 2e-16 ***
## trimunsp 5.217e+03 1.687e+03 3.093 0.001983 **
## conditionNew 3.708e+04 2.594e+02 142.920 < 2e-16 ***
## conditionUsed -5.483e+03 2.490e+02 -22.019 < 2e-16 ***
## mileage -1.362e-01 3.194e-03 -42.643 < 2e-16 ***
## year 4.683e+03 4.527e+01 103.458 < 2e-16 ***
## fuelGasoline -2.553e+03 1.307e+03 -1.953 0.050847 .
## fuelHybrid -1.179e+04 1.204e+04 -0.980 0.327171
## fuelunsp 1.742e+04 1.476e+03 11.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11930 on 29446 degrees of freedom
## Multiple R-squared: 0.9293, Adjusted R-squared: 0.9292
## F-statistic: 2.036e+04 on 19 and 29446 DF, p-value: < 2.2e-16
# adjusted R 0.9292
# Fuel model plus featureCount.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + featureCount,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## featureCount)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75454 -5839 -959 3766 272804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.367e+06 9.128e+04 -102.619 < 2e-16 ***
## trim350 -3.945e+04 1.443e+03 -27.334 < 2e-16 ***
## trim400 -3.246e+04 1.197e+04 -2.713 0.006676 **
## trim420 1.443e+03 1.244e+03 1.160 0.246042
## trim430 -2.635e+04 9.105e+02 -28.939 < 2e-16 ***
## trim450 -4.060e+04 1.197e+04 -3.392 0.000694 ***
## trim500 -2.327e+04 9.050e+02 -25.718 < 2e-16 ***
## trim55 AMG -2.514e+04 1.144e+03 -21.988 < 2e-16 ***
## trim550 -3.502e+04 9.792e+02 -35.761 < 2e-16 ***
## trim600 -1.550e+04 1.046e+03 -14.823 < 2e-16 ***
## trim63 AMG 1.592e+03 1.028e+03 1.548 0.121578
## trim65 AMG 2.028e+04 1.182e+03 17.168 < 2e-16 ***
## trimunsp 5.225e+03 1.686e+03 3.099 0.001945 **
## conditionNew 3.691e+04 2.616e+02 141.093 < 2e-16 ***
## conditionUsed -5.566e+03 2.495e+02 -22.309 < 2e-16 ***
## mileage -1.360e-01 3.193e-03 -42.590 < 2e-16 ***
## year 4.707e+03 4.550e+01 103.444 < 2e-16 ***
## fuelGasoline -2.571e+03 1.307e+03 -1.967 0.049159 *
## fuelHybrid -1.143e+04 1.203e+04 -0.950 0.342302
## fuelunsp 1.716e+04 1.477e+03 11.624 < 2e-16 ***
## featureCount -1.203e+01 2.449e+00 -4.911 9.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11930 on 29445 degrees of freedom
## Multiple R-squared: 0.9293, Adjusted R-squared: 0.9293
## F-statistic: 1.936e+04 on 20 and 29445 DF, p-value: < 2.2e-16
# 0.9293 little improvement, consider removing featureCount
# Fuel model plus displacement (entered as a categorical variable).
# `data = car` avoids depending on the attach()ed search path.
# NOTE(review): the summary reports 4 displacement coefficients as "not
# defined because of singularities" - presumably those levels are fully
# determined by the other predictors (e.g. trim); confirm before interpreting
# the individual trim/displacement estimates.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65934 -5093 -968 3408 271481
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.215e+06 9.754e+04 -84.220 < 2e-16 ***
## trim350 2.815e+04 1.363e+04 2.065 0.038964 *
## trim400 -7.461e+03 1.808e+04 -0.413 0.679841
## trim420 5.610e+04 1.358e+04 4.131 3.62e-05 ***
## trim430 3.206e+04 1.356e+04 2.364 0.018065 *
## trim450 -3.708e+04 1.536e+04 -2.413 0.015813 *
## trim500 3.482e+04 1.356e+04 2.568 0.010242 *
## trim55 AMG 3.353e+04 1.358e+04 2.470 0.013514 *
## trim550 -4.726e+03 1.089e+04 -0.434 0.664343
## trim600 5.153e+03 1.090e+04 0.473 0.636298
## trim63 AMG 4.646e+04 1.091e+04 4.259 2.06e-05 ***
## trim65 AMG 6.386e+03 1.095e+04 0.583 0.559681
## trimunsp 2.808e+04 1.093e+04 2.569 0.010196 *
## conditionNew 3.596e+04 2.372e+02 151.602 < 2e-16 ***
## conditionUsed -4.382e+03 2.292e+02 -19.120 < 2e-16 ***
## mileage -1.319e-01 2.920e-03 -45.174 < 2e-16 ***
## year 4.100e+03 4.829e+01 84.893 < 2e-16 ***
## fuelGasoline 8.141e+02 5.209e+03 0.156 0.875823
## fuelHybrid -9.159e+03 1.205e+04 -0.760 0.447068
## fuelunsp 1.350e+04 5.162e+03 2.615 0.008930 **
## displacement3.2 L 5.491e+04 1.358e+04 4.044 5.26e-05 ***
## displacement3.5 L 3.932e+04 4.948e+03 7.947 1.97e-15 ***
## displacement3.7 L -7.844e+03 5.354e+03 -1.465 0.142913
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.621e+04 8.194e+03 4.419 9.93e-06 ***
## displacement4.7 L 5.858e+04 8.203e+03 7.142 9.42e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.193e+04 8.194e+03 3.897 9.76e-05 ***
## displacement5.8 L 3.133e+04 8.294e+03 3.777 0.000159 ***
## displacement6.0 L 7.898e+04 8.226e+03 9.602 < 2e-16 ***
## displacement6.3 L -7.724e+03 8.225e+03 -0.939 0.347655
## displacement8.0 L 5.878e+04 1.359e+04 4.324 1.54e-05 ***
## displacementunsp 4.121e+04 8.164e+03 5.048 4.50e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10830 on 29435 degrees of freedom
## Multiple R-squared: 0.9417, Adjusted R-squared: 0.9417
## F-statistic: 1.586e+04 on 30 and 29435 DF, p-value: < 2.2e-16
# adjusted R-squared: 0.9417. If displacement (presumably) is entered as a continuous variable instead, it is 0.9311 - to be confirmed.
# Displacement model plus color.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement + color,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + color)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65951 -5103 -962 3369 272133
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.216e+06 9.748e+04 -84.279 < 2e-16 ***
## trim350 2.837e+04 1.361e+04 2.085 0.03705 *
## trim400 -8.156e+03 1.804e+04 -0.452 0.65123
## trim420 5.613e+04 1.355e+04 4.141 3.47e-05 ***
## trim430 3.256e+04 1.353e+04 2.406 0.01613 *
## trim450 -3.554e+04 1.534e+04 -2.318 0.02047 *
## trim500 3.527e+04 1.353e+04 2.606 0.00916 **
## trim55 AMG 3.418e+04 1.355e+04 2.523 0.01164 *
## trim550 -4.386e+03 1.087e+04 -0.403 0.68660
## trim600 5.573e+03 1.088e+04 0.512 0.60840
## trim63 AMG 4.664e+04 1.089e+04 4.284 1.84e-05 ***
## trim65 AMG 6.966e+03 1.093e+04 0.638 0.52376
## trimunsp 2.847e+04 1.091e+04 2.610 0.00906 **
## conditionNew 3.596e+04 2.381e+02 151.007 < 2e-16 ***
## conditionUsed -4.368e+03 2.289e+02 -19.081 < 2e-16 ***
## mileage -1.314e-01 2.917e-03 -45.065 < 2e-16 ***
## year 4.100e+03 4.827e+01 84.940 < 2e-16 ***
## fuelGasoline 6.670e+02 5.198e+03 0.128 0.89791
## fuelHybrid -7.896e+03 1.202e+04 -0.657 0.51126
## fuelunsp 1.341e+04 5.151e+03 2.604 0.00921 **
## displacement3.2 L 5.501e+04 1.355e+04 4.060 4.91e-05 ***
## displacement3.5 L 3.895e+04 4.939e+03 7.887 3.20e-15 ***
## displacement3.7 L -7.493e+03 5.343e+03 -1.403 0.16076
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.621e+04 8.176e+03 4.429 9.49e-06 ***
## displacement4.7 L 5.851e+04 8.185e+03 7.148 9.01e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.202e+04 8.177e+03 3.917 9.00e-05 ***
## displacement5.8 L 3.152e+04 8.276e+03 3.808 0.00014 ***
## displacement6.0 L 7.886e+04 8.208e+03 9.607 < 2e-16 ***
## displacement6.3 L -7.486e+03 8.207e+03 -0.912 0.36170
## displacement8.0 L 5.884e+04 1.356e+04 4.338 1.44e-05 ***
## displacementunsp 4.093e+04 8.146e+03 5.024 5.09e-07 ***
## colorBlack -4.331e+02 7.547e+02 -0.574 0.56600
## colorBlue -8.082e+02 7.954e+02 -1.016 0.30957
## colorBronze 4.144e+03 3.894e+03 1.064 0.28726
## colorBrown -1.986e+01 1.537e+03 -0.013 0.98969
## colorGold 1.137e+03 1.015e+03 1.120 0.26257
## colorGray -1.474e+03 7.861e+02 -1.876 0.06073 .
## colorGreen 8.609e+01 1.111e+03 0.078 0.93821
## colorPurple 5.767e+03 3.895e+03 1.480 0.13876
## colorRed -3.988e+01 9.610e+02 -0.042 0.96690
## colorSilver -1.095e+03 7.596e+02 -1.442 0.14939
## colorTurquoise -1.690e+03 4.892e+03 -0.346 0.72969
## colorunsp 1.841e+02 7.997e+02 0.230 0.81794
## colorWhite 1.079e+03 7.665e+02 1.408 0.15919
## colorYellow -6.485e+03 7.679e+03 -0.845 0.39836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10810 on 29421 degrees of freedom
## Multiple R-squared: 0.942, Adjusted R-squared: 0.9419
## F-statistic: 1.086e+04 on 44 and 29421 DF, p-value: < 2.2e-16
# 0.9419, not much improvement for adjusted R, much more variance though. drop.
# Displacement model plus state (one dummy per state level).
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement + state,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + state)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66378 -5064 -978 3355 272116
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.266e+06 9.802e+04 -84.327 < 2e-16 ***
## trim350 2.844e+04 1.360e+04 2.091 0.03658 *
## trim400 -7.096e+03 1.805e+04 -0.393 0.69414
## trim420 5.666e+04 1.355e+04 4.181 2.91e-05 ***
## trim430 3.259e+04 1.353e+04 2.408 0.01604 *
## trim450 -3.720e+04 1.627e+04 -2.287 0.02222 *
## trim500 3.529e+04 1.353e+04 2.608 0.00912 **
## trim55 AMG 3.395e+04 1.355e+04 2.506 0.01221 *
## trim550 -4.795e+03 1.087e+04 -0.441 0.65897
## trim600 5.016e+03 1.087e+04 0.461 0.64455
## trim63 AMG 4.626e+04 1.088e+04 4.251 2.14e-05 ***
## trim65 AMG 6.175e+03 1.092e+04 0.565 0.57184
## trimunsp 2.831e+04 1.090e+04 2.596 0.00944 **
## conditionNew 3.600e+04 2.388e+02 150.705 < 2e-16 ***
## conditionUsed -4.388e+03 2.315e+02 -18.953 < 2e-16 ***
## mileage -1.313e-01 2.933e-03 -44.772 < 2e-16 ***
## year 4.122e+03 4.839e+01 85.186 < 2e-16 ***
## fuelGasoline 7.349e+02 5.204e+03 0.141 0.88771
## fuelHybrid -9.052e+03 1.202e+04 -0.753 0.45142
## fuelunsp 1.352e+04 5.156e+03 2.622 0.00874 **
## displacement3.2 L 5.547e+04 1.355e+04 4.094 4.25e-05 ***
## displacement3.5 L 3.929e+04 4.964e+03 7.914 2.58e-15 ***
## displacement3.7 L -7.674e+03 5.348e+03 -1.435 0.15133
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.679e+04 8.177e+03 4.499 6.86e-06 ***
## displacement4.7 L 5.907e+04 8.186e+03 7.215 5.52e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.254e+04 8.178e+03 3.979 6.95e-05 ***
## displacement5.8 L 3.201e+04 8.278e+03 3.867 0.00011 ***
## displacement6.0 L 7.959e+04 8.210e+03 9.694 < 2e-16 ***
## displacement6.3 L -7.129e+03 8.208e+03 -0.869 0.38510
## displacement8.0 L 5.919e+04 1.357e+04 4.362 1.29e-05 ***
## displacementunsp 4.168e+04 8.147e+03 5.116 3.14e-07 ***
## stateAL 5.528e+03 7.665e+03 0.721 0.47080
## stateAR 6.300e+03 7.754e+03 0.813 0.41649
## stateAZ 6.433e+03 7.662e+03 0.840 0.40113
## stateCA 5.824e+03 7.644e+03 0.762 0.44610
## stateCO 6.300e+03 7.660e+03 0.822 0.41089
## stateCT 5.308e+03 7.664e+03 0.693 0.48856
## stateDC -8.039e+03 9.864e+03 -0.815 0.41509
## stateDE 6.210e+03 7.721e+03 0.804 0.42124
## stateFL 5.222e+03 7.644e+03 0.683 0.49458
## stateGA 4.687e+03 7.648e+03 0.613 0.53993
## stateHI 4.802e+03 7.714e+03 0.623 0.53359
## stateIA 7.375e+03 7.841e+03 0.941 0.34689
## stateID 8.983e+03 7.967e+03 1.128 0.25954
## stateIL 5.712e+03 7.648e+03 0.747 0.45518
## stateIN 4.874e+03 7.682e+03 0.635 0.52575
## stateKS 5.903e+03 7.732e+03 0.764 0.44516
## stateKY 7.915e+03 7.680e+03 1.031 0.30277
## stateLA 7.165e+03 7.692e+03 0.932 0.35160
## stateMA 5.588e+03 7.652e+03 0.730 0.46524
## stateMD 6.112e+03 7.654e+03 0.798 0.42459
## stateME 3.760e+03 7.954e+03 0.473 0.63647
## stateMI 5.180e+03 7.676e+03 0.675 0.49981
## stateMN 6.839e+03 7.675e+03 0.891 0.37287
## stateMO 6.993e+03 7.663e+03 0.913 0.36149
## stateMS 7.859e+03 7.701e+03 1.021 0.30749
## stateMT 8.189e+03 8.309e+03 0.986 0.32437
## stateNC 6.524e+03 7.651e+03 0.853 0.39387
## stateND 9.384e+03 9.043e+03 1.038 0.29940
## stateNE 5.873e+03 8.018e+03 0.732 0.46388
## stateNH 6.605e+03 7.695e+03 0.858 0.39068
## stateNJ 5.042e+03 7.646e+03 0.659 0.50959
## stateNM 6.362e+03 7.864e+03 0.809 0.41853
## stateNV 8.502e+03 7.662e+03 1.110 0.26720
## stateNY 4.410e+03 7.645e+03 0.577 0.56407
## stateOH 4.956e+03 7.654e+03 0.647 0.51733
## stateOK 6.212e+03 7.683e+03 0.809 0.41878
## stateON 5.803e+03 9.366e+03 0.620 0.53555
## stateOR 6.773e+03 7.690e+03 0.881 0.37848
## statePA 5.557e+03 7.652e+03 0.726 0.46767
## stateRI 4.101e+03 7.785e+03 0.527 0.59835
## stateSC 5.943e+03 7.672e+03 0.775 0.43855
## stateSD 2.341e+04 1.081e+04 2.166 0.03030 *
## stateTN 4.366e+03 7.660e+03 0.570 0.56872
## stateTX 6.447e+03 7.645e+03 0.843 0.39910
## stateunsp -2.494e+04 1.083e+04 -2.303 0.02128 *
## stateUT 7.820e+03 7.701e+03 1.015 0.30990
## stateVA 5.677e+03 7.650e+03 0.742 0.45803
## stateWA 6.872e+03 7.666e+03 0.896 0.37001
## stateWI 6.326e+03 7.693e+03 0.822 0.41086
## stateWV 6.624e+03 7.836e+03 0.845 0.39793
## stateWY 1.339e+03 1.081e+04 0.124 0.90136
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10800 on 29384 degrees of freedom
## Multiple R-squared: 0.9421, Adjusted R-squared: 0.942
## F-statistic: 5905 on 81 and 29384 DF, p-value: < 2.2e-16
# 0.942, not much improvement for adjusted R, much more variance though. drop.
# Displacement model plus region.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement + region,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + region)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67227 -5092 -969 3375 272322
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.235e+06 9.752e+04 -84.446 < 2e-16 ***
## trim350 2.863e+04 1.362e+04 2.103 0.035516 *
## trim400 -6.821e+03 1.805e+04 -0.378 0.705560
## trim420 5.658e+04 1.356e+04 4.172 3.03e-05 ***
## trim430 3.253e+04 1.354e+04 2.402 0.016312 *
## trim450 -2.677e+04 1.598e+04 -1.675 0.093933 .
## trim500 3.524e+04 1.354e+04 2.602 0.009264 **
## trim55 AMG 3.391e+04 1.356e+04 2.501 0.012380 *
## trim550 -4.533e+03 1.088e+04 -0.417 0.676804
## trim600 5.286e+03 1.088e+04 0.486 0.627174
## trim63 AMG 4.654e+04 1.089e+04 4.272 1.94e-05 ***
## trim65 AMG 6.315e+03 1.093e+04 0.578 0.563503
## trimunsp 2.824e+04 1.091e+04 2.587 0.009679 **
## conditionNew 3.602e+04 2.381e+02 151.312 < 2e-16 ***
## conditionUsed -4.433e+03 2.300e+02 -19.276 < 2e-16 ***
## mileage -1.311e-01 2.918e-03 -44.909 < 2e-16 ***
## year 4.110e+03 4.828e+01 85.113 < 2e-16 ***
## fuelGasoline 7.116e+02 5.203e+03 0.137 0.891226
## fuelHybrid -9.041e+03 1.203e+04 -0.752 0.452274
## fuelunsp 1.349e+04 5.155e+03 2.618 0.008860 **
## displacement3.2 L 5.539e+04 1.356e+04 4.085 4.42e-05 ***
## displacement3.5 L 3.892e+04 4.942e+03 7.877 3.48e-15 ***
## displacement3.7 L -7.904e+03 5.347e+03 -1.478 0.139394
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.653e+04 8.183e+03 4.464 8.09e-06 ***
## displacement4.7 L 5.887e+04 8.192e+03 7.186 6.84e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.226e+04 8.184e+03 3.942 8.10e-05 ***
## displacement5.8 L 3.157e+04 8.284e+03 3.811 0.000139 ***
## displacement6.0 L 7.943e+04 8.216e+03 9.668 < 2e-16 ***
## displacement6.3 L -7.392e+03 8.214e+03 -0.900 0.368175
## displacement8.0 L 5.900e+04 1.358e+04 4.345 1.40e-05 ***
## displacementunsp 4.145e+04 8.153e+03 5.084 3.71e-07 ***
## regionESC 3.532e+02 3.932e+02 0.898 0.369104
## regionMid -6.198e+02 2.618e+02 -2.367 0.017927 *
## regionMtn 1.692e+03 3.641e+02 4.646 3.40e-06 ***
## regionNew 9.890e+01 3.608e+02 0.274 0.783996
## regionPac 4.464e+02 2.604e+02 1.714 0.086549 .
## regionSoA -7.030e+00 2.491e+02 -0.028 0.977484
## regionunsp -9.875e+03 4.434e+03 -2.227 0.025962 *
## regionWNC 1.427e+03 4.520e+02 3.157 0.001593 **
## regionWSC 1.031e+03 2.972e+02 3.468 0.000524 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10810 on 29426 degrees of freedom
## Multiple R-squared: 0.9419, Adjusted R-squared: 0.9419
## F-statistic: 1.224e+04 on 39 and 29426 DF, p-value: < 2.2e-16
# 0.9419 drop.
# Displacement model plus soundSystem.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    soundSystem,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + soundSystem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65902 -5079 -910 3477 270568
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.205e+06 9.763e+04 -84.042 < 2e-16 ***
## trim350 2.810e+04 1.359e+04 2.068 0.038682 *
## trim400 -5.851e+03 1.802e+04 -0.325 0.745483
## trim420 5.677e+04 1.354e+04 4.193 2.76e-05 ***
## trim430 3.290e+04 1.352e+04 2.433 0.014972 *
## trim450 -3.490e+04 1.532e+04 -2.278 0.022734 *
## trim500 3.564e+04 1.352e+04 2.636 0.008383 **
## trim55 AMG 3.458e+04 1.354e+04 2.555 0.010621 *
## trim550 -4.119e+03 1.086e+04 -0.379 0.704425
## trim600 5.549e+03 1.086e+04 0.511 0.609537
## trim63 AMG 4.688e+04 1.088e+04 4.310 1.64e-05 ***
## trim65 AMG 6.815e+03 1.091e+04 0.624 0.532387
## trimunsp 2.861e+04 1.090e+04 2.626 0.008649 **
## conditionNew 3.547e+04 2.424e+02 146.299 < 2e-16 ***
## conditionUsed -4.250e+03 2.289e+02 -18.565 < 2e-16 ***
## mileage -1.331e-01 2.915e-03 -45.645 < 2e-16 ***
## year 4.098e+03 4.824e+01 84.944 < 2e-16 ***
## fuelGasoline -4.827e+02 5.195e+03 -0.093 0.925969
## fuelHybrid -1.027e+04 1.201e+04 -0.855 0.392673
## fuelunsp 1.223e+04 5.147e+03 2.375 0.017533 *
## displacement3.2 L 5.549e+04 1.354e+04 4.099 4.15e-05 ***
## displacement3.5 L 3.867e+04 4.933e+03 7.839 4.69e-15 ***
## displacement3.7 L -7.005e+03 5.338e+03 -1.312 0.189452
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.665e+04 8.169e+03 4.486 7.30e-06 ***
## displacement4.7 L 5.903e+04 8.178e+03 7.217 5.43e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.240e+04 8.170e+03 3.966 7.32e-05 ***
## displacement5.8 L 3.173e+04 8.269e+03 3.836 0.000125 ***
## displacement6.0 L 7.935e+04 8.202e+03 9.675 < 2e-16 ***
## displacement6.3 L -6.933e+03 8.200e+03 -0.845 0.397847
## displacement8.0 L 5.887e+04 1.355e+04 4.343 1.41e-05 ***
## displacementunsp 4.140e+04 8.140e+03 5.086 3.69e-07 ***
## soundSystemBang Olufsen -3.039e+03 7.683e+03 -0.396 0.692414
## soundSystemBose -7.535e+03 7.645e+03 -0.986 0.324333
## soundSystemBoston Acoustic -9.412e+03 1.323e+04 -0.712 0.476775
## soundSystemHarman Kardon -7.560e+03 7.639e+03 -0.990 0.322362
## soundSystemPremium -5.803e+03 7.638e+03 -0.760 0.447440
## soundSystemunsp -5.108e+03 7.638e+03 -0.669 0.503661
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10800 on 29429 degrees of freedom
## Multiple R-squared: 0.9421, Adjusted R-squared: 0.942
## F-statistic: 1.33e+04 on 36 and 29429 DF, p-value: < 2.2e-16
# 0.942 drop
# Displacement model plus wheelType.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    wheelType,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + wheelType)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66389 -5098 -926 3419 271832
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.203e+06 9.733e+04 -84.281 < 2e-16 ***
## trim350 2.706e+04 1.360e+04 1.990 0.046609 *
## trim400 -8.667e+03 1.803e+04 -0.481 0.630825
## trim420 5.529e+04 1.355e+04 4.081 4.49e-05 ***
## trim430 3.127e+04 1.353e+04 2.312 0.020797 *
## trim450 -3.730e+04 1.532e+04 -2.434 0.014935 *
## trim500 3.404e+04 1.353e+04 2.517 0.011848 *
## trim55 AMG 3.277e+04 1.354e+04 2.420 0.015524 *
## trim550 -5.258e+03 1.086e+04 -0.484 0.628350
## trim600 4.513e+03 1.087e+04 0.415 0.678024
## trim63 AMG 4.591e+04 1.088e+04 4.220 2.46e-05 ***
## trim65 AMG 5.680e+03 1.092e+04 0.520 0.602978
## trimunsp 2.752e+04 1.090e+04 2.525 0.011589 *
## conditionNew 3.585e+04 2.388e+02 150.135 < 2e-16 ***
## conditionUsed -4.458e+03 2.289e+02 -19.482 < 2e-16 ***
## mileage -1.316e-01 2.913e-03 -45.162 < 2e-16 ***
## year 4.094e+03 4.819e+01 84.962 < 2e-16 ***
## fuelGasoline 4.810e+02 5.196e+03 0.093 0.926255
## fuelHybrid -8.945e+03 1.201e+04 -0.745 0.456565
## fuelunsp 1.319e+04 5.149e+03 2.562 0.010403 *
## displacement3.2 L 5.411e+04 1.354e+04 3.995 6.47e-05 ***
## displacement3.5 L 3.925e+04 4.935e+03 7.953 1.89e-15 ***
## displacement3.7 L -7.576e+03 5.340e+03 -1.419 0.156009
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.590e+04 8.173e+03 4.393 1.12e-05 ***
## displacement4.7 L 5.839e+04 8.182e+03 7.137 9.76e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.169e+04 8.173e+03 3.877 0.000106 ***
## displacement5.8 L 3.124e+04 8.273e+03 3.776 0.000160 ***
## displacement6.0 L 7.892e+04 8.205e+03 9.618 < 2e-16 ***
## displacement6.3 L -7.949e+03 8.204e+03 -0.969 0.332569
## displacement8.0 L 5.815e+04 1.356e+04 4.289 1.80e-05 ***
## displacementunsp 4.068e+04 8.143e+03 4.996 5.88e-07 ***
## wheelTypeChrome 6.753e+02 1.214e+03 0.556 0.577943
## wheelTypePremium -4.057e+02 5.336e+02 -0.760 0.447033
## wheelTypeSteel 1.700e+04 1.548e+03 10.981 < 2e-16 ***
## wheelTypeunsp 8.083e+02 1.297e+02 6.234 4.61e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10800 on 29431 degrees of freedom
## Multiple R-squared: 0.942, Adjusted R-squared: 0.942
## F-statistic: 1.407e+04 on 34 and 29431 DF, p-value: < 2.2e-16
# 0.942 drop
# Displacement model plus wheelSize.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    wheelSize,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + wheelSize)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65260 -5144 -926 3447 271523
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.251e+06 9.793e+04 -84.247 < 2e-16 ***
## trim350 2.762e+04 1.359e+04 2.033 0.042090 *
## trim400 -8.261e+03 1.802e+04 -0.458 0.646598
## trim420 5.571e+04 1.353e+04 4.117 3.85e-05 ***
## trim430 3.174e+04 1.351e+04 2.349 0.018840 *
## trim450 -3.672e+04 1.531e+04 -2.398 0.016482 *
## trim500 3.454e+04 1.351e+04 2.556 0.010593 *
## trim55 AMG 3.320e+04 1.353e+04 2.454 0.014138 *
## trim550 -4.780e+03 1.085e+04 -0.440 0.659669
## trim600 5.230e+03 1.086e+04 0.482 0.630093
## trim63 AMG 4.542e+04 1.087e+04 4.178 2.94e-05 ***
## trim65 AMG 6.267e+03 1.091e+04 0.574 0.565674
## trimunsp 2.764e+04 1.089e+04 2.538 0.011167 *
## conditionNew 3.583e+04 2.390e+02 149.919 < 2e-16 ***
## conditionUsed -4.344e+03 2.286e+02 -19.004 < 2e-16 ***
## mileage -1.316e-01 2.912e-03 -45.200 < 2e-16 ***
## year 4.120e+03 4.853e+01 84.896 < 2e-16 ***
## fuelGasoline 8.225e+02 5.191e+03 0.158 0.874119
## fuelHybrid -9.191e+03 1.200e+04 -0.766 0.443887
## fuelunsp 1.339e+04 5.144e+03 2.603 0.009244 **
## displacement3.2 L 5.431e+04 1.353e+04 4.014 5.98e-05 ***
## displacement3.5 L 3.966e+04 4.931e+03 8.043 9.08e-16 ***
## displacement3.7 L -7.245e+03 5.336e+03 -1.358 0.174573
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.578e+04 8.165e+03 4.382 1.18e-05 ***
## displacement4.7 L 5.774e+04 8.175e+03 7.063 1.66e-12 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.150e+04 8.166e+03 3.858 0.000115 ***
## displacement5.8 L 3.122e+04 8.266e+03 3.777 0.000159 ***
## displacement6.0 L 7.801e+04 8.198e+03 9.516 < 2e-16 ***
## displacement6.3 L -7.406e+03 8.196e+03 -0.904 0.366218
## displacement8.0 L 5.853e+04 1.355e+04 4.320 1.56e-05 ***
## displacementunsp 4.107e+04 8.136e+03 5.048 4.50e-07 ***
## wheelSize17 -9.986e+03 1.382e+03 -7.226 5.10e-13 ***
## wheelSize18 -5.215e+03 1.097e+03 -4.754 2.00e-06 ***
## wheelSize19 -5.071e+03 1.110e+03 -4.570 4.91e-06 ***
## wheelSize20 1.377e+02 1.136e+03 0.121 0.903539
## wheelSize21 -2.997e+03 7.706e+03 -0.389 0.697344
## wheelSize22 -1.376e+02 2.212e+03 -0.062 0.950404
## wheelSizeunsp -4.579e+03 1.062e+03 -4.312 1.62e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10790 on 29428 degrees of freedom
## Multiple R-squared: 0.9422, Adjusted R-squared: 0.9421
## F-statistic: 1.296e+04 on 37 and 29428 DF, p-value: < 2.2e-16
# 0.9421 drop
# Displacement model plus featureCount.
# `data = car` avoids depending on the attach()ed search path.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    featureCount,
  data = car
))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + featureCount)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66008 -5090 -956 3396 271548
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.270e+06 9.793e+04 -84.451 < 2e-16 ***
## trim350 2.739e+04 1.363e+04 2.010 0.044442 *
## trim400 -8.559e+03 1.807e+04 -0.474 0.635753
## trim420 5.561e+04 1.357e+04 4.097 4.20e-05 ***
## trim430 3.158e+04 1.355e+04 2.330 0.019808 *
## trim450 -3.750e+04 1.536e+04 -2.442 0.014618 *
## trim500 3.433e+04 1.355e+04 2.533 0.011304 *
## trim55 AMG 3.307e+04 1.357e+04 2.437 0.014806 *
## trim550 -5.144e+03 1.088e+04 -0.473 0.636515
## trim600 4.739e+03 1.089e+04 0.435 0.663484
## trim63 AMG 4.601e+04 1.090e+04 4.220 2.45e-05 ***
## trim65 AMG 5.955e+03 1.094e+04 0.544 0.586244
## trimunsp 2.772e+04 1.092e+04 2.538 0.011159 *
## conditionNew 3.577e+04 2.391e+02 149.584 < 2e-16 ***
## conditionUsed -4.476e+03 2.296e+02 -19.495 < 2e-16 ***
## mileage -1.317e-01 2.919e-03 -45.124 < 2e-16 ***
## year 4.128e+03 4.850e+01 85.115 < 2e-16 ***
## fuelGasoline 5.903e+02 5.207e+03 0.113 0.909728
## fuelHybrid -8.956e+03 1.204e+04 -0.744 0.456910
## fuelunsp 1.318e+04 5.159e+03 2.555 0.010631 *
## displacement3.2 L 5.441e+04 1.357e+04 4.010 6.08e-05 ***
## displacement3.5 L 3.948e+04 4.945e+03 7.984 1.46e-15 ***
## displacement3.7 L -7.623e+03 5.351e+03 -1.425 0.154290
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.609e+04 8.189e+03 4.408 1.05e-05 ***
## displacement4.7 L 5.850e+04 8.198e+03 7.135 9.88e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.184e+04 8.190e+03 3.887 0.000102 ***
## displacement5.8 L 3.133e+04 8.289e+03 3.779 0.000158 ***
## displacement6.0 L 7.893e+04 8.221e+03 9.601 < 2e-16 ***
## displacement6.3 L -7.779e+03 8.220e+03 -0.946 0.343996
## displacement8.0 L 5.810e+04 1.359e+04 4.277 1.90e-05 ***
## displacementunsp 4.070e+04 8.160e+03 4.988 6.12e-07 ***
## featureCount -1.323e+01 2.227e+00 -5.942 2.84e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10820 on 29434 degrees of freedom
## Multiple R-squared: 0.9418, Adjusted R-squared: 0.9417
## F-statistic: 1.537e+04 on 31 and 29434 DF, p-value: < 2.2e-16
# Adjusted R-squared of the preceding fit was 0.9417 (original note: "drop").
# Candidate the author settles on ("best bet"): the same formula without
# featureCount.
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement)
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65934 -5093 -968 3408 271481
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.215e+06 9.754e+04 -84.220 < 2e-16 ***
## trim350 2.815e+04 1.363e+04 2.065 0.038964 *
## trim400 -7.461e+03 1.808e+04 -0.413 0.679841
## trim420 5.610e+04 1.358e+04 4.131 3.62e-05 ***
## trim430 3.206e+04 1.356e+04 2.364 0.018065 *
## trim450 -3.708e+04 1.536e+04 -2.413 0.015813 *
## trim500 3.482e+04 1.356e+04 2.568 0.010242 *
## trim55 AMG 3.353e+04 1.358e+04 2.470 0.013514 *
## trim550 -4.726e+03 1.089e+04 -0.434 0.664343
## trim600 5.153e+03 1.090e+04 0.473 0.636298
## trim63 AMG 4.646e+04 1.091e+04 4.259 2.06e-05 ***
## trim65 AMG 6.386e+03 1.095e+04 0.583 0.559681
## trimunsp 2.808e+04 1.093e+04 2.569 0.010196 *
## conditionNew 3.596e+04 2.372e+02 151.602 < 2e-16 ***
## conditionUsed -4.382e+03 2.292e+02 -19.120 < 2e-16 ***
## mileage -1.319e-01 2.920e-03 -45.174 < 2e-16 ***
## year 4.100e+03 4.829e+01 84.893 < 2e-16 ***
## fuelGasoline 8.141e+02 5.209e+03 0.156 0.875823
## fuelHybrid -9.159e+03 1.205e+04 -0.760 0.447068
## fuelunsp 1.350e+04 5.162e+03 2.615 0.008930 **
## displacement3.2 L 5.491e+04 1.358e+04 4.044 5.26e-05 ***
## displacement3.5 L 3.932e+04 4.948e+03 7.947 1.97e-15 ***
## displacement3.7 L -7.844e+03 5.354e+03 -1.465 0.142913
## displacement4.2 L NA NA NA NA
## displacement4.3 L NA NA NA NA
## displacement4.6 L 3.621e+04 8.194e+03 4.419 9.93e-06 ***
## displacement4.7 L 5.858e+04 8.203e+03 7.142 9.42e-13 ***
## displacement5.0 L NA NA NA NA
## displacement5.4 L NA NA NA NA
## displacement5.5 L 3.193e+04 8.194e+03 3.897 9.76e-05 ***
## displacement5.8 L 3.133e+04 8.294e+03 3.777 0.000159 ***
## displacement6.0 L 7.898e+04 8.226e+03 9.602 < 2e-16 ***
## displacement6.3 L -7.724e+03 8.225e+03 -0.939 0.347655
## displacement8.0 L 5.878e+04 1.359e+04 4.324 1.54e-05 ***
## displacementunsp 4.121e+04 8.164e+03 5.048 4.50e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10830 on 29435 degrees of freedom
## Multiple R-squared: 0.9417, Adjusted R-squared: 0.9417
## F-statistic: 1.586e+04 on 30 and 29435 DF, p-value: < 2.2e-16
# Final linear model, fitted on every row. The formula has no data= argument, so
# it relies on the columns made visible by the earlier attach(car).
best.mlr <- lm(price ~ trim+condition+mileage+year+fuel+displacement)
# - mileage:.9269 -year: 0.9133 -fuel: .9292 -displacement: .9292
############ predicted values are stored in mlr.yhat #############
# Predictions for the out-of-sample predictor file. Its rows are NOT the
# held-out rows of `car`, so mlr.yhat must never be scored against y.test: the
# original run did exactly that, R recycled the shorter vector, and the RMSE it
# printed (62748.92) was meaningless.
test_x <- read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
test_x <- test_x[,c(-1, -3, -8, -13)]
mlr.yhat <- predict(best.mlr, newdata=test_x)
## Warning in predict.lm(best.mlr, newdata = test_x): prediction from a rank-
## deficient fit may be misleading
head(mlr.yhat)
## 1 2 3 4 5 6
## 114326.42 33249.19 45226.87 110227.17 56987.16 114326.55

# Hold-out validation: fit on 90% of car, score on the remaining 10%.
set.seed(99) # make the split reproducible
train <- sample(seq_len(nrow(car)), floor(nrow(car)*9/10))
test <- (-train)
testset <- car[test,]
mlr.train <- lm(price ~ trim+condition+mileage+year+fuel+displacement, data=car[train,])
# predict.lm() stops on factor levels that were absent when the model was
# fitted, so only held-out rows whose levels all occur in the training fit are
# scored.
seen <- rep(TRUE, nrow(testset))
for (v in names(mlr.train$xlevels)) {
  seen <- seen & (testset[[v]] %in% mlr.train$xlevels[[v]])
}
testset <- testset[seen,]
y.test <- testset$price
mlr.valid <- predict(mlr.train, newdata=testset)
# Guard against the silent recycling that invalidated the earlier RMSE.
stopifnot(length(mlr.valid) == length(y.test))
MSE <- mean((mlr.valid - y.test)^2)
RMSE <- sqrt(MSE) # re-run before comparing with the random forest: the earlier comparison used the invalid 62748.92
RMSE
# Packages for the regression-tree section.
# NOTE(review): no function from the tree package appears to be called in the
# tree section that follows -- confirm whether this load is still needed.
library(tree)
# rpart() and rpart.control() are used below to grow the tree.
library(rpart)
# car is attached again here; the messages that follow show its columns masking
# copies already on the search path from earlier attach(car) calls.
attach(car)
## The following objects are masked from car (pos = 6):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 7):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
#--------------------------------------------------
# reduce df to just the two columns used here: year (predictor) and price
# (response); selected by name so the code does not depend on column positions
bdf <- car[, c("year", "price")]
#--------------------------------------------------
# fit a deliberately large tree (small cp, small minsplit) to prune back later
big.tree <- rpart(price ~ year, method = "anova", data = bdf,
                  control = rpart.control(minsplit = 5, cp = .0005))
# number of distinct leaves that the training rows fall into
nbig <- length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
## size of big tree: 10
#--------------------------------------------------
# look at cross-validation
par(mfrow = c(1, 1))
plotcp(big.tree)
#--------------------------------------------------
# show fit from some trees
oo <- order(bdf$year) # row order by year, so the fitted step function can be drawn as a line
# cp value with the smallest cross-validated error in the cp table
bestcp <- big.tree$cptable[which.min(big.tree$cptable[, "xerror"]), "CP"]
# because relative error doesn't change much when cp >= 0.0025, let's just set bestcp as 0.0025.
bestcp <- 0.0025
cat('bestcp: ',bestcp,'\n')
## bestcp: 0.0025
cpvec <- c(.0157, bestcp, .004)
# one row of panels per cp value: scatter with fitted line, then the pruned tree
par(mfrow = c(length(cpvec), 2))
for (i in seq_along(cpvec)) {
  plot(bdf, pch = 16, col = 'blue', cex = .5)
  ptree <- prune(big.tree, cp = cpvec[i])
  pfit <- predict(ptree)
  lines(bdf$year[oo], pfit[oo], col = 'red', lwd = 2)
  # signif() keeps three significant digits; round(., 3) turned 0.0157 into
  # 0.016 and could not display 0.0025 at all
  title(paste('alpha = ', signif(cpvec[i], 3)))
  plot(ptree, uniform = TRUE)
  text(ptree, digits = 3, cex = 1)
}
#--------------------------------------------------
#plot best tree
#reference: http://rankexploits.com/musings/2011/margin-control-in-r-oma-and-mar-and-the-vanishing-axis-label/
# http://research.stowers-institute.org/mcm/efg/R/Graphics/Basics/mar-oma/index.htm
# The edges of the tree were cut off because the node labels are drawn past the
# plotted branches while mar=c(1,1,1,1) leaves almost no room around them.
# plot.rpart() has a `margin` argument (extra fraction of white space around the
# borders of the tree) intended for labels that get clipped.
# par() returns the previous settings; keep them so the tiny margins do not
# leak into every plot drawn afterwards.
old.par <- par(mfrow=c(1,1), mar=c(1,1,1,1), oma=c(0,0,0,0))
#par(mar=c(5,3,2,2)+0.1)
best.tree <- prune(big.tree,cp=bestcp) # best tree is here!
plot(best.tree,uniform=TRUE,margin=0.1) # margin leaves room for the leaf labels
box('figure',lty='solid', col='green') # outline of the figure region, to see what is clipped
text(best.tree,digits=2,use.n=TRUE,cex=1) # get rid of fancy parameters!
par(old.par) # restore the graphics settings that were active before this plot
# Start the random-forest section from an empty workspace.
rm(list = ls())
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
# Reload the raw data and discard columns 1, 3, 8 and 13 by position.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE
)[, -c(1, 3, 8, 13)]
# Converting displacement to a number didn't really help much; the attempt is
# kept below for reference.
#dis = sapply(displacement, function(x) as.numeric(substr(x, 1, 3)))
#cars$displacement = dis
attach(car)
## The following objects are masked from car (pos = 4):
##
## condition, displacement, featureCount, fuel, isOneOwner,
## mileage, price, region, state, trim, wheelSize, wheelType,
## year
##
## The following objects are masked from car (pos = 8):
##
## condition, displacement, featureCount, fuel, isOneOwner,
## mileage, price, region, state, trim, wheelSize, wheelType,
## year
##
## The following objects are masked from car (pos = 9):
##
## condition, displacement, featureCount, fuel, isOneOwner,
## mileage, price, region, state, trim, wheelSize, wheelType,
## year
#--------------------------------------------------
# Fit price ~ mileage forests of increasing size and keep what predict()
# returns for each forest, one column per forest.
# note: to get this to work I had to use maxnodes parameter of randomForest!!!
set.seed(99)
ntreev <- c(10, 500, 1000) # 1000 rather than 5000 trees, to save computational time
nset <- length(ntreev)
n <- nrow(car)
fmat <- matrix(0, nrow = n, ncol = nset) # predicted values, yhat, are stored in this matrix
for (i in seq_along(ntreev)) {
  cat('doing car rf: ',i,'\n')
  rffit <- randomForest(price ~ mileage, data = car,
                        ntree = ntreev[i], maxnodes = 15)
  fmat[, i] <- predict(rffit)
}
## doing car rf: 1
## doing car rf: 2
## doing car rf: 3
# try fitting using 500 trees, this time mileage + year
#rf500 = randomForest(price~mileage+year,data=car,ntree=500,maxnodes=15)
#head(rf500)
# Validation error: a random 90% of the rows train the forest, the remaining
# 10% are held out for scoring.
train <- sample(nrow(car), nrow(car) * 9 / 10)
test <- -train
testset <- car[test, ]
# Forest fitted on the training rows with trim, condition, mileage, year, fuel
# and displacement as predictors.
rftraining <- randomForest(
  price ~ trim + condition + mileage + year + fuel + displacement,
  data = car[train, ],
  ntree = 500,
  maxnodes = 15
)
# RMSE on the held-out rows.
y.pred <- predict(rftraining, newdata = testset)
y.test <- testset$price
MSE <- mean((y.pred - y.test)^2)
RMSE <- sqrt(MSE)
RMSE
## [1] 9120.888
# 17084.87 with only mileage
# 13826.19 mileage + year
# 10603.14 mileage + year + trim
# 9243 trim+condition+mileage+year+fuel+displacement: best so far. Interestingly, these are the same predictors as in the model chosen by multivariate linear regression!
#
# Predict prices for the out-of-sample predictor file that the professor gave.
test_x <- read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
test_x <- test_x[,c(-1, -3, -8, -13)]
### Factor variables in training and test data must have the same levels,
### otherwise predict() would throw an error. `levels(x) = ...` only RENAMES the
### existing levels by position, which silently relabels the data whenever the
### two level sets differ, so each column is re-encoded by value instead.
for (v in c("trim", "condition", "fuel", "displacement")) {
  new.values <- as.character(test_x[[v]])
  test_x[[v]] <- factor(new.values, levels = levels(car[[v]]))
  # values that do not occur in car cannot be encoded and turn into NA
  unseen <- is.na(test_x[[v]]) & !is.na(new.values)
  if (any(unseen)) {
    warning(sum(unseen), " value(s) of '", v,
            "' in Cars_X_out.csv do not occur in car and became NA",
            call. = FALSE)
  }
}
# Refit on the training rows (same formula and settings as the validation fit).
rftraining <- randomForest(price~trim+condition+mileage+year+fuel+displacement, data=car[train,],ntree=500,maxnodes=15)
rftest <- predict(rftraining, newdata=test_x)
head(rftest,10)
## 1 2 3 4 5 6 7
## 106163.86 30025.39 46069.17 104940.75 51335.02 106154.24 56180.88
## 8 9 10
## 28154.26 106199.93 106204.11
#--------------------------------------------------
# Error curve of the most recently fitted forest (rffit from the last loop
# iteration, i.e. the largest ntree).
par(mfrow = c(1, 1))
plot(rffit)
#--------------------------------------------------
# One panel per forest size: raw data with the stored fit drawn on top.
par(mfrow = c(1, 3))
oo <- order(car$mileage)
mileage.sorted <- car$mileage[oo]
for (i in seq_len(nset)) {
  with(car, plot(mileage, price, xlab = 'mileage', ylab = 'price'))
  lines(mileage.sorted, fmat[oo, i], col = i, lwd = 3)
  title(main = paste('bagging ntrees = ',ntreev[i]))
}
#--------------------------------------------------
# Clear the workspace before the next section.
rm(list = ls())