This dataset describes cars and their prices. OBJECTIVE: To build a predictive model that prices a car based on its features.

# Load the car listings; any "?" entry in the file is read as NA.
# stringsAsFactors = TRUE is spelled out because the summaries and dummy
# coding below rely on text columns being factors, which is no longer the
# read.csv() default from R 4.0 onwards.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,
  na.strings = "?",
  stringsAsFactors = TRUE
)
# Number of rows (listings) and columns (variables).
dim(car)
## [1] 29466    17

We looked at all the variables in the dataset to identify the significant ones and to set aside those that are not helpful in designing the model.

# Put the columns of `car` on the search path so they can be referenced by
# bare name. NOTE(review): attach() works on a copy, so later changes to
# `car` are not reflected in the attached columns.
attach(car)
# Per-column overview: quartiles for numeric columns, level counts for factors.
summary(car)
##        X              trim         subTrim      condition    isOneOwner
##  Min.   :    2   550    :21836   Hybrid:  190   CPO : 3586   f:25340   
##  1st Qu.:13231   430    : 2071   unsp  :29276   New :10317   t: 4126   
##  Median :26254   500    : 2002                  Used:15563             
##  Mean   :26269   63 AMG : 1413                                         
##  3rd Qu.:39293   600    :  527                                         
##  Max.   :52572   350    :  416                                         
##                  (Other): 1201                                         
##     mileage            year          color        displacement  
##  Min.   :     1   Min.   :1988   Black  :12838   4.6 L  :13599  
##  1st Qu.:    14   1st Qu.:2007   Silver : 6095   5.5 L  : 9154  
##  Median : 26120   Median :2012   White  : 4418   4.3 L  : 2071  
##  Mean   : 40387   Mean   :2010   Gray   : 2007   5.0 L  : 2002  
##  3rd Qu.: 68234   3rd Qu.:2015   Blue   : 1599   6.0 L  :  403  
##  Max.   :488525   Max.   :2015   unsp   : 1467   6.3 L  :  391  
##                                  (Other): 1042   (Other): 1846  
##        fuel           state           region              soundSystem   
##  Diesel  :  312   CA     : 5262   SoA    :7805   Alpine         :    2  
##  Gasoline:28628   FL     : 3559   Pac    :5844   Bang Olufsen   :  177  
##  Hybrid  :  189   NY     : 2754   Mid    :5824   Bose           :  943  
##  unsp    :  337   TX     : 2458   WSC    :2865   Boston Acoustic:    1  
##                   NJ     : 2266   ENC    :2496   Harman Kardon  : 4120  
##                   GA     : 1408   New    :1421   Premium        : 9694  
##                   (Other):11759   (Other):3211   unsp           :14529  
##    wheelType       wheelSize      featureCount        price       
##  Alloy  :14565   unsp   :25293   Min.   :  0.00   Min.   :   599  
##  Chrome :   80   18     : 1774   1st Qu.: 18.00   1st Qu.: 28995  
##  Premium:  424   19     : 1297   Median : 53.00   Median : 56991  
##  Steel  :   49   20     :  813   Mean   : 46.48   Mean   : 67001  
##  unsp   :14348   17     :  149   3rd Qu.: 70.00   3rd Qu.:108815  
##                  16     :  107   Max.   :132.00   Max.   :299000  
##                  (Other):   33

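Looking at the variables and their nature, we can now separate the CATEGORICAL variables from the QUANTITATIVE ones. VARIABLES: quantitative: X, mileage, year, featureCount, price; categorical: trim, subTrim, condition, isOneOwner, color, displacement, fuel, state, region, soundSystem, wheelType, wheelSize.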

#Checking whether the maximum mileage, 488525, is really an outlier. An outlier should only be dropped when it is obviously invalid or when it creates a spurious relationship.

# Show every listing whose mileage exceeds 400,000.
car[car[["mileage"]] > 400000, ]
##           X trim subTrim condition isOneOwner mileage year color
## 9011  16086  550    unsp      Used          f  488525 2012 White
## 12567 22462  500    unsp      Used          f  407725 2000  Gold
## 13073 23304  550    unsp       CPO          t  411103 2012 Black
## 26285 46886  500    unsp      Used          f  467834 2006 Black
##       displacement     fuel state region soundSystem wheelType wheelSize
## 9011         4.6 L Gasoline    NJ    Mid        unsp     Alloy      unsp
## 12567        5.0 L Gasoline    CA    Pac        unsp     Alloy      unsp
## 13073        4.6 L Gasoline    FL    SoA     Premium     Alloy      unsp
## 26285        5.0 L Gasoline    TX    WSC        unsp      unsp      unsp
##       featureCount price
## 9011            54 46995
## 12567           20  8995
## 13073           64 59892
## 26285           54 21995

It turns out that this one outlier changes the estimates only slightly, so we did not drop it.

# Re-read the listings for the modelling section.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, which
# the dummy-coded regressions below depend on.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# Remove any previously attached copy first; attaching a second copy on top
# of the first leaves both on the search path and masks every column.
while ("car" %in% search()) {
  detach("car", character.only = TRUE)
}
attach(car)
## The following objects are masked from car (pos = 3):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
library(leaps)
# A full model would have 129 variables (state alone contributes ~50 levels),
# so candidate predictors are screened by hand first.
###############  initial manual subset selection  ###############
# Start with a linear regression on the predictors that intuitively matter
# most. `data = car` makes the formula take its columns from the data frame
# itself instead of from whatever attach() left on the search path.
summary(lm(price ~ trim + condition + mileage + year, data = car))
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69342  -6041  -1072   3762 272938 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.433e+06  9.136e+04 -103.246  < 2e-16 ***
## trim350       -3.830e+04  1.148e+03  -33.346  < 2e-16 ***
## trim400       -4.202e+04  1.323e+03  -31.760  < 2e-16 ***
## trim420        1.364e+03  1.261e+03    1.082 0.279449    
## trim430       -2.678e+04  9.226e+02  -29.028  < 2e-16 ***
## trim450       -4.131e+04  1.213e+04   -3.405 0.000662 ***
## trim500       -2.367e+04  9.170e+02  -25.807  < 2e-16 ***
## trim55 AMG    -2.560e+04  1.159e+03  -22.092  < 2e-16 ***
## trim550       -3.580e+04  9.900e+02  -36.164  < 2e-16 ***
## trim600       -1.524e+04  1.059e+03  -14.387  < 2e-16 ***
## trim63 AMG     1.227e+03  1.040e+03    1.180 0.237837    
## trim65 AMG     2.031e+04  1.196e+03   16.976  < 2e-16 ***
## trimunsp       1.816e+04  1.644e+03   11.045  < 2e-16 ***
## conditionNew   3.741e+04  2.612e+02  143.232  < 2e-16 ***
## conditionUsed -5.484e+03  2.523e+02  -21.738  < 2e-16 ***
## mileage       -1.351e-01  3.236e-03  -41.733  < 2e-16 ***
## year           4.738e+03  4.561e+01  103.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12090 on 29449 degrees of freedom
## Multiple R-squared:  0.9273, Adjusted R-squared:  0.9273 
## F-statistic: 2.349e+04 on 16 and 29449 DF,  p-value: < 2.2e-16
###############  stepwise selection, trial and error  ###############
# The trim + condition + mileage + year model above reaches an adjusted
# R-squared of 0.9273, which is already good.
# Refit without trim to see how much trim contributes. `data = car` avoids
# relying on attach() to resolve the column names.
summary(lm(price ~ condition + mileage + year, data = car))
## 
## Call:
## lm(formula = price ~ condition + mileage + year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57657  -8171  -3915   2095 266422 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.785e+06  8.817e+04  -76.96   <2e-16 ***
## conditionNew   4.235e+04  3.214e+02  131.77   <2e-16 ***
## conditionUsed -5.938e+03  3.287e+02  -18.07   <2e-16 ***
## mileage       -1.646e-01  4.222e-03  -38.99   <2e-16 ***
## year           3.406e+03  4.379e+01   77.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15970 on 29461 degrees of freedom
## Multiple R-squared:  0.8732, Adjusted R-squared:  0.8732 
## F-statistic: 5.073e+04 on 4 and 29461 DF,  p-value: < 2.2e-16
# Without trim the adjusted R-squared falls to 0.8732, so trim stays in the
# model even though it adds many dummy variables.
# Next candidate: fuel. `data = car` avoids relying on attach().
summary(lm(price ~ trim + condition + mileage + year + fuel, data = car))
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75103  -5823   -972   3749 272745 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.321e+06  9.083e+04 -102.621  < 2e-16 ***
## trim350       -3.943e+04  1.444e+03  -27.314  < 2e-16 ***
## trim400       -3.210e+04  1.197e+04   -2.681 0.007339 ** 
## trim420        1.447e+03  1.245e+03    1.162 0.245110    
## trim430       -2.638e+04  9.109e+02  -28.957  < 2e-16 ***
## trim450       -4.062e+04  1.197e+04   -3.393 0.000692 ***
## trim500       -2.330e+04  9.053e+02  -25.733  < 2e-16 ***
## trim55 AMG    -2.519e+04  1.144e+03  -22.020  < 2e-16 ***
## trim550       -3.503e+04  9.796e+02  -35.757  < 2e-16 ***
## trim600       -1.554e+04  1.046e+03  -14.851  < 2e-16 ***
## trim63 AMG     1.592e+03  1.028e+03    1.548 0.121728    
## trim65 AMG     2.024e+04  1.182e+03   17.126  < 2e-16 ***
## trimunsp       5.217e+03  1.687e+03    3.093 0.001983 ** 
## conditionNew   3.708e+04  2.594e+02  142.920  < 2e-16 ***
## conditionUsed -5.483e+03  2.490e+02  -22.019  < 2e-16 ***
## mileage       -1.362e-01  3.194e-03  -42.643  < 2e-16 ***
## year           4.683e+03  4.527e+01  103.458  < 2e-16 ***
## fuelGasoline  -2.553e+03  1.307e+03   -1.953 0.050847 .  
## fuelHybrid    -1.179e+04  1.204e+04   -0.980 0.327171    
## fuelunsp       1.742e+04  1.476e+03   11.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11930 on 29446 degrees of freedom
## Multiple R-squared:  0.9293, Adjusted R-squared:  0.9292 
## F-statistic: 2.036e+04 on 19 and 29446 DF,  p-value: < 2.2e-16
# With fuel added the adjusted R-squared is 0.9292.
# Next candidate: featureCount. `data = car` avoids relying on attach().
summary(
  lm(price ~ trim + condition + mileage + year + fuel + featureCount,
     data = car)
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     featureCount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75454  -5839   -959   3766 272804 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.367e+06  9.128e+04 -102.619  < 2e-16 ***
## trim350       -3.945e+04  1.443e+03  -27.334  < 2e-16 ***
## trim400       -3.246e+04  1.197e+04   -2.713 0.006676 ** 
## trim420        1.443e+03  1.244e+03    1.160 0.246042    
## trim430       -2.635e+04  9.105e+02  -28.939  < 2e-16 ***
## trim450       -4.060e+04  1.197e+04   -3.392 0.000694 ***
## trim500       -2.327e+04  9.050e+02  -25.718  < 2e-16 ***
## trim55 AMG    -2.514e+04  1.144e+03  -21.988  < 2e-16 ***
## trim550       -3.502e+04  9.792e+02  -35.761  < 2e-16 ***
## trim600       -1.550e+04  1.046e+03  -14.823  < 2e-16 ***
## trim63 AMG     1.592e+03  1.028e+03    1.548 0.121578    
## trim65 AMG     2.028e+04  1.182e+03   17.168  < 2e-16 ***
## trimunsp       5.225e+03  1.686e+03    3.099 0.001945 ** 
## conditionNew   3.691e+04  2.616e+02  141.093  < 2e-16 ***
## conditionUsed -5.566e+03  2.495e+02  -22.309  < 2e-16 ***
## mileage       -1.360e-01  3.193e-03  -42.590  < 2e-16 ***
## year           4.707e+03  4.550e+01  103.444  < 2e-16 ***
## fuelGasoline  -2.571e+03  1.307e+03   -1.967 0.049159 *  
## fuelHybrid    -1.143e+04  1.203e+04   -0.950 0.342302    
## fuelunsp       1.716e+04  1.477e+03   11.624  < 2e-16 ***
## featureCount  -1.203e+01  2.449e+00   -4.911 9.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11930 on 29445 degrees of freedom
## Multiple R-squared:  0.9293, Adjusted R-squared:  0.9293 
## F-statistic: 1.936e+04 on 20 and 29445 DF,  p-value: < 2.2e-16
# featureCount lifts the adjusted R-squared only to 0.9293: too little
# improvement, so it is left out from here on.
# Next candidate: displacement. `data = car` avoids relying on attach().
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement,
     data = car)
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65934  -5093   -968   3408 271481 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.215e+06  9.754e+04 -84.220  < 2e-16 ***
## trim350            2.815e+04  1.363e+04   2.065 0.038964 *  
## trim400           -7.461e+03  1.808e+04  -0.413 0.679841    
## trim420            5.610e+04  1.358e+04   4.131 3.62e-05 ***
## trim430            3.206e+04  1.356e+04   2.364 0.018065 *  
## trim450           -3.708e+04  1.536e+04  -2.413 0.015813 *  
## trim500            3.482e+04  1.356e+04   2.568 0.010242 *  
## trim55 AMG         3.353e+04  1.358e+04   2.470 0.013514 *  
## trim550           -4.726e+03  1.089e+04  -0.434 0.664343    
## trim600            5.153e+03  1.090e+04   0.473 0.636298    
## trim63 AMG         4.646e+04  1.091e+04   4.259 2.06e-05 ***
## trim65 AMG         6.386e+03  1.095e+04   0.583 0.559681    
## trimunsp           2.808e+04  1.093e+04   2.569 0.010196 *  
## conditionNew       3.596e+04  2.372e+02 151.602  < 2e-16 ***
## conditionUsed     -4.382e+03  2.292e+02 -19.120  < 2e-16 ***
## mileage           -1.319e-01  2.920e-03 -45.174  < 2e-16 ***
## year               4.100e+03  4.829e+01  84.893  < 2e-16 ***
## fuelGasoline       8.141e+02  5.209e+03   0.156 0.875823    
## fuelHybrid        -9.159e+03  1.205e+04  -0.760 0.447068    
## fuelunsp           1.350e+04  5.162e+03   2.615 0.008930 ** 
## displacement3.2 L  5.491e+04  1.358e+04   4.044 5.26e-05 ***
## displacement3.5 L  3.932e+04  4.948e+03   7.947 1.97e-15 ***
## displacement3.7 L -7.844e+03  5.354e+03  -1.465 0.142913    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.621e+04  8.194e+03   4.419 9.93e-06 ***
## displacement4.7 L  5.858e+04  8.203e+03   7.142 9.42e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.193e+04  8.194e+03   3.897 9.76e-05 ***
## displacement5.8 L  3.133e+04  8.294e+03   3.777 0.000159 ***
## displacement6.0 L  7.898e+04  8.226e+03   9.602  < 2e-16 ***
## displacement6.3 L -7.724e+03  8.225e+03  -0.939 0.347655    
## displacement8.0 L  5.878e+04  1.359e+04   4.324 1.54e-05 ***
## displacementunsp   4.121e+04  8.164e+03   5.048 4.50e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10830 on 29435 degrees of freedom
## Multiple R-squared:  0.9417, Adjusted R-squared:  0.9417 
## F-statistic: 1.586e+04 on 30 and 29435 DF,  p-value: < 2.2e-16
# With displacement (as a factor) the adjusted R-squared rises to 0.9417;
# four displacement coefficients are NA because of singularities.
# NOTE(review): the original note says treating displacement as a continuous
# variable gives 0.9311 -- that fit is not shown here, confirm before relying
# on it.
# Next candidate: color. `data = car` avoids relying on attach().
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement + color,
     data = car)
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + color)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65951  -5103   -962   3369 272133 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.216e+06  9.748e+04 -84.279  < 2e-16 ***
## trim350            2.837e+04  1.361e+04   2.085  0.03705 *  
## trim400           -8.156e+03  1.804e+04  -0.452  0.65123    
## trim420            5.613e+04  1.355e+04   4.141 3.47e-05 ***
## trim430            3.256e+04  1.353e+04   2.406  0.01613 *  
## trim450           -3.554e+04  1.534e+04  -2.318  0.02047 *  
## trim500            3.527e+04  1.353e+04   2.606  0.00916 ** 
## trim55 AMG         3.418e+04  1.355e+04   2.523  0.01164 *  
## trim550           -4.386e+03  1.087e+04  -0.403  0.68660    
## trim600            5.573e+03  1.088e+04   0.512  0.60840    
## trim63 AMG         4.664e+04  1.089e+04   4.284 1.84e-05 ***
## trim65 AMG         6.966e+03  1.093e+04   0.638  0.52376    
## trimunsp           2.847e+04  1.091e+04   2.610  0.00906 ** 
## conditionNew       3.596e+04  2.381e+02 151.007  < 2e-16 ***
## conditionUsed     -4.368e+03  2.289e+02 -19.081  < 2e-16 ***
## mileage           -1.314e-01  2.917e-03 -45.065  < 2e-16 ***
## year               4.100e+03  4.827e+01  84.940  < 2e-16 ***
## fuelGasoline       6.670e+02  5.198e+03   0.128  0.89791    
## fuelHybrid        -7.896e+03  1.202e+04  -0.657  0.51126    
## fuelunsp           1.341e+04  5.151e+03   2.604  0.00921 ** 
## displacement3.2 L  5.501e+04  1.355e+04   4.060 4.91e-05 ***
## displacement3.5 L  3.895e+04  4.939e+03   7.887 3.20e-15 ***
## displacement3.7 L -7.493e+03  5.343e+03  -1.403  0.16076    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.621e+04  8.176e+03   4.429 9.49e-06 ***
## displacement4.7 L  5.851e+04  8.185e+03   7.148 9.01e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.202e+04  8.177e+03   3.917 9.00e-05 ***
## displacement5.8 L  3.152e+04  8.276e+03   3.808  0.00014 ***
## displacement6.0 L  7.886e+04  8.208e+03   9.607  < 2e-16 ***
## displacement6.3 L -7.486e+03  8.207e+03  -0.912  0.36170    
## displacement8.0 L  5.884e+04  1.356e+04   4.338 1.44e-05 ***
## displacementunsp   4.093e+04  8.146e+03   5.024 5.09e-07 ***
## colorBlack        -4.331e+02  7.547e+02  -0.574  0.56600    
## colorBlue         -8.082e+02  7.954e+02  -1.016  0.30957    
## colorBronze        4.144e+03  3.894e+03   1.064  0.28726    
## colorBrown        -1.986e+01  1.537e+03  -0.013  0.98969    
## colorGold          1.137e+03  1.015e+03   1.120  0.26257    
## colorGray         -1.474e+03  7.861e+02  -1.876  0.06073 .  
## colorGreen         8.609e+01  1.111e+03   0.078  0.93821    
## colorPurple        5.767e+03  3.895e+03   1.480  0.13876    
## colorRed          -3.988e+01  9.610e+02  -0.042  0.96690    
## colorSilver       -1.095e+03  7.596e+02  -1.442  0.14939    
## colorTurquoise    -1.690e+03  4.892e+03  -0.346  0.72969    
## colorunsp          1.841e+02  7.997e+02   0.230  0.81794    
## colorWhite         1.079e+03  7.665e+02   1.408  0.15919    
## colorYellow       -6.485e+03  7.679e+03  -0.845  0.39836    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10810 on 29421 degrees of freedom
## Multiple R-squared:  0.942,  Adjusted R-squared:  0.9419 
## F-statistic: 1.086e+04 on 44 and 29421 DF,  p-value: < 2.2e-16
# Color: adjusted R-squared 0.9419, barely above 0.9417, at the price of 14
# extra dummy coefficients and more variance. Drop color.
# Next candidate: state. `data = car` avoids relying on attach().
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement + state,
     data = car)
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + state)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66378  -5064   -978   3355 272116 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.266e+06  9.802e+04 -84.327  < 2e-16 ***
## trim350            2.844e+04  1.360e+04   2.091  0.03658 *  
## trim400           -7.096e+03  1.805e+04  -0.393  0.69414    
## trim420            5.666e+04  1.355e+04   4.181 2.91e-05 ***
## trim430            3.259e+04  1.353e+04   2.408  0.01604 *  
## trim450           -3.720e+04  1.627e+04  -2.287  0.02222 *  
## trim500            3.529e+04  1.353e+04   2.608  0.00912 ** 
## trim55 AMG         3.395e+04  1.355e+04   2.506  0.01221 *  
## trim550           -4.795e+03  1.087e+04  -0.441  0.65897    
## trim600            5.016e+03  1.087e+04   0.461  0.64455    
## trim63 AMG         4.626e+04  1.088e+04   4.251 2.14e-05 ***
## trim65 AMG         6.175e+03  1.092e+04   0.565  0.57184    
## trimunsp           2.831e+04  1.090e+04   2.596  0.00944 ** 
## conditionNew       3.600e+04  2.388e+02 150.705  < 2e-16 ***
## conditionUsed     -4.388e+03  2.315e+02 -18.953  < 2e-16 ***
## mileage           -1.313e-01  2.933e-03 -44.772  < 2e-16 ***
## year               4.122e+03  4.839e+01  85.186  < 2e-16 ***
## fuelGasoline       7.349e+02  5.204e+03   0.141  0.88771    
## fuelHybrid        -9.052e+03  1.202e+04  -0.753  0.45142    
## fuelunsp           1.352e+04  5.156e+03   2.622  0.00874 ** 
## displacement3.2 L  5.547e+04  1.355e+04   4.094 4.25e-05 ***
## displacement3.5 L  3.929e+04  4.964e+03   7.914 2.58e-15 ***
## displacement3.7 L -7.674e+03  5.348e+03  -1.435  0.15133    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.679e+04  8.177e+03   4.499 6.86e-06 ***
## displacement4.7 L  5.907e+04  8.186e+03   7.215 5.52e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.254e+04  8.178e+03   3.979 6.95e-05 ***
## displacement5.8 L  3.201e+04  8.278e+03   3.867  0.00011 ***
## displacement6.0 L  7.959e+04  8.210e+03   9.694  < 2e-16 ***
## displacement6.3 L -7.129e+03  8.208e+03  -0.869  0.38510    
## displacement8.0 L  5.919e+04  1.357e+04   4.362 1.29e-05 ***
## displacementunsp   4.168e+04  8.147e+03   5.116 3.14e-07 ***
## stateAL            5.528e+03  7.665e+03   0.721  0.47080    
## stateAR            6.300e+03  7.754e+03   0.813  0.41649    
## stateAZ            6.433e+03  7.662e+03   0.840  0.40113    
## stateCA            5.824e+03  7.644e+03   0.762  0.44610    
## stateCO            6.300e+03  7.660e+03   0.822  0.41089    
## stateCT            5.308e+03  7.664e+03   0.693  0.48856    
## stateDC           -8.039e+03  9.864e+03  -0.815  0.41509    
## stateDE            6.210e+03  7.721e+03   0.804  0.42124    
## stateFL            5.222e+03  7.644e+03   0.683  0.49458    
## stateGA            4.687e+03  7.648e+03   0.613  0.53993    
## stateHI            4.802e+03  7.714e+03   0.623  0.53359    
## stateIA            7.375e+03  7.841e+03   0.941  0.34689    
## stateID            8.983e+03  7.967e+03   1.128  0.25954    
## stateIL            5.712e+03  7.648e+03   0.747  0.45518    
## stateIN            4.874e+03  7.682e+03   0.635  0.52575    
## stateKS            5.903e+03  7.732e+03   0.764  0.44516    
## stateKY            7.915e+03  7.680e+03   1.031  0.30277    
## stateLA            7.165e+03  7.692e+03   0.932  0.35160    
## stateMA            5.588e+03  7.652e+03   0.730  0.46524    
## stateMD            6.112e+03  7.654e+03   0.798  0.42459    
## stateME            3.760e+03  7.954e+03   0.473  0.63647    
## stateMI            5.180e+03  7.676e+03   0.675  0.49981    
## stateMN            6.839e+03  7.675e+03   0.891  0.37287    
## stateMO            6.993e+03  7.663e+03   0.913  0.36149    
## stateMS            7.859e+03  7.701e+03   1.021  0.30749    
## stateMT            8.189e+03  8.309e+03   0.986  0.32437    
## stateNC            6.524e+03  7.651e+03   0.853  0.39387    
## stateND            9.384e+03  9.043e+03   1.038  0.29940    
## stateNE            5.873e+03  8.018e+03   0.732  0.46388    
## stateNH            6.605e+03  7.695e+03   0.858  0.39068    
## stateNJ            5.042e+03  7.646e+03   0.659  0.50959    
## stateNM            6.362e+03  7.864e+03   0.809  0.41853    
## stateNV            8.502e+03  7.662e+03   1.110  0.26720    
## stateNY            4.410e+03  7.645e+03   0.577  0.56407    
## stateOH            4.956e+03  7.654e+03   0.647  0.51733    
## stateOK            6.212e+03  7.683e+03   0.809  0.41878    
## stateON            5.803e+03  9.366e+03   0.620  0.53555    
## stateOR            6.773e+03  7.690e+03   0.881  0.37848    
## statePA            5.557e+03  7.652e+03   0.726  0.46767    
## stateRI            4.101e+03  7.785e+03   0.527  0.59835    
## stateSC            5.943e+03  7.672e+03   0.775  0.43855    
## stateSD            2.341e+04  1.081e+04   2.166  0.03030 *  
## stateTN            4.366e+03  7.660e+03   0.570  0.56872    
## stateTX            6.447e+03  7.645e+03   0.843  0.39910    
## stateunsp         -2.494e+04  1.083e+04  -2.303  0.02128 *  
## stateUT            7.820e+03  7.701e+03   1.015  0.30990    
## stateVA            5.677e+03  7.650e+03   0.742  0.45803    
## stateWA            6.872e+03  7.666e+03   0.896  0.37001    
## stateWI            6.326e+03  7.693e+03   0.822  0.41086    
## stateWV            6.624e+03  7.836e+03   0.845  0.39793    
## stateWY            1.339e+03  1.081e+04   0.124  0.90136    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10800 on 29384 degrees of freedom
## Multiple R-squared:  0.9421, Adjusted R-squared:  0.942 
## F-statistic:  5905 on 81 and 29384 DF,  p-value: < 2.2e-16
# State: adjusted R-squared 0.942 -- not much improvement for 51 extra
# coefficients and much more variance. Drop state.
# Next candidate: region. `data = car` avoids relying on attach().
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement + region,
     data = car)
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + region)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67227  -5092   -969   3375 272322 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.235e+06  9.752e+04 -84.446  < 2e-16 ***
## trim350            2.863e+04  1.362e+04   2.103 0.035516 *  
## trim400           -6.821e+03  1.805e+04  -0.378 0.705560    
## trim420            5.658e+04  1.356e+04   4.172 3.03e-05 ***
## trim430            3.253e+04  1.354e+04   2.402 0.016312 *  
## trim450           -2.677e+04  1.598e+04  -1.675 0.093933 .  
## trim500            3.524e+04  1.354e+04   2.602 0.009264 ** 
## trim55 AMG         3.391e+04  1.356e+04   2.501 0.012380 *  
## trim550           -4.533e+03  1.088e+04  -0.417 0.676804    
## trim600            5.286e+03  1.088e+04   0.486 0.627174    
## trim63 AMG         4.654e+04  1.089e+04   4.272 1.94e-05 ***
## trim65 AMG         6.315e+03  1.093e+04   0.578 0.563503    
## trimunsp           2.824e+04  1.091e+04   2.587 0.009679 ** 
## conditionNew       3.602e+04  2.381e+02 151.312  < 2e-16 ***
## conditionUsed     -4.433e+03  2.300e+02 -19.276  < 2e-16 ***
## mileage           -1.311e-01  2.918e-03 -44.909  < 2e-16 ***
## year               4.110e+03  4.828e+01  85.113  < 2e-16 ***
## fuelGasoline       7.116e+02  5.203e+03   0.137 0.891226    
## fuelHybrid        -9.041e+03  1.203e+04  -0.752 0.452274    
## fuelunsp           1.349e+04  5.155e+03   2.618 0.008860 ** 
## displacement3.2 L  5.539e+04  1.356e+04   4.085 4.42e-05 ***
## displacement3.5 L  3.892e+04  4.942e+03   7.877 3.48e-15 ***
## displacement3.7 L -7.904e+03  5.347e+03  -1.478 0.139394    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.653e+04  8.183e+03   4.464 8.09e-06 ***
## displacement4.7 L  5.887e+04  8.192e+03   7.186 6.84e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.226e+04  8.184e+03   3.942 8.10e-05 ***
## displacement5.8 L  3.157e+04  8.284e+03   3.811 0.000139 ***
## displacement6.0 L  7.943e+04  8.216e+03   9.668  < 2e-16 ***
## displacement6.3 L -7.392e+03  8.214e+03  -0.900 0.368175    
## displacement8.0 L  5.900e+04  1.358e+04   4.345 1.40e-05 ***
## displacementunsp   4.145e+04  8.153e+03   5.084 3.71e-07 ***
## regionESC          3.532e+02  3.932e+02   0.898 0.369104    
## regionMid         -6.198e+02  2.618e+02  -2.367 0.017927 *  
## regionMtn          1.692e+03  3.641e+02   4.646 3.40e-06 ***
## regionNew          9.890e+01  3.608e+02   0.274 0.783996    
## regionPac          4.464e+02  2.604e+02   1.714 0.086549 .  
## regionSoA         -7.030e+00  2.491e+02  -0.028 0.977484    
## regionunsp        -9.875e+03  4.434e+03  -2.227 0.025962 *  
## regionWNC          1.427e+03  4.520e+02   3.157 0.001593 ** 
## regionWSC          1.031e+03  2.972e+02   3.468 0.000524 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10810 on 29426 degrees of freedom
## Multiple R-squared:  0.9419, Adjusted R-squared:  0.9419 
## F-statistic: 1.224e+04 on 39 and 29426 DF,  p-value: < 2.2e-16
# Region: adjusted R-squared 0.9419, no real gain. Drop region.
# Next candidate: soundSystem. `data = car` avoids relying on attach().
summary(
  lm(
    price ~ trim + condition + mileage + year + fuel + displacement +
      soundSystem,
    data = car
  )
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + soundSystem)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65902  -5079   -910   3477 270568 
## 
## Coefficients: (4 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -8.205e+06  9.763e+04 -84.042  < 2e-16 ***
## trim350                     2.810e+04  1.359e+04   2.068 0.038682 *  
## trim400                    -5.851e+03  1.802e+04  -0.325 0.745483    
## trim420                     5.677e+04  1.354e+04   4.193 2.76e-05 ***
## trim430                     3.290e+04  1.352e+04   2.433 0.014972 *  
## trim450                    -3.490e+04  1.532e+04  -2.278 0.022734 *  
## trim500                     3.564e+04  1.352e+04   2.636 0.008383 ** 
## trim55 AMG                  3.458e+04  1.354e+04   2.555 0.010621 *  
## trim550                    -4.119e+03  1.086e+04  -0.379 0.704425    
## trim600                     5.549e+03  1.086e+04   0.511 0.609537    
## trim63 AMG                  4.688e+04  1.088e+04   4.310 1.64e-05 ***
## trim65 AMG                  6.815e+03  1.091e+04   0.624 0.532387    
## trimunsp                    2.861e+04  1.090e+04   2.626 0.008649 ** 
## conditionNew                3.547e+04  2.424e+02 146.299  < 2e-16 ***
## conditionUsed              -4.250e+03  2.289e+02 -18.565  < 2e-16 ***
## mileage                    -1.331e-01  2.915e-03 -45.645  < 2e-16 ***
## year                        4.098e+03  4.824e+01  84.944  < 2e-16 ***
## fuelGasoline               -4.827e+02  5.195e+03  -0.093 0.925969    
## fuelHybrid                 -1.027e+04  1.201e+04  -0.855 0.392673    
## fuelunsp                    1.223e+04  5.147e+03   2.375 0.017533 *  
## displacement3.2 L           5.549e+04  1.354e+04   4.099 4.15e-05 ***
## displacement3.5 L           3.867e+04  4.933e+03   7.839 4.69e-15 ***
## displacement3.7 L          -7.005e+03  5.338e+03  -1.312 0.189452    
## displacement4.2 L                  NA         NA      NA       NA    
## displacement4.3 L                  NA         NA      NA       NA    
## displacement4.6 L           3.665e+04  8.169e+03   4.486 7.30e-06 ***
## displacement4.7 L           5.903e+04  8.178e+03   7.217 5.43e-13 ***
## displacement5.0 L                  NA         NA      NA       NA    
## displacement5.4 L                  NA         NA      NA       NA    
## displacement5.5 L           3.240e+04  8.170e+03   3.966 7.32e-05 ***
## displacement5.8 L           3.173e+04  8.269e+03   3.836 0.000125 ***
## displacement6.0 L           7.935e+04  8.202e+03   9.675  < 2e-16 ***
## displacement6.3 L          -6.933e+03  8.200e+03  -0.845 0.397847    
## displacement8.0 L           5.887e+04  1.355e+04   4.343 1.41e-05 ***
## displacementunsp            4.140e+04  8.140e+03   5.086 3.69e-07 ***
## soundSystemBang Olufsen    -3.039e+03  7.683e+03  -0.396 0.692414    
## soundSystemBose            -7.535e+03  7.645e+03  -0.986 0.324333    
## soundSystemBoston Acoustic -9.412e+03  1.323e+04  -0.712 0.476775    
## soundSystemHarman Kardon   -7.560e+03  7.639e+03  -0.990 0.322362    
## soundSystemPremium         -5.803e+03  7.638e+03  -0.760 0.447440    
## soundSystemunsp            -5.108e+03  7.638e+03  -0.669 0.503661    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10800 on 29429 degrees of freedom
## Multiple R-squared:  0.9421, Adjusted R-squared:  0.942 
## F-statistic: 1.33e+04 on 36 and 29429 DF,  p-value: < 2.2e-16
# soundSystem: adjusted R-squared 0.942, no real gain. Drop soundSystem.
# Next candidate: wheelType. `data = car` avoids relying on attach().
summary(
  lm(
    price ~ trim + condition + mileage + year + fuel + displacement +
      wheelType,
    data = car
  )
)
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + wheelType)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66389  -5098   -926   3419 271832 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.203e+06  9.733e+04 -84.281  < 2e-16 ***
## trim350            2.706e+04  1.360e+04   1.990 0.046609 *  
## trim400           -8.667e+03  1.803e+04  -0.481 0.630825    
## trim420            5.529e+04  1.355e+04   4.081 4.49e-05 ***
## trim430            3.127e+04  1.353e+04   2.312 0.020797 *  
## trim450           -3.730e+04  1.532e+04  -2.434 0.014935 *  
## trim500            3.404e+04  1.353e+04   2.517 0.011848 *  
## trim55 AMG         3.277e+04  1.354e+04   2.420 0.015524 *  
## trim550           -5.258e+03  1.086e+04  -0.484 0.628350    
## trim600            4.513e+03  1.087e+04   0.415 0.678024    
## trim63 AMG         4.591e+04  1.088e+04   4.220 2.46e-05 ***
## trim65 AMG         5.680e+03  1.092e+04   0.520 0.602978    
## trimunsp           2.752e+04  1.090e+04   2.525 0.011589 *  
## conditionNew       3.585e+04  2.388e+02 150.135  < 2e-16 ***
## conditionUsed     -4.458e+03  2.289e+02 -19.482  < 2e-16 ***
## mileage           -1.316e-01  2.913e-03 -45.162  < 2e-16 ***
## year               4.094e+03  4.819e+01  84.962  < 2e-16 ***
## fuelGasoline       4.810e+02  5.196e+03   0.093 0.926255    
## fuelHybrid        -8.945e+03  1.201e+04  -0.745 0.456565    
## fuelunsp           1.319e+04  5.149e+03   2.562 0.010403 *  
## displacement3.2 L  5.411e+04  1.354e+04   3.995 6.47e-05 ***
## displacement3.5 L  3.925e+04  4.935e+03   7.953 1.89e-15 ***
## displacement3.7 L -7.576e+03  5.340e+03  -1.419 0.156009    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.590e+04  8.173e+03   4.393 1.12e-05 ***
## displacement4.7 L  5.839e+04  8.182e+03   7.137 9.76e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.169e+04  8.173e+03   3.877 0.000106 ***
## displacement5.8 L  3.124e+04  8.273e+03   3.776 0.000160 ***
## displacement6.0 L  7.892e+04  8.205e+03   9.618  < 2e-16 ***
## displacement6.3 L -7.949e+03  8.204e+03  -0.969 0.332569    
## displacement8.0 L  5.815e+04  1.356e+04   4.289 1.80e-05 ***
## displacementunsp   4.068e+04  8.143e+03   4.996 5.88e-07 ***
## wheelTypeChrome    6.753e+02  1.214e+03   0.556 0.577943    
## wheelTypePremium  -4.057e+02  5.336e+02  -0.760 0.447033    
## wheelTypeSteel     1.700e+04  1.548e+03  10.981  < 2e-16 ***
## wheelTypeunsp      8.083e+02  1.297e+02   6.234 4.61e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10800 on 29431 degrees of freedom
## Multiple R-squared:  0.942,  Adjusted R-squared:  0.942 
## F-statistic: 1.407e+04 on 34 and 29431 DF,  p-value: < 2.2e-16
# Previous fit (base model + wheelType): adjusted R-squared 0.942, about the
# same as the base model's 0.9417, so wheelType is dropped.
# Next candidate: base model + wheelSize.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    wheelSize
))
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + wheelSize)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65260  -5144   -926   3447 271523 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.251e+06  9.793e+04 -84.247  < 2e-16 ***
## trim350            2.762e+04  1.359e+04   2.033 0.042090 *  
## trim400           -8.261e+03  1.802e+04  -0.458 0.646598    
## trim420            5.571e+04  1.353e+04   4.117 3.85e-05 ***
## trim430            3.174e+04  1.351e+04   2.349 0.018840 *  
## trim450           -3.672e+04  1.531e+04  -2.398 0.016482 *  
## trim500            3.454e+04  1.351e+04   2.556 0.010593 *  
## trim55 AMG         3.320e+04  1.353e+04   2.454 0.014138 *  
## trim550           -4.780e+03  1.085e+04  -0.440 0.659669    
## trim600            5.230e+03  1.086e+04   0.482 0.630093    
## trim63 AMG         4.542e+04  1.087e+04   4.178 2.94e-05 ***
## trim65 AMG         6.267e+03  1.091e+04   0.574 0.565674    
## trimunsp           2.764e+04  1.089e+04   2.538 0.011167 *  
## conditionNew       3.583e+04  2.390e+02 149.919  < 2e-16 ***
## conditionUsed     -4.344e+03  2.286e+02 -19.004  < 2e-16 ***
## mileage           -1.316e-01  2.912e-03 -45.200  < 2e-16 ***
## year               4.120e+03  4.853e+01  84.896  < 2e-16 ***
## fuelGasoline       8.225e+02  5.191e+03   0.158 0.874119    
## fuelHybrid        -9.191e+03  1.200e+04  -0.766 0.443887    
## fuelunsp           1.339e+04  5.144e+03   2.603 0.009244 ** 
## displacement3.2 L  5.431e+04  1.353e+04   4.014 5.98e-05 ***
## displacement3.5 L  3.966e+04  4.931e+03   8.043 9.08e-16 ***
## displacement3.7 L -7.245e+03  5.336e+03  -1.358 0.174573    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.578e+04  8.165e+03   4.382 1.18e-05 ***
## displacement4.7 L  5.774e+04  8.175e+03   7.063 1.66e-12 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.150e+04  8.166e+03   3.858 0.000115 ***
## displacement5.8 L  3.122e+04  8.266e+03   3.777 0.000159 ***
## displacement6.0 L  7.801e+04  8.198e+03   9.516  < 2e-16 ***
## displacement6.3 L -7.406e+03  8.196e+03  -0.904 0.366218    
## displacement8.0 L  5.853e+04  1.355e+04   4.320 1.56e-05 ***
## displacementunsp   4.107e+04  8.136e+03   5.048 4.50e-07 ***
## wheelSize17       -9.986e+03  1.382e+03  -7.226 5.10e-13 ***
## wheelSize18       -5.215e+03  1.097e+03  -4.754 2.00e-06 ***
## wheelSize19       -5.071e+03  1.110e+03  -4.570 4.91e-06 ***
## wheelSize20        1.377e+02  1.136e+03   0.121 0.903539    
## wheelSize21       -2.997e+03  7.706e+03  -0.389 0.697344    
## wheelSize22       -1.376e+02  2.212e+03  -0.062 0.950404    
## wheelSizeunsp     -4.579e+03  1.062e+03  -4.312 1.62e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10790 on 29428 degrees of freedom
## Multiple R-squared:  0.9422, Adjusted R-squared:  0.9421 
## F-statistic: 1.296e+04 on 37 and 29428 DF,  p-value: < 2.2e-16
# Previous fit (base model + wheelSize): adjusted R-squared 0.9421, about the
# same as the base model's 0.9417, so wheelSize is dropped.
# Next candidate: base model + featureCount.
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement +
    featureCount
))
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + featureCount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -66008  -5090   -956   3396 271548 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.270e+06  9.793e+04 -84.451  < 2e-16 ***
## trim350            2.739e+04  1.363e+04   2.010 0.044442 *  
## trim400           -8.559e+03  1.807e+04  -0.474 0.635753    
## trim420            5.561e+04  1.357e+04   4.097 4.20e-05 ***
## trim430            3.158e+04  1.355e+04   2.330 0.019808 *  
## trim450           -3.750e+04  1.536e+04  -2.442 0.014618 *  
## trim500            3.433e+04  1.355e+04   2.533 0.011304 *  
## trim55 AMG         3.307e+04  1.357e+04   2.437 0.014806 *  
## trim550           -5.144e+03  1.088e+04  -0.473 0.636515    
## trim600            4.739e+03  1.089e+04   0.435 0.663484    
## trim63 AMG         4.601e+04  1.090e+04   4.220 2.45e-05 ***
## trim65 AMG         5.955e+03  1.094e+04   0.544 0.586244    
## trimunsp           2.772e+04  1.092e+04   2.538 0.011159 *  
## conditionNew       3.577e+04  2.391e+02 149.584  < 2e-16 ***
## conditionUsed     -4.476e+03  2.296e+02 -19.495  < 2e-16 ***
## mileage           -1.317e-01  2.919e-03 -45.124  < 2e-16 ***
## year               4.128e+03  4.850e+01  85.115  < 2e-16 ***
## fuelGasoline       5.903e+02  5.207e+03   0.113 0.909728    
## fuelHybrid        -8.956e+03  1.204e+04  -0.744 0.456910    
## fuelunsp           1.318e+04  5.159e+03   2.555 0.010631 *  
## displacement3.2 L  5.441e+04  1.357e+04   4.010 6.08e-05 ***
## displacement3.5 L  3.948e+04  4.945e+03   7.984 1.46e-15 ***
## displacement3.7 L -7.623e+03  5.351e+03  -1.425 0.154290    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.609e+04  8.189e+03   4.408 1.05e-05 ***
## displacement4.7 L  5.850e+04  8.198e+03   7.135 9.88e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.184e+04  8.190e+03   3.887 0.000102 ***
## displacement5.8 L  3.133e+04  8.289e+03   3.779 0.000158 ***
## displacement6.0 L  7.893e+04  8.221e+03   9.601  < 2e-16 ***
## displacement6.3 L -7.779e+03  8.220e+03  -0.946 0.343996    
## displacement8.0 L  5.810e+04  1.359e+04   4.277 1.90e-05 ***
## displacementunsp   4.070e+04  8.160e+03   4.988 6.12e-07 ***
## featureCount      -1.323e+01  2.227e+00  -5.942 2.84e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10820 on 29434 degrees of freedom
## Multiple R-squared:  0.9418, Adjusted R-squared:  0.9417 
## F-statistic: 1.537e+04 on 31 and 29434 DF,  p-value: < 2.2e-16
# Previous fit (base model + featureCount): adjusted R-squared 0.9417, identical
# to the base model, so featureCount is dropped.
# Best bet is therefore the base model itself (adjusted R-squared 0.9417):
summary(lm(
  price ~ trim + condition + mileage + year + fuel + displacement
))
## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -65934  -5093   -968   3408 271481 
## 
## Coefficients: (4 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -8.215e+06  9.754e+04 -84.220  < 2e-16 ***
## trim350            2.815e+04  1.363e+04   2.065 0.038964 *  
## trim400           -7.461e+03  1.808e+04  -0.413 0.679841    
## trim420            5.610e+04  1.358e+04   4.131 3.62e-05 ***
## trim430            3.206e+04  1.356e+04   2.364 0.018065 *  
## trim450           -3.708e+04  1.536e+04  -2.413 0.015813 *  
## trim500            3.482e+04  1.356e+04   2.568 0.010242 *  
## trim55 AMG         3.353e+04  1.358e+04   2.470 0.013514 *  
## trim550           -4.726e+03  1.089e+04  -0.434 0.664343    
## trim600            5.153e+03  1.090e+04   0.473 0.636298    
## trim63 AMG         4.646e+04  1.091e+04   4.259 2.06e-05 ***
## trim65 AMG         6.386e+03  1.095e+04   0.583 0.559681    
## trimunsp           2.808e+04  1.093e+04   2.569 0.010196 *  
## conditionNew       3.596e+04  2.372e+02 151.602  < 2e-16 ***
## conditionUsed     -4.382e+03  2.292e+02 -19.120  < 2e-16 ***
## mileage           -1.319e-01  2.920e-03 -45.174  < 2e-16 ***
## year               4.100e+03  4.829e+01  84.893  < 2e-16 ***
## fuelGasoline       8.141e+02  5.209e+03   0.156 0.875823    
## fuelHybrid        -9.159e+03  1.205e+04  -0.760 0.447068    
## fuelunsp           1.350e+04  5.162e+03   2.615 0.008930 ** 
## displacement3.2 L  5.491e+04  1.358e+04   4.044 5.26e-05 ***
## displacement3.5 L  3.932e+04  4.948e+03   7.947 1.97e-15 ***
## displacement3.7 L -7.844e+03  5.354e+03  -1.465 0.142913    
## displacement4.2 L         NA         NA      NA       NA    
## displacement4.3 L         NA         NA      NA       NA    
## displacement4.6 L  3.621e+04  8.194e+03   4.419 9.93e-06 ***
## displacement4.7 L  5.858e+04  8.203e+03   7.142 9.42e-13 ***
## displacement5.0 L         NA         NA      NA       NA    
## displacement5.4 L         NA         NA      NA       NA    
## displacement5.5 L  3.193e+04  8.194e+03   3.897 9.76e-05 ***
## displacement5.8 L  3.133e+04  8.294e+03   3.777 0.000159 ***
## displacement6.0 L  7.898e+04  8.226e+03   9.602  < 2e-16 ***
## displacement6.3 L -7.724e+03  8.225e+03  -0.939 0.347655    
## displacement8.0 L  5.878e+04  1.359e+04   4.324 1.54e-05 ***
## displacementunsp   4.121e+04  8.164e+03   5.048 4.50e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10830 on 29435 degrees of freedom
## Multiple R-squared:  0.9417, Adjusted R-squared:  0.9417 
## F-statistic: 1.586e+04 on 30 and 29435 DF,  p-value: < 2.2e-16
# Final linear model, fitted on every row of car (the columns are found through
# the earlier attach(car)).
best.mlr = lm(price ~ trim+condition+mileage+year+fuel+displacement)
# Presumably the adjusted R-squared obtained when one term is removed:
#   - mileage: .9269    - year: 0.9133    - fuel: .9292    - displacement: .9292
############ predicted values are stored in mlr.yhat #############
# Score the external file with the full-data model. The same four column
# positions that are dropped from car later in this script are dropped here.
test_x = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
test_x = test_x[,c(-1, -3, -8, -13)]
mlr.yhat = predict(best.mlr, newdata=test_x)
## Warning in predict.lm(best.mlr, newdata = test_x): prediction from a rank-
## deficient fit may be misleading
head(mlr.yhat)
##         1         2         3         4         5         6 
## 114326.42  33249.19  45226.87 110227.17  56987.16 114326.55

############ hold-out RMSE of the linear model #############
# mlr.yhat has one value per row of Cars_X_out.csv, so it cannot be compared
# with prices taken from car: the original `mlr.yhat - y.test` recycled the
# shorter vector (R warned about the mismatched lengths) and the RMSE it
# printed, 62748.92, compared unrelated cars. The error is now measured on rows
# of car that were kept out of the fit.
set.seed(99)   # reproducible 90% / 10% split
train = sample(seq_len(nrow(car)), floor(nrow(car)*9/10))
test = (-train)
testset = car[test,]
mlr.train = lm(price ~ trim+condition+mileage+year+fuel+displacement, data=car[train,])
# predict.lm() stops on a factor level that is absent from the training rows,
# so only validation rows whose levels were all seen during fitting are scored.
seen = rep(TRUE, nrow(testset))
for (v in names(mlr.train$xlevels)) {
   seen = seen & (testset[[v]] %in% mlr.train$xlevels[[v]])
}
validset = testset[seen,]
y.test = validset$price
mlr.val = predict(mlr.train, newdata=validset)
MSE = mean((mlr.val - y.test)^2)
RMSE = sqrt(MSE)   # compare with the random forest validation RMSE computed later in this file
RMSE
# NOTE: re-run this chunk to print the corrected RMSE; the value from the
# original run is omitted because it was computed from mismatched vectors.
library(tree)
library(rpart)
# NOTE(review): car is already attached at the top of the file; this second
# attach only adds another copy to the search path (see the messages below).
attach(car)
## The following objects are masked from car (pos = 6):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 7):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
#--------------------------------------------------
# reduce the data frame to the two columns used by the tree:
# year (column 7) and price (column 17), in that order
bdf = car[, c("year", "price")]
#--------------------------------------------------
# grow a deliberately large regression tree of price on year;
# it is pruned back further down
tree.ctrl = rpart.control(cp = .0005, minsplit = 5)
big.tree = rpart(price ~ year, data = bdf, method = "anova", control = tree.ctrl)
# number of distinct leaves that observations fall into
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
## size of big tree:  10
#--------------------------------------------------
# look at cross-validation: plotcp() shows the cross-validated error of
# big.tree against the complexity parameter cp
par(mfrow=c(1,1))
plotcp(big.tree)

#--------------------------------------------------
# show the fit from some pruned trees

oo=order(bdf$year)   # row order by year, so the fitted line is drawn left to right
# cp value with the smallest cross-validated error in the cp table ...
bestcp=big.tree$cptable[which.min(big.tree$cptable[,"xerror"]),"CP"]
# ... deliberately overridden: because relative error doesn't change much when
# cp >= 0.0025, let's just set bestcp as 0.0025.
bestcp = 0.0025
cat('bestcp: ',bestcp,'\n')
## bestcp:  0.0025
cpvec = c(.0157,bestcp,.004)
# one row per cp value: data with fitted step function on the left, tree on the right
par(mfrow=c(length(cpvec),2))
for(i in seq_along(cpvec)) {
   plot(bdf,pch=16,col='blue',cex=.5)
   ptree = prune(big.tree,cp=cpvec[i])
   pfit = predict(ptree)
   lines(bdf$year[oo],pfit[oo],col='red',lwd=2)
   # format() prints the cp value as entered; round(.,3) cannot show 0.0025
   # and turns 0.0157 into 0.016, so the panel titles did not match the cp
   # actually used for pruning
   title(paste('alpha = ',format(cpvec[i])))
   plot(ptree,uniform=TRUE)
   text(ptree,digits=3,cex=1)
}

#--------------------------------------------------
#plot best tree

#reference: http://rankexploits.com/musings/2011/margin-control-in-r-oma-and-mar-and-the-vanishing-axis-label/
# http://research.stowers-institute.org/mcm/efg/R/Graphics/Basics/mar-oma/index.htm
# The edges of the graph were cut off because plot.rpart() sizes the plot
# region to the tree skeleton only, so node labels added by text() near the
# border fall outside it and are clipped. The `margin` argument of plot.rpart()
# reserves an extra fraction of white space around the tree for those labels.
par(mfrow=c(1,1), mar=c(1,1,1,1), oma=c(0,0,0,0))
#par(mar=c(5,3,2,2)+0.1)
best.tree = prune(big.tree,cp=bestcp) # best tree is here!
plot(best.tree,uniform=TRUE,margin=0.1) # margin leaves room for the labels
box('figure',lty='solid', col='green') # outline the figure region to check nothing is clipped
text(best.tree,digits=2,use.n=TRUE,cex=1) # get rid of fancy parameters!

# Random forest section: clear the workspace objects created so far.
# (The earlier attach(car) copies stay on the search path, hence the masking
# messages printed by attach() below.)
rm(list = ls())
library(randomForest)
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
car = read.csv(file = "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
# drop columns 1, 3, 8 and 13 (X, subTrim, color, soundSystem)
car = car[, -c(1, 3, 8, 13)]
# converting dis to numerical didn't really help much
#dis = sapply(displacement, function(x) as.numeric(substr(x, 1, 3)))
#cars$displacement = dis
attach(car)
## The following objects are masked from car (pos = 4):
## 
##     condition, displacement, featureCount, fuel, isOneOwner,
##     mileage, price, region, state, trim, wheelSize, wheelType,
##     year
## 
## The following objects are masked from car (pos = 8):
## 
##     condition, displacement, featureCount, fuel, isOneOwner,
##     mileage, price, region, state, trim, wheelSize, wheelType,
##     year
## 
## The following objects are masked from car (pos = 9):
## 
##     condition, displacement, featureCount, fuel, isOneOwner,
##     mileage, price, region, state, trim, wheelSize, wheelType,
##     year
#--------------------------------------------------
# fit price ~ mileage forests with an increasing number of trees
# note: to get this to work I had to use maxnodes parameter of randomForest!!!
set.seed(99)
n = nrow(car)
# 5000 trees are not needed; capped at 1000 to save computation time
ntreev = c(10, 500, 1000)
nset = length(ntreev)
# one column of predicted values (yhat) per forest size
fmat = matrix(0, nrow = n, ncol = nset)
for (i in seq_len(nset)) {
   cat('doing car rf: ',i,'\n')
   rffit = randomForest(price ~ mileage, data = car, ntree = ntreev[i], maxnodes = 15)
   # predict() without newdata: the forest's predictions for its own training rows
   fmat[, i] = predict(rffit)
}
## doing car rf:  1 
## doing car rf:  2 
## doing car rf:  3
# try fitting using 500 trees, this time mileage + year
#rf500 = randomForest(price~mileage+year,data=car,ntree=500,maxnodes=15)
#head(rf500)
# get error of validation set. use 90% of the dataset as training, 10% as test
train=sample(1:nrow(car), nrow(car)*9/10)
test=(-train)
testset = car[test,]
# fit on the training rows, using trim, condition, mileage, year, fuel and displacement
rftraining = randomForest(price~trim+condition+mileage+year+fuel+displacement,data=car[train,],ntree=500,maxnodes=15)
# get RMSE on validation dataset
y.pred = predict(rftraining,newdata = testset)
y.test = car[test,'price']
MSE = mean((y.pred - y.test)^2)
RMSE = sqrt(MSE)
RMSE
## [1] 9120.888
# validation RMSE observed while adding predictors:
# 17084.87 with only mileage
# 13826.19 mileage + year
# 10603.14 mileage + year + trim
# 9243 trim+condition+mileage+year+fuel+displacement best so far. Interestingly, it's the same model obtained by multivariate linear regression!
# 
# try to fit the model on test x that prof gave
test_x = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
test_x = test_x[,c(-1, -3, -8, -13)]
### factor variables in training and test data must have the same levels,
### otherwise predict() would throw an error.
### `levels(test_x$trim) = levels(car$trim)` is NOT the way to do that: it
### renames the existing levels by position, so whenever the two files do not
### contain exactly the same set of values the rows are silently relabelled
### (e.g. a trim recorded as one model becomes another). Re-encoding with
### factor(..., levels=) keeps every value and only aligns the level set; a
### value that never occurs in car becomes NA instead of being mislabelled.
for (v in c("trim", "condition", "fuel", "displacement")) {
   test_x[[v]] = factor(test_x[[v]], levels = levels(car[[v]]))
}
# Score with the forest that was validated above. The original refitted an
# identical model here, which doubled the run time and meant the predictions
# came from a different random forest than the one whose RMSE is reported.
rftest = predict(rftraining, newdata=test_x)
head(rftest,10)
# (values below were printed by the original run; regenerate after this change)
##         1         2         3         4         5         6         7 
## 106163.86  30025.39  46069.17 104940.75  51335.02 106154.24  56180.88 
##         8         9        10 
##  28154.26 106199.93 106204.11
#--------------------------------------------------
# error plot for rffit, i.e. the forest fitted last in the ntree loop
# (the one with the largest ntree)


par(mfrow = c(1, 1))
plot(rffit)

#--------------------------------------------------
# price against mileage, one panel per forest size, with the forest's
# predictions drawn as a line

par(mfrow = c(1, 3))
oo = order(car$mileage)   # row order by mileage, so each line is drawn left to right
for (i in seq_len(nset)) {
   plot(x = car$mileage, y = car$price, xlab = 'mileage', ylab = 'price')
   lines(car$mileage[oo], fmat[oo, i], col = i, lwd = 3)
   title(main = paste('bagging ntrees = ',ntreev[i]))
}

#--------------------------------------------------
# clear the workspace before the next section
rm(list = ls())