car

big steps:

cleaning
- outliers
- change qualitative ones to dummy variables - use boxplot to identify the boxplot
- google how to deal with wheelsize
- amg - qualitative
- year - quantitative
- x- serial number
choose variables - chap 6
fit different models
- linear model - choosing variables, using kfold for sample and test - gdm package
- qualitative - boxplots
- quadratic model
- chap 3 - coefficient
testing - kfold
data visualization

# Load the raw listings. stringsAsFactors = TRUE is set explicitly because the
# read.csv() default changed to FALSE in R 4.0, and the summaries recorded in
# this file (level counts, "Levels:" printouts) were produced with factors.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# NOTE(review): attach() places a *copy* of the columns on the search path, so
# later edits to `car` are not visible through bare column names until `car`
# is attached again. It is kept because later code uses bare column names.
attach(car)
# How many records and variables do we have?
dim(car)

## [1] 29466    17

# Do we have duplicated records? Keep only the unique rows and compare the
# dimensions with the original data frame.
car_noduplicate <- unique(car)
dim(car_noduplicate)

## [1] 29466    17

# The dimensions are unchanged, so there are no duplicated records.
# Print a brief summary of each variable.
summary(car)

##        X              trim         subTrim      condition    isOneOwner
##  Min.   :    2   550    :21836   Hybrid:  190   CPO : 3586   f:25340   
##  1st Qu.:13231   430    : 2071   unsp  :29276   New :10317   t: 4126   
##  Median :26254   500    : 2002                  Used:15563             
##  Mean   :26269   63 AMG : 1413                                         
##  3rd Qu.:39293   600    :  527                                         
##  Max.   :52572   350    :  416                                         
##                  (Other): 1201                                         
##     mileage            year          color        displacement  
##  Min.   :     1   Min.   :1988   Black  :12838   4.6 L  :13599  
##  1st Qu.:    14   1st Qu.:2007   Silver : 6095   5.5 L  : 9154  
##  Median : 26120   Median :2012   White  : 4418   4.3 L  : 2071  
##  Mean   : 40387   Mean   :2010   Gray   : 2007   5.0 L  : 2002  
##  3rd Qu.: 68234   3rd Qu.:2015   Blue   : 1599   6.0 L  :  403  
##  Max.   :488525   Max.   :2015   unsp   : 1467   6.3 L  :  391  
##                                  (Other): 1042   (Other): 1846  
##        fuel           state           region              soundSystem   
##  Diesel  :  312   CA     : 5262   SoA    :7805   Alpine         :    2  
##  Gasoline:28628   FL     : 3559   Pac    :5844   Bang Olufsen   :  177  
##  Hybrid  :  189   NY     : 2754   Mid    :5824   Bose           :  943  
##  unsp    :  337   TX     : 2458   WSC    :2865   Boston Acoustic:    1  
##                   NJ     : 2266   ENC    :2496   Harman Kardon  : 4120  
##                   GA     : 1408   New    :1421   Premium        : 9694  
##                   (Other):11759   (Other):3211   unsp           :14529  
##    wheelType       wheelSize      featureCount        price       
##  Alloy  :14565   unsp   :25293   Min.   :  0.00   Min.   :   599  
##  Chrome :   80   18     : 1774   1st Qu.: 18.00   1st Qu.: 28995  
##  Premium:  424   19     : 1297   Median : 53.00   Median : 56991  
##  Steel  :   49   20     :  813   Mean   : 46.48   Mean   : 67001  
##  unsp   :14348   17     :  149   3rd Qu.: 70.00   3rd Qu.:108815  
##                  16     :  107   Max.   :132.00   Max.   :299000  
##                  (Other):   33

# List the column names of the data frame.
colnames(car)

##  [1] "X"            "trim"         "subTrim"      "condition"   
##  [5] "isOneOwner"   "mileage"      "year"         "color"       
##  [9] "displacement" "fuel"         "state"        "region"      
## [13] "soundSystem"  "wheelType"    "wheelSize"    "featureCount"
## [17] "price"

# Is the record with mileage 488525 really an outlier? Looking at it together
# with other cars of similar mileage, some of them have similarly high prices,
# so this may be a genuine phenomenon rather than an outlier.
# Also, according to
# http://www.theanalysisfactor.com/outliers-to-drop-or-not-to-drop/, an outlier
# should only be dropped when it is obviously invalid or when it creates
# ("forges") a relationship that is not really there. I see no evidence of
# either here.
subset(car, mileage == 488525)

##          X trim subTrim condition isOneOwner mileage year color
## 9011 16086  550    unsp      Used          f  488525 2012 White
##      displacement     fuel state region soundSystem wheelType wheelSize
## 9011        4.6 L Gasoline    NJ    Mid        unsp     Alloy      unsp
##      featureCount price
## 9011           54 46995

# Show every record with more than 400,000 miles for comparison.
subset(car, mileage > 400000)

##           X trim subTrim condition isOneOwner mileage year color
## 9011  16086  550    unsp      Used          f  488525 2012 White
## 12567 22462  500    unsp      Used          f  407725 2000  Gold
## 13073 23304  550    unsp       CPO          t  411103 2012 Black
## 26285 46886  500    unsp      Used          f  467834 2006 Black
##       displacement     fuel state region soundSystem wheelType wheelSize
## 9011         4.6 L Gasoline    NJ    Mid        unsp     Alloy      unsp
## 12567        5.0 L Gasoline    CA    Pac        unsp     Alloy      unsp
## 13073        4.6 L Gasoline    FL    SoA     Premium     Alloy      unsp
## 26285        5.0 L Gasoline    TX    WSC        unsp      unsp      unsp
##       featureCount price
## 9011            54 46995
## 12567           20  8995
## 13073           64 59892
## 26285           54 21995

# To check whether the outlier distorts the relationship, fit price on mileage
# without that record and compare the result with the fit on the full data.
carNoOutlier <- subset(car, mileage != 488525)
fitNoOutlier <- lm(formula = carNoOutlier$price ~ carNoOutlier$mileage)
summary(fitNoOutlier)

## 
## Call:
## lm(formula = carNoOutlier$price ~ carNoOutlier$mileage)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90893 -19012  -1053  13614 294207 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           9.904e+04  2.020e+02   490.2   <2e-16 ***
## carNoOutlier$mileage -7.936e-01  3.301e-03  -240.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26060 on 29463 degrees of freedom
## Multiple R-squared:  0.6623, Adjusted R-squared:  0.6623 
## F-statistic: 5.779e+04 on 1 and 29463 DF,  p-value: < 2.2e-16

# The same regression on the full data set, outlier included.
fit <- lm(formula = car$price ~ car$mileage)
summary(fit)

## 
## Call:
## lm(formula = car$price ~ car$mileage)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -90808 -19024  -1130  13624 334537 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.895e+04  2.025e+02   488.7   <2e-16 ***
## car$mileage -7.911e-01  3.305e-03  -239.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26130 on 29464 degrees of freedom
## Multiple R-squared:  0.6604, Adjusted R-squared:  0.6604 
## F-statistic: 5.731e+04 on 1 and 29464 DF,  p-value: < 2.2e-16

# It turns out that the one outlier changes the estimates only slightly. R-squared is marginally lower with the outlier (0.6604, versus 0.6623 without it, a difference of 0.0019), so I think we can keep it.

Proceed to deal with missing data in two steps.
Step 1: check whether missingness is completely random, i.e. not dependent on other variables. If it is, proceed to step 2; if not, infer the missing value from the variable it depends on.
Step 2: test whether missingness (coded as a dummy variable) is a function of y. If it is not, do listwise deletion; if it is, fill in the missing value with some function of the observed data (mean? mode? to be discussed further).

Let’s go through each variable.
column 1: serial number.
trim: the ‘unsp’ trim records tend to have the following attributes: they are all new, ‘isOneOwner’ is f, mileage is almost 0, year is 2015 (new car), fuel tends to be unsp or gasoline, and soundSystem, wheelType and wheelSize all tend to be unsp. featureCount is either above the 3rd quartile or around 0. Price tends to be above the 3rd quartile. So my guess is that these are two types of new, high-end cars. This is not missing data: ‘unsp’ carries its own information here, and an ‘unsp’ trim tends to go with a higher price. So we should treat ‘unsp’ as a level of the categorical variable with its own meaning.
subtrim: subtrim is basically a sub-category of trim. In the car dataset, subtrim usually indicates whether the car is a hybrid or not. So if a car is ‘Hybrid’ in the subtrim variable, it must be ‘Hybrid’ in the fuel variable too. That is almost exactly the case: the subtrim variable has 190 ‘Hybrid’ records and the fuel variable has 189. The one differing record is probably a data-entry error. So I would say subtrim is a proxy for fuel: we can ignore subtrim and only look at fuel.

# Count the records flagged as hybrid by subTrim.
dim(subset(car, subTrim == 'Hybrid'))

## [1] 190  17

# Count the records flagged as hybrid by fuel.
dim(subset(car, fuel == 'Hybrid'))

## [1] 189  17

# Which record is hybrid according to subTrim but not according to fuel?
subset(car, subTrim == 'Hybrid' & fuel != 'Hybrid')

##         X trim subTrim condition isOneOwner mileage year color
## 4225 7598  400  Hybrid       CPO          t   39973 2010 White
##      displacement     fuel state region soundSystem wheelType wheelSize
## 4225        3.5 L Gasoline    CA    Pac     Premium      unsp      unsp
##      featureCount price
## 4225           23 52971

# This record must be a data-entry error: if fuel were pure gasoline, subTrim
# could not be Hybrid. Its trim is 400, and all other 400 cars are hybrid in
# both variables, so the fuel value is the wrong one and must be Hybrid.
# The record is located by its inconsistency rather than by a hard-coded row
# number, so the correction does not depend on the row order of `car`.
fuel_mismatch <- which(car$subTrim == "Hybrid" & car$fuel != "Hybrid")
car[fuel_mismatch, "fuel"]

## [1] Gasoline
## Levels: Diesel Gasoline Hybrid unsp

car[fuel_mismatch, "fuel"] <- "Hybrid"
# attach() works on a copy, so the bare name `fuel` would keep returning the
# uncorrected values. Refresh the attached copy so later models that refer to
# bare column names see the correction.
# NOTE(review): model output recorded later in this file that includes fuel
# (before displacement is added) appears to predate this refresh and may
# change slightly when re-run.
if ("car" %in% search()) {
  detach(car)
  attach(car)
}
car[fuel_mismatch, "fuel"]

## [1] Hybrid
## Levels: Diesel Gasoline Hybrid unsp

# fuel now agrees with subTrim on every record, so subTrim carries no extra
# information and can be dropped. It is dropped by name rather than by
# position so the step stays correct if the column order ever changes.
car1 <- car[, names(car) != "subTrim"]
# condition: categorical variable.
# It turns out there is no need to build dummy variables by hand for
# qualitative variables; lm() creates them (see conditionNew / conditionUsed
# in the coefficient table).
summary(lm(price ~ condition))

## 
## Call:
## lm(formula = price ~ condition)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -63495 -12703  -4615   7344 265364 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    62603.2      365.6  171.22   <2e-16 ***
## conditionNew   56256.8      424.5  132.54   <2e-16 ***
## conditionUsed -28967.0      405.6  -71.42   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21900 on 29463 degrees of freedom
## Multiple R-squared:  0.7617, Adjusted R-squared:  0.7616 
## F-statistic: 4.708e+04 on 2 and 29463 DF,  p-value: < 2.2e-16

# Plot price against condition.
plot(price ~ condition)

# Regress price on the single-owner flag alone.
summary(lm(price ~ isOneOwner))

## 
## Call:
## lm(formula = price ~ isOneOwner)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68991 -39595  -6310  39530 229410 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  69590.0      278.8  249.57   <2e-16 ***
## isOneOwnert -18489.0      745.2  -24.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44390 on 29464 degrees of freedom
## Multiple R-squared:  0.02047,    Adjusted R-squared:  0.02043 
## F-statistic: 615.6 on 1 and 29464 DF,  p-value: < 2.2e-16

plot(price ~ isOneOwner)

# I think the coefficient for isOneOwnert is negative because of the new cars
# (which are all recorded with isOneOwner == f). Normally it should be
# positive in the real world. So exclude the records that are both
# condition == New and isOneOwner == f, and keep everything else:
car2 <- car[!(condition == 'New' & isOneOwner == 'f'), ]
dim(car2)

## [1] 19156    17

# The coefficient is positive now, but R-squared is low.
summary(lm(car2$price ~ car2$isOneOwner))

## 
## Call:
## lm(formula = car2$price ~ car2$isOneOwner)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -47906 -19901  -2892  14198 263208 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       35791.5      197.0  181.64   <2e-16 ***
## car2$isOneOwnert  15309.5      424.6   36.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24160 on 19154 degrees of freedom
## Multiple R-squared:  0.06357,    Adjusted R-squared:  0.06352 
## F-statistic:  1300 on 1 and 19154 DF,  p-value: < 2.2e-16

# It doesn't make sense that a new car has mileage on it?!
# The plot makes sense now, although there are many points above the 3rd
# quartile. A common characteristic among them is that they are all high-end
# trims, such as the Mercedes 63 AMG and 550.
plot(car2$price ~ car2$isOneOwner)

This is too cumbersome, and there isn’t time to go through all the variables. Switching approach: try adding one variable at a time to the null model; if a variable doesn’t improve adjusted R-squared, ignore it, and there is no need to care about its missing values.

library(leaps)
# Subset selection over all predictors, limited to models of size three or
# less (nvmax = 3). TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned, which would silently change this call.
regfit <- regsubsets(price ~ ., data = car, nvmax = 3, really.big = TRUE)

## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 15 linear dependencies found

## Reordering variables and trying again:

# Running the subset search on everything would involve 129 variables
# (50 states!!).
# Instead, start with a linear regression on the most plausible predictors.
summary(lm(price ~ trim + condition + mileage + year))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69342  -6041  -1072   3762 272938 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.433e+06  9.136e+04 -103.246  < 2e-16 ***
## trim350       -3.830e+04  1.148e+03  -33.346  < 2e-16 ***
## trim400       -4.202e+04  1.323e+03  -31.760  < 2e-16 ***
## trim420        1.364e+03  1.261e+03    1.082 0.279449    
## trim430       -2.678e+04  9.226e+02  -29.028  < 2e-16 ***
## trim450       -4.131e+04  1.213e+04   -3.405 0.000662 ***
## trim500       -2.367e+04  9.170e+02  -25.807  < 2e-16 ***
## trim55 AMG    -2.560e+04  1.159e+03  -22.092  < 2e-16 ***
## trim550       -3.580e+04  9.900e+02  -36.164  < 2e-16 ***
## trim600       -1.524e+04  1.059e+03  -14.387  < 2e-16 ***
## trim63 AMG     1.227e+03  1.040e+03    1.180 0.237837    
## trim65 AMG     2.031e+04  1.196e+03   16.976  < 2e-16 ***
## trimunsp       1.816e+04  1.644e+03   11.045  < 2e-16 ***
## conditionNew   3.741e+04  2.612e+02  143.232  < 2e-16 ***
## conditionUsed -5.484e+03  2.523e+02  -21.738  < 2e-16 ***
## mileage       -1.351e-01  3.236e-03  -41.733  < 2e-16 ***
## year           4.738e+03  4.561e+01  103.897  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12090 on 29449 degrees of freedom
## Multiple R-squared:  0.9273, Adjusted R-squared:  0.9273 
## F-statistic: 2.349e+04 on 16 and 29449 DF,  p-value: < 2.2e-16

# Adjusted R-squared is 0.9273 -- pretty good!
# Refit without trim to see how much trim contributes.
summary(lm(price ~ condition + mileage + year))

## 
## Call:
## lm(formula = price ~ condition + mileage + year)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -57657  -8171  -3915   2095 266422 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6.785e+06  8.817e+04  -76.96   <2e-16 ***
## conditionNew   4.235e+04  3.214e+02  131.77   <2e-16 ***
## conditionUsed -5.938e+03  3.287e+02  -18.07   <2e-16 ***
## mileage       -1.646e-01  4.222e-03  -38.99   <2e-16 ***
## year           3.406e+03  4.379e+01   77.78   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15970 on 29461 degrees of freedom
## Multiple R-squared:  0.8732, Adjusted R-squared:  0.8732 
## F-statistic: 5.073e+04 on 4 and 29461 DF,  p-value: < 2.2e-16

# Adjusted R-squared drops to 0.8732 without trim, so we had better keep trim
# even though it is cumbersome (too many dummies!).
# Pass data = car so the model uses the corrected fuel column: `car` was
# edited after attach(), and the bare name `fuel` would otherwise resolve to
# the attached copy made before the correction.
# NOTE(review): the output recorded below appears to predate this change and
# may differ slightly when re-run.
summary(lm(price ~ trim + condition + mileage + year + fuel, data = car))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75103  -5823   -972   3749 272745 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.321e+06  9.083e+04 -102.621  < 2e-16 ***
## trim350       -3.943e+04  1.444e+03  -27.314  < 2e-16 ***
## trim400       -3.210e+04  1.197e+04   -2.681 0.007339 ** 
## trim420        1.447e+03  1.245e+03    1.162 0.245110    
## trim430       -2.638e+04  9.109e+02  -28.957  < 2e-16 ***
## trim450       -4.062e+04  1.197e+04   -3.393 0.000692 ***
## trim500       -2.330e+04  9.053e+02  -25.733  < 2e-16 ***
## trim55 AMG    -2.519e+04  1.144e+03  -22.020  < 2e-16 ***
## trim550       -3.503e+04  9.796e+02  -35.757  < 2e-16 ***
## trim600       -1.554e+04  1.046e+03  -14.851  < 2e-16 ***
## trim63 AMG     1.592e+03  1.028e+03    1.548 0.121728    
## trim65 AMG     2.024e+04  1.182e+03   17.126  < 2e-16 ***
## trimunsp       5.217e+03  1.687e+03    3.093 0.001983 ** 
## conditionNew   3.708e+04  2.594e+02  142.920  < 2e-16 ***
## conditionUsed -5.483e+03  2.490e+02  -22.019  < 2e-16 ***
## mileage       -1.362e-01  3.194e-03  -42.643  < 2e-16 ***
## year           4.683e+03  4.527e+01  103.458  < 2e-16 ***
## fuelGasoline  -2.553e+03  1.307e+03   -1.953 0.050847 .  
## fuelHybrid    -1.179e+04  1.204e+04   -0.980 0.327171    
## fuelunsp       1.742e+04  1.476e+03   11.800  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11930 on 29446 degrees of freedom
## Multiple R-squared:  0.9293, Adjusted R-squared:  0.9292 
## F-statistic: 2.036e+04 on 19 and 29446 DF,  p-value: < 2.2e-16

# Adjusted R-squared: 0.9292.
# Pass data = car so the model uses the corrected fuel column: `car` was
# edited after attach(), and the bare name `fuel` would otherwise resolve to
# the attached copy made before the correction.
# NOTE(review): the output recorded below appears to predate this change and
# may differ slightly when re-run.
summary(
  lm(
    price ~ trim + condition + mileage + year + fuel + featureCount,
    data = car
  )
)

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     featureCount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -75454  -5839   -959   3766 272804 
## 
## Coefficients:
##                 Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)   -9.367e+06  9.128e+04 -102.619  < 2e-16 ***
## trim350       -3.945e+04  1.443e+03  -27.334  < 2e-16 ***
## trim400       -3.246e+04  1.197e+04   -2.713 0.006676 ** 
## trim420        1.443e+03  1.244e+03    1.160 0.246042    
## trim430       -2.635e+04  9.105e+02  -28.939  < 2e-16 ***
## trim450       -4.060e+04  1.197e+04   -3.392 0.000694 ***
## trim500       -2.327e+04  9.050e+02  -25.718  < 2e-16 ***
## trim55 AMG    -2.514e+04  1.144e+03  -21.988  < 2e-16 ***
## trim550       -3.502e+04  9.792e+02  -35.761  < 2e-16 ***
## trim600       -1.550e+04  1.046e+03  -14.823  < 2e-16 ***
## trim63 AMG     1.592e+03  1.028e+03    1.548 0.121578    
## trim65 AMG     2.028e+04  1.182e+03   17.168  < 2e-16 ***
## trimunsp       5.225e+03  1.686e+03    3.099 0.001945 ** 
## conditionNew   3.691e+04  2.616e+02  141.093  < 2e-16 ***
## conditionUsed -5.566e+03  2.495e+02  -22.309  < 2e-16 ***
## mileage       -1.360e-01  3.193e-03  -42.590  < 2e-16 ***
## year           4.707e+03  4.550e+01  103.444  < 2e-16 ***
## fuelGasoline  -2.571e+03  1.307e+03   -1.967 0.049159 *  
## fuelHybrid    -1.143e+04  1.203e+04   -0.950 0.342302    
## fuelunsp       1.716e+04  1.477e+03   11.624  < 2e-16 ***
## featureCount  -1.203e+01  2.449e+00   -4.911 9.11e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11930 on 29445 degrees of freedom
## Multiple R-squared:  0.9293, Adjusted R-squared:  0.9293 
## F-statistic: 1.936e+04 on 20 and 29445 DF,  p-value: < 2.2e-16

# 0.9293 little improvement, consider removing featureCount
# Convert displacement (e.g. "4.6 L") into a numeric value.
# The conversion is vectorised and only parses entries whose first three
# characters look like a number; every other entry (e.g. "unsp") is set to NA
# explicitly, instead of going through as.numeric() one record at a time and
# raising a separate "NAs introduced by coercion" warning for each of them.
displacement_prefix <- substr(as.character(car$displacement), 1, 3)
is_numeric_prefix <- grepl("^[0-9]+(\\.[0-9]*)?$", displacement_prefix)
dis <- rep(NA_real_, length(displacement_prefix))
dis[is_numeric_prefix] <- as.numeric(displacement_prefix[is_numeric_prefix])

car$displacement <- dis
# Add displacement to the regression. Bare column names resolve to the
# attached copy of `car`, so the updated data frame has to be attached again.
# Detach the stale copy first: attaching a second copy on top of it would
# leave both on the search path, with the new one masking the old one.
if ("car" %in% search()) {
  detach(car)
}
attach(car)

# Same predictors as before, plus the converted displacement column.
summary(lm(price ~ trim + condition + mileage + year + fuel + displacement))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70179  -5707   -953   3729 272710 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.779e+06  1.022e+05 -85.857  < 2e-16 ***
## trim350       -3.539e+04  1.482e+03 -23.882  < 2e-16 ***
## trim400       -3.891e+04  1.848e+03 -21.055  < 2e-16 ***
## trim420        4.287e+03  1.254e+03   3.418 0.000632 ***
## trim430       -2.148e+04  1.017e+03 -21.119  < 2e-16 ***
## trim450       -3.279e+04  1.174e+04  -2.793 0.005229 ** 
## trim500       -1.644e+04  1.120e+03 -14.681  < 2e-16 ***
## trim55 AMG    -1.684e+04  1.394e+03 -12.078  < 2e-16 ***
## trim550       -2.528e+04  1.352e+03 -18.701  < 2e-16 ***
## trim600       -5.801e+03  1.437e+03  -4.036 5.45e-05 ***
## trim63 AMG     1.363e+04  1.571e+03   8.674  < 2e-16 ***
## trim65 AMG     3.165e+04  1.682e+03  18.818  < 2e-16 ***
## trimunsp       3.279e+04  2.391e+03  13.714  < 2e-16 ***
## conditionNew   3.705e+04  2.545e+02 145.574  < 2e-16 ***
## conditionUsed -5.083e+03  2.466e+02 -20.608  < 2e-16 ***
## mileage       -1.331e-01  3.148e-03 -42.287  < 2e-16 ***
## year           4.416e+03  5.081e+01  86.917  < 2e-16 ***
## fuelGasoline  -2.361e+03  1.311e+03  -1.801 0.071727 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.329e+04  1.592e+03  14.629  < 2e-16 ***
## displacement  -3.010e+03  2.991e+02 -10.062  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11680 on 29272 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9311, Adjusted R-squared:  0.9311 
## F-statistic: 2.083e+04 on 19 and 29272 DF,  p-value: < 2.2e-16

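# Adjusted R-squared was 0.9417 in an earlier run; with displacement treated as a
# continuous variable (the fit above) it is 0.9311. TODO: explain the difference.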
# Baseline predictors plus color, to see whether paint color adds anything.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + color
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + color)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69648  -5690   -968   3700 273575 
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -8.773e+06  1.021e+05 -85.895  < 2e-16 ***
## trim350        -3.488e+04  1.479e+03 -23.576  < 2e-16 ***
## trim400        -3.846e+04  1.845e+03 -20.844  < 2e-16 ***
## trim420         4.108e+03  1.251e+03   3.283 0.001029 ** 
## trim430        -2.108e+04  1.016e+03 -20.742  < 2e-16 ***
## trim450        -3.143e+04  1.171e+04  -2.684 0.007279 ** 
## trim500        -1.617e+04  1.118e+03 -14.454  < 2e-16 ***
## trim55 AMG     -1.636e+04  1.393e+03 -11.748  < 2e-16 ***
## trim550        -2.507e+04  1.350e+03 -18.574  < 2e-16 ***
## trim600        -5.568e+03  1.435e+03  -3.879 0.000105 ***
## trim63 AMG      1.365e+04  1.568e+03   8.704  < 2e-16 ***
## trim65 AMG      3.184e+04  1.679e+03  18.961  < 2e-16 ***
## trimunsp        3.289e+04  2.386e+03  13.787  < 2e-16 ***
## conditionNew    3.705e+04  2.553e+02 145.090  < 2e-16 ***
## conditionUsed  -5.068e+03  2.462e+02 -20.584  < 2e-16 ***
## mileage        -1.326e-01  3.142e-03 -42.214  < 2e-16 ***
## year            4.413e+03  5.076e+01  86.944  < 2e-16 ***
## fuelGasoline   -2.205e+03  1.308e+03  -1.685 0.091978 .  
## fuelHybrid             NA         NA      NA       NA    
## fuelunsp        2.343e+04  1.589e+03  14.748  < 2e-16 ***
## displacement   -2.919e+03  2.990e+02  -9.763  < 2e-16 ***
## colorBlack     -5.525e+02  8.131e+02  -0.680 0.496818    
## colorBlue      -1.302e+03  8.571e+02  -1.519 0.128758    
## colorBronze     4.719e+03  4.196e+03   1.125 0.260730    
## colorBrown      3.602e+02  1.657e+03   0.217 0.827889    
## colorGold       1.132e+03  1.094e+03   1.035 0.300674    
## colorGray      -1.660e+03  8.471e+02  -1.960 0.050059 .  
## colorGreen     -2.693e+01  1.201e+03  -0.022 0.982103    
## colorPurple     6.530e+03  4.197e+03   1.556 0.119767    
## colorRed       -2.398e+02  1.036e+03  -0.231 0.816935    
## colorSilver    -1.448e+03  8.184e+02  -1.770 0.076768 .  
## colorTurquoise -1.345e+03  5.272e+03  -0.255 0.798631    
## colorunsp       5.846e+02  8.627e+02   0.678 0.498014    
## colorWhite      1.194e+03  8.260e+02   1.446 0.148303    
## colorYellow    -6.023e+03  8.274e+03  -0.728 0.466634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11650 on 29258 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9316, Adjusted R-squared:  0.9315 
## F-statistic: 1.207e+04 on 33 and 29258 DF,  p-value: < 2.2e-16

# Adjusted R-squared 0.9419: not much improvement, but many more coefficients
# (more variance). Drop color.
# Baseline predictors plus state (one dummy per state level).
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + state
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + state)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69928  -5715  -1011   3653 273216 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.842e+06  1.029e+05 -85.967  < 2e-16 ***
## trim350       -3.548e+04  1.481e+03 -23.952  < 2e-16 ***
## trim400       -3.878e+04  1.847e+03 -20.990  < 2e-16 ***
## trim420        4.285e+03  1.253e+03   3.419  0.00063 ***
## trim430       -2.156e+04  1.017e+03 -21.198  < 2e-16 ***
## trim450       -3.697e+04  1.352e+04  -2.734  0.00625 ** 
## trim500       -1.659e+04  1.119e+03 -14.823  < 2e-16 ***
## trim55 AMG    -1.705e+04  1.393e+03 -12.237  < 2e-16 ***
## trim550       -2.542e+04  1.351e+03 -18.813  < 2e-16 ***
## trim600       -6.021e+03  1.437e+03  -4.191 2.79e-05 ***
## trim63 AMG     1.337e+04  1.570e+03   8.515  < 2e-16 ***
## trim65 AMG     3.138e+04  1.681e+03  18.666  < 2e-16 ***
## trimunsp       3.251e+04  2.390e+03  13.604  < 2e-16 ***
## conditionNew   3.707e+04  2.564e+02 144.552  < 2e-16 ***
## conditionUsed -5.060e+03  2.493e+02 -20.296  < 2e-16 ***
## mileage       -1.326e-01  3.163e-03 -41.937  < 2e-16 ***
## year           4.445e+03  5.095e+01  87.237  < 2e-16 ***
## fuelGasoline  -2.198e+03  1.311e+03  -1.677  0.09361 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.357e+04  1.592e+03  14.810  < 2e-16 ***
## displacement  -2.974e+03  2.990e+02  -9.945  < 2e-16 ***
## stateAL        5.530e+03  8.269e+03   0.669  0.50363    
## stateAR        6.379e+03  8.367e+03   0.762  0.44583    
## stateAZ        6.926e+03  8.266e+03   0.838  0.40207    
## stateCA        6.004e+03  8.246e+03   0.728  0.46658    
## stateCO        6.940e+03  8.264e+03   0.840  0.40107    
## stateCT        5.523e+03  8.268e+03   0.668  0.50411    
## stateDC       -7.360e+03  1.064e+04  -0.692  0.48918    
## stateDE        7.870e+03  8.330e+03   0.945  0.34475    
## stateFL        5.600e+03  8.247e+03   0.679  0.49708    
## stateGA        4.952e+03  8.251e+03   0.600  0.54838    
## stateHI        5.424e+03  8.323e+03   0.652  0.51462    
## stateIA        7.493e+03  8.459e+03   0.886  0.37572    
## stateID        1.007e+04  8.595e+03   1.171  0.24156    
## stateIL        6.038e+03  8.251e+03   0.732  0.46433    
## stateIN        5.113e+03  8.287e+03   0.617  0.53729    
## stateKS        6.633e+03  8.342e+03   0.795  0.42655    
## stateKY        9.236e+03  8.286e+03   1.115  0.26499    
## stateLA        8.012e+03  8.299e+03   0.965  0.33434    
## stateMA        6.081e+03  8.255e+03   0.737  0.46135    
## stateMD        6.591e+03  8.257e+03   0.798  0.42472    
## stateME        4.439e+03  8.581e+03   0.517  0.60492    
## stateMI        5.829e+03  8.282e+03   0.704  0.48155    
## stateMN        6.750e+03  8.280e+03   0.815  0.41492    
## stateMO        7.337e+03  8.267e+03   0.887  0.37483    
## stateMS        7.796e+03  8.308e+03   0.938  0.34808    
## stateMT        1.249e+04  8.963e+03   1.394  0.16337    
## stateNC        6.772e+03  8.254e+03   0.820  0.41200    
## stateND        8.337e+03  9.755e+03   0.855  0.39276    
## stateNE        7.869e+03  8.646e+03   0.910  0.36275    
## stateNH        7.445e+03  8.301e+03   0.897  0.36984    
## stateNJ        5.487e+03  8.249e+03   0.665  0.50593    
## stateNM        6.309e+03  8.484e+03   0.744  0.45706    
## stateNV        8.711e+03  8.267e+03   1.054  0.29200    
## stateNY        4.631e+03  8.248e+03   0.561  0.57450    
## stateOH        5.353e+03  8.258e+03   0.648  0.51683    
## stateOK        5.665e+03  8.289e+03   0.683  0.49433    
## stateON        1.000e+04  1.065e+04   0.939  0.34750    
## stateOR        6.987e+03  8.297e+03   0.842  0.39972    
## statePA        5.977e+03  8.255e+03   0.724  0.46907    
## stateRI        4.949e+03  8.399e+03   0.589  0.55571    
## stateSC        6.711e+03  8.277e+03   0.811  0.41752    
## stateSD        2.540e+04  1.166e+04   2.179  0.02937 *  
## stateTN        4.966e+03  8.264e+03   0.601  0.54787    
## stateTX        6.291e+03  8.248e+03   0.763  0.44563    
## stateunsp      8.728e+03  1.428e+04   0.611  0.54099    
## stateUT        8.867e+03  8.308e+03   1.067  0.28589    
## stateVA        6.043e+03  8.253e+03   0.732  0.46407    
## stateWA        7.459e+03  8.270e+03   0.902  0.36708    
## stateWI        7.044e+03  8.299e+03   0.849  0.39600    
## stateWV        6.446e+03  8.454e+03   0.763  0.44575    
## stateWY        2.201e+03  1.166e+04   0.189  0.85027    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11660 on 29221 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9315, Adjusted R-squared:  0.9314 
## F-statistic:  5679 on 70 and 29221 DF,  p-value: < 2.2e-16

# Adjusted R-squared 0.942: not much improvement, but many more coefficients
# (more variance). Drop state.
# Baseline predictors plus region, a coarser geographic grouping than state.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + region
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + region)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70288  -5705   -991   3701 273525 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.805e+06  1.023e+05 -86.079  < 2e-16 ***
## trim350       -3.549e+04  1.481e+03 -23.968  < 2e-16 ***
## trim400       -3.905e+04  1.847e+03 -21.140  < 2e-16 ***
## trim420        4.261e+03  1.253e+03   3.400 0.000674 ***
## trim430       -2.155e+04  1.016e+03 -21.206  < 2e-16 ***
## trim450       -3.652e+04  1.311e+04  -2.787 0.005323 ** 
## trim500       -1.657e+04  1.119e+03 -14.806  < 2e-16 ***
## trim55 AMG    -1.701e+04  1.393e+03 -12.208  < 2e-16 ***
## trim550       -2.535e+04  1.351e+03 -18.769  < 2e-16 ***
## trim600       -5.952e+03  1.436e+03  -4.144 3.42e-05 ***
## trim63 AMG     1.344e+04  1.570e+03   8.564  < 2e-16 ***
## trim65 AMG     3.144e+04  1.681e+03  18.705  < 2e-16 ***
## trimunsp       3.256e+04  2.389e+03  13.629  < 2e-16 ***
## conditionNew   3.710e+04  2.555e+02 145.173  < 2e-16 ***
## conditionUsed -5.119e+03  2.476e+02 -20.675  < 2e-16 ***
## mileage       -1.325e-01  3.147e-03 -42.104  < 2e-16 ***
## year           4.429e+03  5.083e+01  87.133  < 2e-16 ***
## fuelGasoline  -2.447e+03  1.311e+03  -1.867 0.061916 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.331e+04  1.592e+03  14.647  < 2e-16 ***
## displacement  -2.985e+03  2.989e+02  -9.985  < 2e-16 ***
## regionESC      4.331e+02  4.247e+02   1.020 0.307850    
## regionMid     -6.712e+02  2.829e+02  -2.372 0.017687 *  
## regionMtn      1.838e+03  3.936e+02   4.671 3.01e-06 ***
## regionNew      1.871e+02  3.895e+02   0.480 0.630910    
## regionPac      2.691e+02  2.815e+02   0.956 0.339184    
## regionSoA     -1.718e+01  2.691e+02  -0.064 0.949087    
## regionunsp     3.798e+03  5.842e+03   0.650 0.515666    
## regionWNC      1.317e+03  4.884e+02   2.696 0.007019 ** 
## regionWSC      5.204e+02  3.213e+02   1.620 0.105318    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11670 on 29263 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9313, Adjusted R-squared:  0.9312 
## F-statistic: 1.417e+04 on 28 and 29263 DF,  p-value: < 2.2e-16

# 0.9419 drop.
# Baseline predictors plus soundSystem.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + soundSystem
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + soundSystem)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69973  -5778   -913   3836 271711 
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -8.746e+06  1.024e+05 -85.443  < 2e-16 ***
## trim350                    -3.507e+04  1.477e+03 -23.737  < 2e-16 ***
## trim400                    -3.869e+04  1.842e+03 -21.004  < 2e-16 ***
## trim420                     4.345e+03  1.250e+03   3.477 0.000508 ***
## trim430                    -2.117e+04  1.015e+03 -20.856  < 2e-16 ***
## trim450                    -3.045e+04  1.170e+04  -2.603 0.009250 ** 
## trim500                    -1.617e+04  1.117e+03 -14.480  < 2e-16 ***
## trim55 AMG                 -1.632e+04  1.392e+03 -11.723  < 2e-16 ***
## trim550                    -2.470e+04  1.348e+03 -18.322  < 2e-16 ***
## trim600                    -5.643e+03  1.432e+03  -3.941 8.14e-05 ***
## trim63 AMG                  1.402e+04  1.566e+03   8.955  < 2e-16 ***
## trim65 AMG                  3.158e+04  1.677e+03  18.836  < 2e-16 ***
## trimunsp                    3.305e+04  2.383e+03  13.870  < 2e-16 ***
## conditionNew                3.656e+04  2.601e+02 140.544  < 2e-16 ***
## conditionUsed              -4.928e+03  2.462e+02 -20.020  < 2e-16 ***
## mileage                    -1.344e-01  3.140e-03 -42.794  < 2e-16 ***
## year                        4.403e+03  5.076e+01  86.748  < 2e-16 ***
## fuelGasoline               -2.889e+03  1.310e+03  -2.205 0.027466 *  
## fuelHybrid                         NA         NA      NA       NA    
## fuelunsp                    2.276e+04  1.589e+03  14.319  < 2e-16 ***
## displacement               -2.968e+03  2.981e+02  -9.955  < 2e-16 ***
## soundSystemBang Olufsen    -6.314e+02  8.278e+03  -0.076 0.939195    
## soundSystemBose            -8.569e+03  8.238e+03  -1.040 0.298216    
## soundSystemBoston Acoustic -1.050e+04  1.425e+04  -0.737 0.461419    
## soundSystemHarman Kardon   -8.522e+03  8.231e+03  -1.035 0.300535    
## soundSystemPremium         -6.496e+03  8.230e+03  -0.789 0.429954    
## soundSystemunsp            -5.826e+03  8.229e+03  -0.708 0.479012    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11630 on 29266 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9317, Adjusted R-squared:  0.9316 
## F-statistic: 1.596e+04 on 25 and 29266 DF,  p-value: < 2.2e-16

# 0.942 drop
# Baseline predictors plus wheelType.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + wheelType
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + wheelType)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69892  -5699   -926   3764 272938 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -8.767e+06  1.021e+05 -85.872  < 2e-16 ***
## trim350          -3.540e+04  1.479e+03 -23.930  < 2e-16 ***
## trim400          -3.894e+04  1.845e+03 -21.112  < 2e-16 ***
## trim420           4.216e+03  1.252e+03   3.368 0.000759 ***
## trim430          -2.152e+04  1.015e+03 -21.196  < 2e-16 ***
## trim450          -3.260e+04  1.172e+04  -2.782 0.005411 ** 
## trim500          -1.651e+04  1.118e+03 -14.774  < 2e-16 ***
## trim55 AMG       -1.691e+04  1.392e+03 -12.150  < 2e-16 ***
## trim550          -2.538e+04  1.350e+03 -18.808  < 2e-16 ***
## trim600          -5.956e+03  1.435e+03  -4.151 3.32e-05 ***
## trim63 AMG        1.351e+04  1.568e+03   8.613  < 2e-16 ***
## trim65 AMG        3.153e+04  1.679e+03  18.776  < 2e-16 ***
## trimunsp          3.276e+04  2.387e+03  13.726  < 2e-16 ***
## conditionNew      3.699e+04  2.563e+02 144.336  < 2e-16 ***
## conditionUsed    -5.137e+03  2.465e+02 -20.843  < 2e-16 ***
## mileage          -1.328e-01  3.142e-03 -42.262  < 2e-16 ***
## year              4.410e+03  5.074e+01  86.928  < 2e-16 ***
## fuelGasoline     -2.466e+03  1.309e+03  -1.884 0.059545 .  
## fuelHybrid               NA         NA      NA       NA    
## fuelunsp          2.323e+04  1.589e+03  14.620  < 2e-16 ***
## displacement     -2.941e+03  2.986e+02  -9.846  < 2e-16 ***
## wheelTypeChrome   7.816e+00  1.309e+03   0.006 0.995237    
## wheelTypePremium -9.616e+02  5.756e+02  -1.670 0.094844 .  
## wheelTypeSteel    1.682e+04  1.670e+03  10.073  < 2e-16 ***
## wheelTypeunsp     5.559e+02  1.400e+02   3.971 7.18e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11660 on 29268 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9314, Adjusted R-squared:  0.9314 
## F-statistic: 1.728e+04 on 23 and 29268 DF,  p-value: < 2.2e-16

# 0.942 drop
# Baseline predictors plus wheelSize (enters as a factor, including an "unsp"
# level, per the coefficient table printed below).
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + wheelSize
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + wheelSize)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -68074  -5741   -888   3718 272682 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.791e+06  1.023e+05 -85.942  < 2e-16 ***
## trim350       -3.451e+04  1.474e+03 -23.411  < 2e-16 ***
## trim400       -3.797e+04  1.837e+03 -20.668  < 2e-16 ***
## trim420        4.575e+03  1.246e+03   3.670 0.000243 ***
## trim430       -2.103e+04  1.011e+03 -20.798  < 2e-16 ***
## trim450       -3.242e+04  1.166e+04  -2.780 0.005445 ** 
## trim500       -1.590e+04  1.113e+03 -14.284  < 2e-16 ***
## trim55 AMG    -1.627e+04  1.387e+03 -11.735  < 2e-16 ***
## trim550       -2.481e+04  1.345e+03 -18.440  < 2e-16 ***
## trim600       -5.300e+03  1.429e+03  -3.708 0.000209 ***
## trim63 AMG     1.292e+04  1.563e+03   8.264  < 2e-16 ***
## trim65 AMG     3.118e+04  1.672e+03  18.645  < 2e-16 ***
## trimunsp       3.178e+04  2.377e+03  13.371  < 2e-16 ***
## conditionNew   3.688e+04  2.557e+02 144.229  < 2e-16 ***
## conditionUsed -5.006e+03  2.452e+02 -20.420  < 2e-16 ***
## mileage       -1.329e-01  3.128e-03 -42.498  < 2e-16 ***
## year           4.425e+03  5.087e+01  86.988  < 2e-16 ***
## fuelGasoline  -1.766e+03  1.306e+03  -1.352 0.176240    
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.339e+04  1.584e+03  14.765  < 2e-16 ***
## displacement  -3.071e+03  2.976e+02 -10.318  < 2e-16 ***
## wheelSize17   -1.116e+04  1.484e+03  -7.525 5.42e-14 ***
## wheelSize18   -6.631e+03  1.178e+03  -5.627 1.85e-08 ***
## wheelSize19   -5.607e+03  1.193e+03  -4.700 2.62e-06 ***
## wheelSize20    2.355e+03  1.221e+03   1.929 0.053687 .  
## wheelSize21   -2.470e+03  8.281e+03  -0.298 0.765538    
## wheelSize22    3.059e+00  2.377e+03   0.001 0.998973    
## wheelSizeunsp -5.050e+03  1.141e+03  -4.426 9.62e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11600 on 29265 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9321, Adjusted R-squared:  0.932 
## F-statistic: 1.545e+04 on 26 and 29265 DF,  p-value: < 2.2e-16

# 0.9421 drop
# Baseline predictors plus featureCount (a single numeric term).
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel +
    displacement + featureCount
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement + featureCount)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70088  -5740   -924   3715 272781 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.839e+06  1.027e+05 -86.079  < 2e-16 ***
## trim350       -3.545e+04  1.481e+03 -23.932  < 2e-16 ***
## trim400       -3.896e+04  1.847e+03 -21.092  < 2e-16 ***
## trim420        4.259e+03  1.254e+03   3.397 0.000681 ***
## trim430       -2.149e+04  1.016e+03 -21.141  < 2e-16 ***
## trim450       -3.283e+04  1.173e+04  -2.798 0.005149 ** 
## trim500       -1.647e+04  1.119e+03 -14.717  < 2e-16 ***
## trim55 AMG    -1.686e+04  1.393e+03 -12.097  < 2e-16 ***
## trim550       -2.535e+04  1.351e+03 -18.763  < 2e-16 ***
## trim600       -5.843e+03  1.436e+03  -4.068 4.76e-05 ***
## trim63 AMG     1.352e+04  1.570e+03   8.612  < 2e-16 ***
## trim65 AMG     3.160e+04  1.681e+03  18.798  < 2e-16 ***
## trimunsp       3.295e+04  2.390e+03  13.786  < 2e-16 ***
## conditionNew   3.684e+04  2.566e+02 143.590  < 2e-16 ***
## conditionUsed -5.185e+03  2.471e+02 -20.984  < 2e-16 ***
## mileage       -1.329e-01  3.147e-03 -42.238  < 2e-16 ***
## year           4.447e+03  5.103e+01  87.131  < 2e-16 ***
## fuelGasoline  -2.389e+03  1.310e+03  -1.823 0.068300 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.315e+04  1.592e+03  14.546  < 2e-16 ***
## displacement  -2.985e+03  2.990e+02  -9.982  < 2e-16 ***
## featureCount  -1.436e+01  2.409e+00  -5.962 2.52e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11670 on 29271 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9312, Adjusted R-squared:  0.9312 
## F-statistic: 1.982e+04 on 20 and 29271 DF,  p-value: < 2.2e-16

# 0.9417 drop
# so best bet is: adjusted R 0.9417:
# Refit and print the six-predictor baseline chosen after the comparisons above.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel + displacement
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70179  -5707   -953   3729 272710 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.779e+06  1.022e+05 -85.857  < 2e-16 ***
## trim350       -3.539e+04  1.482e+03 -23.882  < 2e-16 ***
## trim400       -3.891e+04  1.848e+03 -21.055  < 2e-16 ***
## trim420        4.287e+03  1.254e+03   3.418 0.000632 ***
## trim430       -2.148e+04  1.017e+03 -21.119  < 2e-16 ***
## trim450       -3.279e+04  1.174e+04  -2.793 0.005229 ** 
## trim500       -1.644e+04  1.120e+03 -14.681  < 2e-16 ***
## trim55 AMG    -1.684e+04  1.394e+03 -12.078  < 2e-16 ***
## trim550       -2.528e+04  1.352e+03 -18.701  < 2e-16 ***
## trim600       -5.801e+03  1.437e+03  -4.036 5.45e-05 ***
## trim63 AMG     1.363e+04  1.571e+03   8.674  < 2e-16 ***
## trim65 AMG     3.165e+04  1.682e+03  18.818  < 2e-16 ***
## trimunsp       3.279e+04  2.391e+03  13.714  < 2e-16 ***
## conditionNew   3.705e+04  2.545e+02 145.574  < 2e-16 ***
## conditionUsed -5.083e+03  2.466e+02 -20.608  < 2e-16 ***
## mileage       -1.331e-01  3.148e-03 -42.287  < 2e-16 ***
## year           4.416e+03  5.081e+01  86.917  < 2e-16 ***
## fuelGasoline  -2.361e+03  1.311e+03  -1.801 0.071727 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.329e+04  1.592e+03  14.629  < 2e-16 ***
## displacement  -3.010e+03  2.991e+02 -10.062  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11680 on 29272 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9311, Adjusted R-squared:  0.9311 
## F-statistic: 2.083e+04 on 19 and 29272 DF,  p-value: < 2.2e-16

# Placeholder for a quadratic model.
# NOTE(review): the formula below is still the plain linear baseline -- it has
# no squared term (e.g. I(mileage^2)), so this repeats the previous fit.
summary(lm(
  formula = price ~ trim + condition + mileage + year + fuel + displacement
))

## 
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel + 
##     displacement)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -70179  -5707   -953   3729 272710 
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.779e+06  1.022e+05 -85.857  < 2e-16 ***
## trim350       -3.539e+04  1.482e+03 -23.882  < 2e-16 ***
## trim400       -3.891e+04  1.848e+03 -21.055  < 2e-16 ***
## trim420        4.287e+03  1.254e+03   3.418 0.000632 ***
## trim430       -2.148e+04  1.017e+03 -21.119  < 2e-16 ***
## trim450       -3.279e+04  1.174e+04  -2.793 0.005229 ** 
## trim500       -1.644e+04  1.120e+03 -14.681  < 2e-16 ***
## trim55 AMG    -1.684e+04  1.394e+03 -12.078  < 2e-16 ***
## trim550       -2.528e+04  1.352e+03 -18.701  < 2e-16 ***
## trim600       -5.801e+03  1.437e+03  -4.036 5.45e-05 ***
## trim63 AMG     1.363e+04  1.571e+03   8.674  < 2e-16 ***
## trim65 AMG     3.165e+04  1.682e+03  18.818  < 2e-16 ***
## trimunsp       3.279e+04  2.391e+03  13.714  < 2e-16 ***
## conditionNew   3.705e+04  2.545e+02 145.574  < 2e-16 ***
## conditionUsed -5.083e+03  2.466e+02 -20.608  < 2e-16 ***
## mileage       -1.331e-01  3.148e-03 -42.287  < 2e-16 ***
## year           4.416e+03  5.081e+01  86.917  < 2e-16 ***
## fuelGasoline  -2.361e+03  1.311e+03  -1.801 0.071727 .  
## fuelHybrid            NA         NA      NA       NA    
## fuelunsp       2.329e+04  1.592e+03  14.629  < 2e-16 ***
## displacement  -3.010e+03  2.991e+02 -10.062  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11680 on 29272 degrees of freedom
##   (174 observations deleted due to missingness)
## Multiple R-squared:  0.9311, Adjusted R-squared:  0.9311 
## F-statistic: 2.083e+04 on 19 and 29272 DF,  p-value: < 2.2e-16

# Hold-out estimate of test MSE for the baseline model.
# Only a single 90/10 split is evaluated here (one fold of a would-be 10-fold
# CV), not a full k-fold cross-validation.
set.seed(17)  # fixed seed so the train/test split is reproducible
n_obs = nrow(car)  # take the sample size from the data instead of hard-coding it
train = sample(n_obs, floor(n_obs * 0.9))
fit = lm(price ~ trim+condition+mileage+year+fuel+displacement, data = car, subset = train)
# predict() returns NA for rows with a missing predictor, and a single NA makes
# mean() return NA, so those rows are excluded from the average via na.rm = TRUE.
# car$price (rather than the attached copy of price) keeps the response aligned
# with the data frame the model and the predictions are built from.
# NOTE(review): predict() may still warn about a rank-deficient fit, since one
# coefficient (fuelHybrid) is not estimable in the fits above.
mean((car$price - predict(fit, car))[-train]^2, na.rm = TRUE)

## Warning in predict.lm(fit, car): prediction from a rank-deficient fit may
## be misleading

## [1] NA

car

Vicky

July 17, 2015

big steps: