# Load the used-car listings into `car`.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0, where the
# read.csv default changed to FALSE; the summaries and "Levels:" output
# recorded below show that the analysis was run with factor columns.
car <- read.csv(
  "/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# attach() exposes a snapshot of the columns as bare names. Later changes to
# `car` are NOT reflected in that snapshot until `car` is attached again.
attach(car)
# how many variables and records do we have?
dim(car)
## [1] 29466 17
# do we have duplicates?
# Compare rows on every column except X: X looks like a per-row serial number
# (see the column notes further down), so including it would make every row
# unique and the check could never find a duplicate listing.
# NOTE(review): the recorded output below was produced by the original
# whole-row check (X included); re-verify the conclusion after re-running.
listing_cols <- setdiff(names(car), "X")
car_noduplicate <- car[!duplicated(car[, listing_cols, drop = FALSE]), ]
dim(car_noduplicate)
## [1] 29466 17
# no we don't have duplicates.
# Print a per-column overview (quantiles for numeric columns, level counts for
# factor columns).
summary(object = car)
## X trim subTrim condition isOneOwner
## Min. : 2 550 :21836 Hybrid: 190 CPO : 3586 f:25340
## 1st Qu.:13231 430 : 2071 unsp :29276 New :10317 t: 4126
## Median :26254 500 : 2002 Used:15563
## Mean :26269 63 AMG : 1413
## 3rd Qu.:39293 600 : 527
## Max. :52572 350 : 416
## (Other): 1201
## mileage year color displacement
## Min. : 1 Min. :1988 Black :12838 4.6 L :13599
## 1st Qu.: 14 1st Qu.:2007 Silver : 6095 5.5 L : 9154
## Median : 26120 Median :2012 White : 4418 4.3 L : 2071
## Mean : 40387 Mean :2010 Gray : 2007 5.0 L : 2002
## 3rd Qu.: 68234 3rd Qu.:2015 Blue : 1599 6.0 L : 403
## Max. :488525 Max. :2015 unsp : 1467 6.3 L : 391
## (Other): 1042 (Other): 1846
## fuel state region soundSystem
## Diesel : 312 CA : 5262 SoA :7805 Alpine : 2
## Gasoline:28628 FL : 3559 Pac :5844 Bang Olufsen : 177
## Hybrid : 189 NY : 2754 Mid :5824 Bose : 943
## unsp : 337 TX : 2458 WSC :2865 Boston Acoustic: 1
## NJ : 2266 ENC :2496 Harman Kardon : 4120
## GA : 1408 New :1421 Premium : 9694
## (Other):11759 (Other):3211 unsp :14529
## wheelType wheelSize featureCount price
## Alloy :14565 unsp :25293 Min. : 0.00 Min. : 599
## Chrome : 80 18 : 1774 1st Qu.: 18.00 1st Qu.: 28995
## Premium: 424 19 : 1297 Median : 53.00 Median : 56991
## Steel : 49 20 : 813 Mean : 46.48 Mean : 67001
## unsp :14348 17 : 149 3rd Qu.: 70.00 3rd Qu.:108815
## 16 : 107 Max. :132.00 Max. :299000
## (Other): 33
# List the column names of the data frame.
colnames(car)
## [1] "X" "trim" "subTrim" "condition"
## [5] "isOneOwner" "mileage" "year" "color"
## [9] "displacement" "fuel" "state" "region"
## [13] "soundSystem" "wheelType" "wheelSize" "featureCount"
## [17] "price"
# Is mileage 488525 really an outlier? I looked into that record and also cars with similar mileage; some of them actually got similarly high prices, so it might be a real phenomenon rather than an outlier.
# Also, according to http://www.theanalysisfactor.com/outliers-to-drop-or-not-to-drop/, you should only drop an outlier when it's obviously invalid or forges relationship. In this case, I didn't see evidence of either 'invalid' or 'forge relationship'.
# Show the record(s) with the largest mileage. The maximum is derived from the
# data (488525 in the summary above) rather than typed in by hand, so this
# still selects the right row if the input file changes.
max_mileage <- max(car$mileage, na.rm = TRUE)
car[car$mileage == max_mileage, ]
## X trim subTrim condition isOneOwner mileage year color
## 9011 16086 550 unsp Used f 488525 2012 White
## displacement fuel state region soundSystem wheelType wheelSize
## 9011 4.6 L Gasoline NJ Mid unsp Alloy unsp
## featureCount price
## 9011 54 46995
# List every record whose mileage exceeds 400,000 for comparison with the
# maximum-mileage car.
over_400k <- car$mileage > 400000
car[over_400k, ]
## X trim subTrim condition isOneOwner mileage year color
## 9011 16086 550 unsp Used f 488525 2012 White
## 12567 22462 500 unsp Used f 407725 2000 Gold
## 13073 23304 550 unsp CPO t 411103 2012 Black
## 26285 46886 500 unsp Used f 467834 2006 Black
## displacement fuel state region soundSystem wheelType wheelSize
## 9011 4.6 L Gasoline NJ Mid unsp Alloy unsp
## 12567 5.0 L Gasoline CA Pac unsp Alloy unsp
## 13073 4.6 L Gasoline FL SoA Premium Alloy unsp
## 26285 5.0 L Gasoline TX WSC unsp unsp unsp
## featureCount price
## 9011 54 46995
## 12567 20 8995
## 13073 64 59892
## 26285 54 21995
# To test whether the outlier really distorts the relationship, compare the
# regression fitted on the data without the outlier against the fit on the
# full data.
# The outlier is the maximum-mileage record (488525 in the summary above).
carNoOutlier <- car[car$mileage != max(car$mileage, na.rm = TRUE), ]
# Use bare column names with data = ... instead of carNoOutlier$col inside the
# formula: the coefficients get clean names and predict(newdata = ...) works.
fitNoOutlier <- lm(price ~ mileage, data = carNoOutlier)
summary(fitNoOutlier)
##
## Call:
## lm(formula = carNoOutlier$price ~ carNoOutlier$mileage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90893 -19012 -1053 13614 294207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.904e+04 2.020e+02 490.2 <2e-16 ***
## carNoOutlier$mileage -7.936e-01 3.301e-03 -240.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26060 on 29463 degrees of freedom
## Multiple R-squared: 0.6623, Adjusted R-squared: 0.6623
## F-statistic: 5.779e+04 on 1 and 29463 DF, p-value: < 2.2e-16
# Same simple regression on the full data (outlier included), for comparison
# with fitNoOutlier. data = car replaces car$col inside the formula so the
# coefficient is named `mileage` and predict(newdata = ...) works.
fit <- lm(price ~ mileage, data = car)
summary(fit)
##
## Call:
## lm(formula = car$price ~ car$mileage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -90808 -19024 -1130 13624 334537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.895e+04 2.025e+02 488.7 <2e-16 ***
## car$mileage -7.911e-01 3.305e-03 -239.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26130 on 29464 degrees of freedom
## Multiple R-squared: 0.6604, Adjusted R-squared: 0.6604
## F-statistic: 5.731e+04 on 1 and 29464 DF, p-value: < 2.2e-16
# turns out that one outlier does change the estimates a little bit but not much. R square is actually 0.0019 higher without the outlier (0.6623 vs. 0.6604), a negligible difference. So I think we can keep it.
Proceed to deal with missing data in 2 steps.
step 1: see if missingness is totally random and not dependent on other variables. If yes, proceed to step 2; if not, infer the missing value from the variable it’s dependent on.
step 2: test whether missingness (a dummy variable) is a function of y. If it’s not, do listwise deletion; if it is, fill in the missing value with some sort of function (mean? mode? further discussion).
Let’s go through each variable.
column 1: serial number.
trim: the ‘unsp’ trim records tend to have the following attributes: they are all new, ‘isOneOwner’ is f, mileage is almost 0, year is 2015 (new car), fuel tends to be unsp or gasoline, and soundSystem, wheelType and wheelSize all tend to be unsp. featureCount is either above the 3rd quartile or around 0. Price tends to be above the 3rd quartile. So my guess is, these are two types of new, high-end cars. It’s not missing data, and ‘unsp’ has its own meaning here. ‘unsp’ trim would tend to increase price. So we should treat ‘unsp’ as a level of the categorical variable with its own meaning.
subtrim: subtrim is basically the sub-category of trim. In the car dataset, subtrim usually refers to whether the car is a hybrid or not. So if a car is ‘hybrid’ in the subtrim variable, it must be ‘hybrid’ in the fuel variable too. And that is really the case: the subtrim variable has 190 ‘hybrid’ records and the fuel variable has 189 ‘hybrid’ records. The 1 difference might be from incorrect data input. So I would say subtrim is a proxy variable for fuel. We can just ignore subtrim and only look at fuel.
# Count the records flagged Hybrid by subTrim and by fuel. Columns are read
# from `car` directly instead of the attach()ed copies, so the check always
# reflects the current contents of the data frame.
dim(car[car$subTrim == "Hybrid", ])
## [1] 190 17
dim(car[car$fuel == "Hybrid", ])
## [1] 189 17
# which is this one different record?
car[car$subTrim == "Hybrid" & car$fuel != "Hybrid", ]
## X trim subTrim condition isOneOwner mileage year color
## 4225 7598 400 Hybrid CPO t 39973 2010 White
## displacement fuel state region soundSystem wheelType wheelSize
## 4225 3.5 L Gasoline CA Pac Premium unsp unsp
## featureCount price
## 4225 23 52971
# this record must be wrong data because if fuel is pure gasoline, subtrim must not be hybrid. Its trim is 400 and all other 400 cars are hybrid in both variables, so the fuel variable must be wrong; it must be Hybrid. Correct it:
# Locate the record by its defining condition rather than the hard-coded row
# number 4225, so the fix still hits the right row if the data is reordered or
# filtered. which() drops NA comparisons, which are not allowed in assignment.
fuel_mismatch <- which(car$subTrim == "Hybrid" & car$fuel != "Hybrid")
car[fuel_mismatch, "fuel"]
## [1] Gasoline
## Levels: Diesel Gasoline Hybrid unsp
car[fuel_mismatch, "fuel"] <- "Hybrid"
car[fuel_mismatch, "fuel"]
## [1] Hybrid
## Levels: Diesel Gasoline Hybrid unsp
# NOTE: the bare name `fuel` made available by attach(car) is a snapshot taken
# before this correction; only car$fuel (or data = car) sees the fixed value.
## [1] Hybrid
## Levels: Diesel Gasoline Hybrid unsp
# now fuel truthfully represents subtrim and we can safely delete the subtrim variable
# Drop the column by name rather than by position (-3), so the right column is
# removed even if the column order of the input file changes.
car1 <- car[, setdiff(names(car), "subTrim"), drop = FALSE]
# condition: categorical variable
# It turns out you don't need to create dummy variables for qualitative
# predictors; R does this for you.
# data = car makes the model read its columns from the data frame itself
# instead of the attach()ed copies.
summary(lm(price ~ condition, data = car))
##
## Call:
## lm(formula = price ~ condition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63495 -12703 -4615 7344 265364
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62603.2 365.6 171.22 <2e-16 ***
## conditionNew 56256.8 424.5 132.54 <2e-16 ***
## conditionUsed -28967.0 405.6 -71.42 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21900 on 29463 degrees of freedom
## Multiple R-squared: 0.7617, Adjusted R-squared: 0.7616
## F-statistic: 4.708e+04 on 2 and 29463 DF, p-value: < 2.2e-16
# Price distribution by condition, then a one-predictor model on isOneOwner.
# Both calls take their columns from `car` explicitly rather than from the
# attach()ed copies.
plot(price ~ condition, data = car)
summary(lm(price ~ isOneOwner, data = car))
##
## Call:
## lm(formula = price ~ isOneOwner)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68991 -39595 -6310 39530 229410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69590.0 278.8 249.57 <2e-16 ***
## isOneOwnert -18489.0 745.2 -24.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44390 on 29464 degrees of freedom
## Multiple R-squared: 0.02047, Adjusted R-squared: 0.02043
## F-statistic: 615.6 on 1 and 29464 DF, p-value: < 2.2e-16
plot(price ~ isOneOwner, data = car)
# I think the coefficient for isOneOwnert is negative because of the new cars. Normally it should be positive in the real world. So, separate the [condition == New & isOneOwner == f] records from the others:
# Keep every row that is NOT (New and isOneOwner == f); columns are read from
# `car` directly rather than from the attach()ed copies.
car2 <- car[!(car$condition == "New" & car$isOneOwner == "f"), ]
dim(car2)
## [1] 19156 17
# Refit on the subset; data = car2 replaces car2$col inside the formula so the
# coefficient is named isOneOwnert and predict(newdata = ...) works.
summary(lm(price ~ isOneOwner, data = car2)) # coefficient is positive now, but R square is low
##
## Call:
## lm(formula = car2$price ~ car2$isOneOwner)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47906 -19901 -2892 14198 263208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35791.5 197.0 181.64 <2e-16 ***
## car2$isOneOwnert 15309.5 424.6 36.06 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24160 on 19154 degrees of freedom
## Multiple R-squared: 0.06357, Adjusted R-squared: 0.06352
## F-statistic: 1300 on 1 and 19154 DF, p-value: < 2.2e-16
# it doesn't make sense that a new car has mileage on it ?!
# data = car2 gives the axes plain column names instead of car2$... labels.
plot(price ~ isOneOwner, data = car2) # plot makes sense now, although there are many dots above 3rd quartile. A common characteristic among them is they are all high-end trim, such as Mercedes 63 AMG, 550.
This is too cumbersome; I don’t have time to go through all variables. Switch approach: try adding one variable at a time to the null model. If a variable doesn’t improve adjusted R square, ignore it; there is no need to care about its missing values.
library(leaps)
# Best-subset search over all predictors, limited to models of at most 3 terms.
# really.big must be the literal TRUE: T is an ordinary variable that can be
# reassigned.
# NOTE(review): `price ~ .` also includes X, which looks like a serial number;
# consider excluding it.
regfit <- regsubsets(price ~ ., data = car, nvmax = 3, really.big = TRUE)
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 15 linear dependencies found
## Reordering variables and trying again:
# we would have 129 variables if we do this.. (50 states!!)
# start from doing linear regression on the most possible variables
# Baseline model; data = car reads the predictors from the data frame itself
# rather than from the attach()ed copies.
summary(lm(price ~ trim + condition + mileage + year, data = car))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69342 -6041 -1072 3762 272938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.433e+06 9.136e+04 -103.246 < 2e-16 ***
## trim350 -3.830e+04 1.148e+03 -33.346 < 2e-16 ***
## trim400 -4.202e+04 1.323e+03 -31.760 < 2e-16 ***
## trim420 1.364e+03 1.261e+03 1.082 0.279449
## trim430 -2.678e+04 9.226e+02 -29.028 < 2e-16 ***
## trim450 -4.131e+04 1.213e+04 -3.405 0.000662 ***
## trim500 -2.367e+04 9.170e+02 -25.807 < 2e-16 ***
## trim55 AMG -2.560e+04 1.159e+03 -22.092 < 2e-16 ***
## trim550 -3.580e+04 9.900e+02 -36.164 < 2e-16 ***
## trim600 -1.524e+04 1.059e+03 -14.387 < 2e-16 ***
## trim63 AMG 1.227e+03 1.040e+03 1.180 0.237837
## trim65 AMG 2.031e+04 1.196e+03 16.976 < 2e-16 ***
## trimunsp 1.816e+04 1.644e+03 11.045 < 2e-16 ***
## conditionNew 3.741e+04 2.612e+02 143.232 < 2e-16 ***
## conditionUsed -5.484e+03 2.523e+02 -21.738 < 2e-16 ***
## mileage -1.351e-01 3.236e-03 -41.733 < 2e-16 ***
## year 4.738e+03 4.561e+01 103.897 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12090 on 29449 degrees of freedom
## Multiple R-squared: 0.9273, Adjusted R-squared: 0.9273
## F-statistic: 2.349e+04 on 16 and 29449 DF, p-value: < 2.2e-16
# adjusted R square is 0.9273, pretty good!!
# Same model without trim, to see how much trim contributes; data = car reads
# the predictors from the data frame itself rather than the attach()ed copies.
summary(lm(price ~ condition + mileage + year, data = car))
##
## Call:
## lm(formula = price ~ condition + mileage + year)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57657 -8171 -3915 2095 266422
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.785e+06 8.817e+04 -76.96 <2e-16 ***
## conditionNew 4.235e+04 3.214e+02 131.77 <2e-16 ***
## conditionUsed -5.938e+03 3.287e+02 -18.07 <2e-16 ***
## mileage -1.646e-01 4.222e-03 -38.99 <2e-16 ***
## year 3.406e+03 4.379e+01 77.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15970 on 29461 degrees of freedom
## Multiple R-squared: 0.8732, Adjusted R-squared: 0.8732
## F-statistic: 5.073e+04 on 4 and 29461 DF, p-value: < 2.2e-16
# adjusted R: 0.8732 so we better keep trim even though it's cumbersome (too many dummies!)
# Add fuel to the baseline model.
# data = car matters here: without it `fuel` resolves to the snapshot made by
# attach(car) at the top of the script, which predates the Gasoline -> Hybrid
# correction applied to `car` above.
# NOTE(review): the recorded output below appears to come from that stale
# snapshot, so expect the trim400 / fuelHybrid rows to change when re-run.
summary(lm(price ~ trim + condition + mileage + year + fuel, data = car))
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75103 -5823 -972 3749 272745
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.321e+06 9.083e+04 -102.621 < 2e-16 ***
## trim350 -3.943e+04 1.444e+03 -27.314 < 2e-16 ***
## trim400 -3.210e+04 1.197e+04 -2.681 0.007339 **
## trim420 1.447e+03 1.245e+03 1.162 0.245110
## trim430 -2.638e+04 9.109e+02 -28.957 < 2e-16 ***
## trim450 -4.062e+04 1.197e+04 -3.393 0.000692 ***
## trim500 -2.330e+04 9.053e+02 -25.733 < 2e-16 ***
## trim55 AMG -2.519e+04 1.144e+03 -22.020 < 2e-16 ***
## trim550 -3.503e+04 9.796e+02 -35.757 < 2e-16 ***
## trim600 -1.554e+04 1.046e+03 -14.851 < 2e-16 ***
## trim63 AMG 1.592e+03 1.028e+03 1.548 0.121728
## trim65 AMG 2.024e+04 1.182e+03 17.126 < 2e-16 ***
## trimunsp 5.217e+03 1.687e+03 3.093 0.001983 **
## conditionNew 3.708e+04 2.594e+02 142.920 < 2e-16 ***
## conditionUsed -5.483e+03 2.490e+02 -22.019 < 2e-16 ***
## mileage -1.362e-01 3.194e-03 -42.643 < 2e-16 ***
## year 4.683e+03 4.527e+01 103.458 < 2e-16 ***
## fuelGasoline -2.553e+03 1.307e+03 -1.953 0.050847 .
## fuelHybrid -1.179e+04 1.204e+04 -0.980 0.327171
## fuelunsp 1.742e+04 1.476e+03 11.800 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11930 on 29446 degrees of freedom
## Multiple R-squared: 0.9293, Adjusted R-squared: 0.9292
## F-statistic: 2.036e+04 on 19 and 29446 DF, p-value: < 2.2e-16
# adjusted R 0.9292
# Add featureCount on top of fuel.
# data = car matters here: without it `fuel` resolves to the snapshot made by
# attach(car) at the top of the script, which predates the Gasoline -> Hybrid
# correction applied to `car` above.
# NOTE(review): the recorded output below appears to come from that stale
# snapshot, so expect the trim400 / fuelHybrid rows to change when re-run.
summary(
  lm(price ~ trim + condition + mileage + year + fuel + featureCount,
     data = car)
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## featureCount)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75454 -5839 -959 3766 272804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.367e+06 9.128e+04 -102.619 < 2e-16 ***
## trim350 -3.945e+04 1.443e+03 -27.334 < 2e-16 ***
## trim400 -3.246e+04 1.197e+04 -2.713 0.006676 **
## trim420 1.443e+03 1.244e+03 1.160 0.246042
## trim430 -2.635e+04 9.105e+02 -28.939 < 2e-16 ***
## trim450 -4.060e+04 1.197e+04 -3.392 0.000694 ***
## trim500 -2.327e+04 9.050e+02 -25.718 < 2e-16 ***
## trim55 AMG -2.514e+04 1.144e+03 -21.988 < 2e-16 ***
## trim550 -3.502e+04 9.792e+02 -35.761 < 2e-16 ***
## trim600 -1.550e+04 1.046e+03 -14.823 < 2e-16 ***
## trim63 AMG 1.592e+03 1.028e+03 1.548 0.121578
## trim65 AMG 2.028e+04 1.182e+03 17.168 < 2e-16 ***
## trimunsp 5.225e+03 1.686e+03 3.099 0.001945 **
## conditionNew 3.691e+04 2.616e+02 141.093 < 2e-16 ***
## conditionUsed -5.566e+03 2.495e+02 -22.309 < 2e-16 ***
## mileage -1.360e-01 3.193e-03 -42.590 < 2e-16 ***
## year 4.707e+03 4.550e+01 103.444 < 2e-16 ***
## fuelGasoline -2.571e+03 1.307e+03 -1.967 0.049159 *
## fuelHybrid -1.143e+04 1.203e+04 -0.950 0.342302
## fuelunsp 1.716e+04 1.477e+03 11.624 < 2e-16 ***
## featureCount -1.203e+01 2.449e+00 -4.911 9.11e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11930 on 29445 degrees of freedom
## Multiple R-squared: 0.9293, Adjusted R-squared: 0.9293
## F-statistic: 1.936e+04 on 20 and 29445 DF, p-value: < 2.2e-16
# 0.9293 little improvement, consider removing featureCount
# convert displacement data into numeric
# Values look like "4.6 L"; anything that does not start with a digit (e.g.
# "unsp") becomes NA. This is done in one vectorized pass with no coercion
# warnings, instead of calling as.numeric() once per record.
# The column is read from `car` directly rather than the attach()ed copy.
disp_text <- as.character(car$displacement)
has_number <- grepl("^[0-9]", disp_text)
dis <- rep(NA_real_, length(disp_text))
dis[has_number] <- as.numeric(
  sub("^([0-9]+(\\.[0-9]+)?).*$", "\\1", disp_text[has_number])
)
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
## Warning in FUN(X[[i]], ...): NAs introduced by coercion
# Store the numeric displacement back into the data frame.
car$displacement <- dis
# add displacement to regression
# Refresh the attached copy so bare column names see the numeric displacement
# (and any other change made to `car` since the first attach). Detach the old
# snapshot first so two copies of `car` are not stacked on the search path,
# where the newer one silently masks the older.
if ("car" %in% search()) {
  detach("car", character.only = TRUE)
}
attach(car)
## The following objects are masked from car (pos = 4):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
# Add the numeric displacement. data = car guarantees the model uses the
# current columns of `car`, whatever is attached on the search path.
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement,
     data = car)
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70179 -5707 -953 3729 272710
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.779e+06 1.022e+05 -85.857 < 2e-16 ***
## trim350 -3.539e+04 1.482e+03 -23.882 < 2e-16 ***
## trim400 -3.891e+04 1.848e+03 -21.055 < 2e-16 ***
## trim420 4.287e+03 1.254e+03 3.418 0.000632 ***
## trim430 -2.148e+04 1.017e+03 -21.119 < 2e-16 ***
## trim450 -3.279e+04 1.174e+04 -2.793 0.005229 **
## trim500 -1.644e+04 1.120e+03 -14.681 < 2e-16 ***
## trim55 AMG -1.684e+04 1.394e+03 -12.078 < 2e-16 ***
## trim550 -2.528e+04 1.352e+03 -18.701 < 2e-16 ***
## trim600 -5.801e+03 1.437e+03 -4.036 5.45e-05 ***
## trim63 AMG 1.363e+04 1.571e+03 8.674 < 2e-16 ***
## trim65 AMG 3.165e+04 1.682e+03 18.818 < 2e-16 ***
## trimunsp 3.279e+04 2.391e+03 13.714 < 2e-16 ***
## conditionNew 3.705e+04 2.545e+02 145.574 < 2e-16 ***
## conditionUsed -5.083e+03 2.466e+02 -20.608 < 2e-16 ***
## mileage -1.331e-01 3.148e-03 -42.287 < 2e-16 ***
## year 4.416e+03 5.081e+01 86.917 < 2e-16 ***
## fuelGasoline -2.361e+03 1.311e+03 -1.801 0.071727 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.329e+04 1.592e+03 14.629 < 2e-16 ***
## displacement -3.010e+03 2.991e+02 -10.062 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11680 on 29272 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9311, Adjusted R-squared: 0.9311
## F-statistic: 2.083e+04 on 19 and 29272 DF, p-value: < 2.2e-16
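# adjusted R is 0.9311 with displacement as a continuous variable; the 0.9417 figure presumably came from a run with displacement as a categorical variable (TODO confirm).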
# Add color. data = car guarantees the model uses the current columns of
# `car`, whatever is attached on the search path.
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement + color,
     data = car)
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + color)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69648 -5690 -968 3700 273575
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.773e+06 1.021e+05 -85.895 < 2e-16 ***
## trim350 -3.488e+04 1.479e+03 -23.576 < 2e-16 ***
## trim400 -3.846e+04 1.845e+03 -20.844 < 2e-16 ***
## trim420 4.108e+03 1.251e+03 3.283 0.001029 **
## trim430 -2.108e+04 1.016e+03 -20.742 < 2e-16 ***
## trim450 -3.143e+04 1.171e+04 -2.684 0.007279 **
## trim500 -1.617e+04 1.118e+03 -14.454 < 2e-16 ***
## trim55 AMG -1.636e+04 1.393e+03 -11.748 < 2e-16 ***
## trim550 -2.507e+04 1.350e+03 -18.574 < 2e-16 ***
## trim600 -5.568e+03 1.435e+03 -3.879 0.000105 ***
## trim63 AMG 1.365e+04 1.568e+03 8.704 < 2e-16 ***
## trim65 AMG 3.184e+04 1.679e+03 18.961 < 2e-16 ***
## trimunsp 3.289e+04 2.386e+03 13.787 < 2e-16 ***
## conditionNew 3.705e+04 2.553e+02 145.090 < 2e-16 ***
## conditionUsed -5.068e+03 2.462e+02 -20.584 < 2e-16 ***
## mileage -1.326e-01 3.142e-03 -42.214 < 2e-16 ***
## year 4.413e+03 5.076e+01 86.944 < 2e-16 ***
## fuelGasoline -2.205e+03 1.308e+03 -1.685 0.091978 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.343e+04 1.589e+03 14.748 < 2e-16 ***
## displacement -2.919e+03 2.990e+02 -9.763 < 2e-16 ***
## colorBlack -5.525e+02 8.131e+02 -0.680 0.496818
## colorBlue -1.302e+03 8.571e+02 -1.519 0.128758
## colorBronze 4.719e+03 4.196e+03 1.125 0.260730
## colorBrown 3.602e+02 1.657e+03 0.217 0.827889
## colorGold 1.132e+03 1.094e+03 1.035 0.300674
## colorGray -1.660e+03 8.471e+02 -1.960 0.050059 .
## colorGreen -2.693e+01 1.201e+03 -0.022 0.982103
## colorPurple 6.530e+03 4.197e+03 1.556 0.119767
## colorRed -2.398e+02 1.036e+03 -0.231 0.816935
## colorSilver -1.448e+03 8.184e+02 -1.770 0.076768 .
## colorTurquoise -1.345e+03 5.272e+03 -0.255 0.798631
## colorunsp 5.846e+02 8.627e+02 0.678 0.498014
## colorWhite 1.194e+03 8.260e+02 1.446 0.148303
## colorYellow -6.023e+03 8.274e+03 -0.728 0.466634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11650 on 29258 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9316, Adjusted R-squared: 0.9315
## F-statistic: 1.207e+04 on 33 and 29258 DF, p-value: < 2.2e-16
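# adjusted R is 0.9315 in the output above vs. 0.9311 without color (the 0.9419 originally noted here does not match this run): not much improvement for adjusted R, much more variance though. drop.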
# Add state instead of color. data = car guarantees the model uses the current
# columns of `car`, whatever is attached on the search path.
summary(
  lm(price ~ trim + condition + mileage + year + fuel + displacement + state,
     data = car)
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + state)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69928 -5715 -1011 3653 273216
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.842e+06 1.029e+05 -85.967 < 2e-16 ***
## trim350 -3.548e+04 1.481e+03 -23.952 < 2e-16 ***
## trim400 -3.878e+04 1.847e+03 -20.990 < 2e-16 ***
## trim420 4.285e+03 1.253e+03 3.419 0.00063 ***
## trim430 -2.156e+04 1.017e+03 -21.198 < 2e-16 ***
## trim450 -3.697e+04 1.352e+04 -2.734 0.00625 **
## trim500 -1.659e+04 1.119e+03 -14.823 < 2e-16 ***
## trim55 AMG -1.705e+04 1.393e+03 -12.237 < 2e-16 ***
## trim550 -2.542e+04 1.351e+03 -18.813 < 2e-16 ***
## trim600 -6.021e+03 1.437e+03 -4.191 2.79e-05 ***
## trim63 AMG 1.337e+04 1.570e+03 8.515 < 2e-16 ***
## trim65 AMG 3.138e+04 1.681e+03 18.666 < 2e-16 ***
## trimunsp 3.251e+04 2.390e+03 13.604 < 2e-16 ***
## conditionNew 3.707e+04 2.564e+02 144.552 < 2e-16 ***
## conditionUsed -5.060e+03 2.493e+02 -20.296 < 2e-16 ***
## mileage -1.326e-01 3.163e-03 -41.937 < 2e-16 ***
## year 4.445e+03 5.095e+01 87.237 < 2e-16 ***
## fuelGasoline -2.198e+03 1.311e+03 -1.677 0.09361 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.357e+04 1.592e+03 14.810 < 2e-16 ***
## displacement -2.974e+03 2.990e+02 -9.945 < 2e-16 ***
## stateAL 5.530e+03 8.269e+03 0.669 0.50363
## stateAR 6.379e+03 8.367e+03 0.762 0.44583
## stateAZ 6.926e+03 8.266e+03 0.838 0.40207
## stateCA 6.004e+03 8.246e+03 0.728 0.46658
## stateCO 6.940e+03 8.264e+03 0.840 0.40107
## stateCT 5.523e+03 8.268e+03 0.668 0.50411
## stateDC -7.360e+03 1.064e+04 -0.692 0.48918
## stateDE 7.870e+03 8.330e+03 0.945 0.34475
## stateFL 5.600e+03 8.247e+03 0.679 0.49708
## stateGA 4.952e+03 8.251e+03 0.600 0.54838
## stateHI 5.424e+03 8.323e+03 0.652 0.51462
## stateIA 7.493e+03 8.459e+03 0.886 0.37572
## stateID 1.007e+04 8.595e+03 1.171 0.24156
## stateIL 6.038e+03 8.251e+03 0.732 0.46433
## stateIN 5.113e+03 8.287e+03 0.617 0.53729
## stateKS 6.633e+03 8.342e+03 0.795 0.42655
## stateKY 9.236e+03 8.286e+03 1.115 0.26499
## stateLA 8.012e+03 8.299e+03 0.965 0.33434
## stateMA 6.081e+03 8.255e+03 0.737 0.46135
## stateMD 6.591e+03 8.257e+03 0.798 0.42472
## stateME 4.439e+03 8.581e+03 0.517 0.60492
## stateMI 5.829e+03 8.282e+03 0.704 0.48155
## stateMN 6.750e+03 8.280e+03 0.815 0.41492
## stateMO 7.337e+03 8.267e+03 0.887 0.37483
## stateMS 7.796e+03 8.308e+03 0.938 0.34808
## stateMT 1.249e+04 8.963e+03 1.394 0.16337
## stateNC 6.772e+03 8.254e+03 0.820 0.41200
## stateND 8.337e+03 9.755e+03 0.855 0.39276
## stateNE 7.869e+03 8.646e+03 0.910 0.36275
## stateNH 7.445e+03 8.301e+03 0.897 0.36984
## stateNJ 5.487e+03 8.249e+03 0.665 0.50593
## stateNM 6.309e+03 8.484e+03 0.744 0.45706
## stateNV 8.711e+03 8.267e+03 1.054 0.29200
## stateNY 4.631e+03 8.248e+03 0.561 0.57450
## stateOH 5.353e+03 8.258e+03 0.648 0.51683
## stateOK 5.665e+03 8.289e+03 0.683 0.49433
## stateON 1.000e+04 1.065e+04 0.939 0.34750
## stateOR 6.987e+03 8.297e+03 0.842 0.39972
## statePA 5.977e+03 8.255e+03 0.724 0.46907
## stateRI 4.949e+03 8.399e+03 0.589 0.55571
## stateSC 6.711e+03 8.277e+03 0.811 0.41752
## stateSD 2.540e+04 1.166e+04 2.179 0.02937 *
## stateTN 4.966e+03 8.264e+03 0.601 0.54787
## stateTX 6.291e+03 8.248e+03 0.763 0.44563
## stateunsp 8.728e+03 1.428e+04 0.611 0.54099
## stateUT 8.867e+03 8.308e+03 1.067 0.28589
## stateVA 6.043e+03 8.253e+03 0.732 0.46407
## stateWA 7.459e+03 8.270e+03 0.902 0.36708
## stateWI 7.044e+03 8.299e+03 0.849 0.39600
## stateWV 6.446e+03 8.454e+03 0.763 0.44575
## stateWY 2.201e+03 1.166e+04 0.189 0.85027
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11660 on 29221 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9315, Adjusted R-squared: 0.9314
## F-statistic: 5679 on 70 and 29221 DF, p-value: < 2.2e-16
# adjusted R-squared 0.9314 with state (vs. 0.9311 for the base model): not much improvement, much more variance though. drop state.
# Candidate model: the six base predictors plus `region`.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement + region
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + region)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70288 -5705 -991 3701 273525
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.805e+06 1.023e+05 -86.079 < 2e-16 ***
## trim350 -3.549e+04 1.481e+03 -23.968 < 2e-16 ***
## trim400 -3.905e+04 1.847e+03 -21.140 < 2e-16 ***
## trim420 4.261e+03 1.253e+03 3.400 0.000674 ***
## trim430 -2.155e+04 1.016e+03 -21.206 < 2e-16 ***
## trim450 -3.652e+04 1.311e+04 -2.787 0.005323 **
## trim500 -1.657e+04 1.119e+03 -14.806 < 2e-16 ***
## trim55 AMG -1.701e+04 1.393e+03 -12.208 < 2e-16 ***
## trim550 -2.535e+04 1.351e+03 -18.769 < 2e-16 ***
## trim600 -5.952e+03 1.436e+03 -4.144 3.42e-05 ***
## trim63 AMG 1.344e+04 1.570e+03 8.564 < 2e-16 ***
## trim65 AMG 3.144e+04 1.681e+03 18.705 < 2e-16 ***
## trimunsp 3.256e+04 2.389e+03 13.629 < 2e-16 ***
## conditionNew 3.710e+04 2.555e+02 145.173 < 2e-16 ***
## conditionUsed -5.119e+03 2.476e+02 -20.675 < 2e-16 ***
## mileage -1.325e-01 3.147e-03 -42.104 < 2e-16 ***
## year 4.429e+03 5.083e+01 87.133 < 2e-16 ***
## fuelGasoline -2.447e+03 1.311e+03 -1.867 0.061916 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.331e+04 1.592e+03 14.647 < 2e-16 ***
## displacement -2.985e+03 2.989e+02 -9.985 < 2e-16 ***
## regionESC 4.331e+02 4.247e+02 1.020 0.307850
## regionMid -6.712e+02 2.829e+02 -2.372 0.017687 *
## regionMtn 1.838e+03 3.936e+02 4.671 3.01e-06 ***
## regionNew 1.871e+02 3.895e+02 0.480 0.630910
## regionPac 2.691e+02 2.815e+02 0.956 0.339184
## regionSoA -1.718e+01 2.691e+02 -0.064 0.949087
## regionunsp 3.798e+03 5.842e+03 0.650 0.515666
## regionWNC 1.317e+03 4.884e+02 2.696 0.007019 **
## regionWSC 5.204e+02 3.213e+02 1.620 0.105318
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11670 on 29263 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9313, Adjusted R-squared: 0.9312
## F-statistic: 1.417e+04 on 28 and 29263 DF, p-value: < 2.2e-16
# adjusted R-squared 0.9312 with region (vs. 0.9311 for the base model): drop region.
# Candidate model: the six base predictors plus `soundSystem`.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement + soundSystem
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + soundSystem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69973 -5778 -913 3836 271711
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.746e+06 1.024e+05 -85.443 < 2e-16 ***
## trim350 -3.507e+04 1.477e+03 -23.737 < 2e-16 ***
## trim400 -3.869e+04 1.842e+03 -21.004 < 2e-16 ***
## trim420 4.345e+03 1.250e+03 3.477 0.000508 ***
## trim430 -2.117e+04 1.015e+03 -20.856 < 2e-16 ***
## trim450 -3.045e+04 1.170e+04 -2.603 0.009250 **
## trim500 -1.617e+04 1.117e+03 -14.480 < 2e-16 ***
## trim55 AMG -1.632e+04 1.392e+03 -11.723 < 2e-16 ***
## trim550 -2.470e+04 1.348e+03 -18.322 < 2e-16 ***
## trim600 -5.643e+03 1.432e+03 -3.941 8.14e-05 ***
## trim63 AMG 1.402e+04 1.566e+03 8.955 < 2e-16 ***
## trim65 AMG 3.158e+04 1.677e+03 18.836 < 2e-16 ***
## trimunsp 3.305e+04 2.383e+03 13.870 < 2e-16 ***
## conditionNew 3.656e+04 2.601e+02 140.544 < 2e-16 ***
## conditionUsed -4.928e+03 2.462e+02 -20.020 < 2e-16 ***
## mileage -1.344e-01 3.140e-03 -42.794 < 2e-16 ***
## year 4.403e+03 5.076e+01 86.748 < 2e-16 ***
## fuelGasoline -2.889e+03 1.310e+03 -2.205 0.027466 *
## fuelHybrid NA NA NA NA
## fuelunsp 2.276e+04 1.589e+03 14.319 < 2e-16 ***
## displacement -2.968e+03 2.981e+02 -9.955 < 2e-16 ***
## soundSystemBang Olufsen -6.314e+02 8.278e+03 -0.076 0.939195
## soundSystemBose -8.569e+03 8.238e+03 -1.040 0.298216
## soundSystemBoston Acoustic -1.050e+04 1.425e+04 -0.737 0.461419
## soundSystemHarman Kardon -8.522e+03 8.231e+03 -1.035 0.300535
## soundSystemPremium -6.496e+03 8.230e+03 -0.789 0.429954
## soundSystemunsp -5.826e+03 8.229e+03 -0.708 0.479012
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11630 on 29266 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9317, Adjusted R-squared: 0.9316
## F-statistic: 1.596e+04 on 25 and 29266 DF, p-value: < 2.2e-16
# adjusted R-squared 0.9316 with soundSystem (vs. 0.9311 for the base model): drop soundSystem.
# Candidate model: the six base predictors plus `wheelType`.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement + wheelType
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + wheelType)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69892 -5699 -926 3764 272938
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.767e+06 1.021e+05 -85.872 < 2e-16 ***
## trim350 -3.540e+04 1.479e+03 -23.930 < 2e-16 ***
## trim400 -3.894e+04 1.845e+03 -21.112 < 2e-16 ***
## trim420 4.216e+03 1.252e+03 3.368 0.000759 ***
## trim430 -2.152e+04 1.015e+03 -21.196 < 2e-16 ***
## trim450 -3.260e+04 1.172e+04 -2.782 0.005411 **
## trim500 -1.651e+04 1.118e+03 -14.774 < 2e-16 ***
## trim55 AMG -1.691e+04 1.392e+03 -12.150 < 2e-16 ***
## trim550 -2.538e+04 1.350e+03 -18.808 < 2e-16 ***
## trim600 -5.956e+03 1.435e+03 -4.151 3.32e-05 ***
## trim63 AMG 1.351e+04 1.568e+03 8.613 < 2e-16 ***
## trim65 AMG 3.153e+04 1.679e+03 18.776 < 2e-16 ***
## trimunsp 3.276e+04 2.387e+03 13.726 < 2e-16 ***
## conditionNew 3.699e+04 2.563e+02 144.336 < 2e-16 ***
## conditionUsed -5.137e+03 2.465e+02 -20.843 < 2e-16 ***
## mileage -1.328e-01 3.142e-03 -42.262 < 2e-16 ***
## year 4.410e+03 5.074e+01 86.928 < 2e-16 ***
## fuelGasoline -2.466e+03 1.309e+03 -1.884 0.059545 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.323e+04 1.589e+03 14.620 < 2e-16 ***
## displacement -2.941e+03 2.986e+02 -9.846 < 2e-16 ***
## wheelTypeChrome 7.816e+00 1.309e+03 0.006 0.995237
## wheelTypePremium -9.616e+02 5.756e+02 -1.670 0.094844 .
## wheelTypeSteel 1.682e+04 1.670e+03 10.073 < 2e-16 ***
## wheelTypeunsp 5.559e+02 1.400e+02 3.971 7.18e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11660 on 29268 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9314, Adjusted R-squared: 0.9314
## F-statistic: 1.728e+04 on 23 and 29268 DF, p-value: < 2.2e-16
# adjusted R-squared 0.9314 with wheelType (vs. 0.9311 for the base model): drop wheelType.
# Candidate model: the six base predictors plus `wheelSize`.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement + wheelSize
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + wheelSize)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68074 -5741 -888 3718 272682
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.791e+06 1.023e+05 -85.942 < 2e-16 ***
## trim350 -3.451e+04 1.474e+03 -23.411 < 2e-16 ***
## trim400 -3.797e+04 1.837e+03 -20.668 < 2e-16 ***
## trim420 4.575e+03 1.246e+03 3.670 0.000243 ***
## trim430 -2.103e+04 1.011e+03 -20.798 < 2e-16 ***
## trim450 -3.242e+04 1.166e+04 -2.780 0.005445 **
## trim500 -1.590e+04 1.113e+03 -14.284 < 2e-16 ***
## trim55 AMG -1.627e+04 1.387e+03 -11.735 < 2e-16 ***
## trim550 -2.481e+04 1.345e+03 -18.440 < 2e-16 ***
## trim600 -5.300e+03 1.429e+03 -3.708 0.000209 ***
## trim63 AMG 1.292e+04 1.563e+03 8.264 < 2e-16 ***
## trim65 AMG 3.118e+04 1.672e+03 18.645 < 2e-16 ***
## trimunsp 3.178e+04 2.377e+03 13.371 < 2e-16 ***
## conditionNew 3.688e+04 2.557e+02 144.229 < 2e-16 ***
## conditionUsed -5.006e+03 2.452e+02 -20.420 < 2e-16 ***
## mileage -1.329e-01 3.128e-03 -42.498 < 2e-16 ***
## year 4.425e+03 5.087e+01 86.988 < 2e-16 ***
## fuelGasoline -1.766e+03 1.306e+03 -1.352 0.176240
## fuelHybrid NA NA NA NA
## fuelunsp 2.339e+04 1.584e+03 14.765 < 2e-16 ***
## displacement -3.071e+03 2.976e+02 -10.318 < 2e-16 ***
## wheelSize17 -1.116e+04 1.484e+03 -7.525 5.42e-14 ***
## wheelSize18 -6.631e+03 1.178e+03 -5.627 1.85e-08 ***
## wheelSize19 -5.607e+03 1.193e+03 -4.700 2.62e-06 ***
## wheelSize20 2.355e+03 1.221e+03 1.929 0.053687 .
## wheelSize21 -2.470e+03 8.281e+03 -0.298 0.765538
## wheelSize22 3.059e+00 2.377e+03 0.001 0.998973
## wheelSizeunsp -5.050e+03 1.141e+03 -4.426 9.62e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11600 on 29265 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9321, Adjusted R-squared: 0.932
## F-statistic: 1.545e+04 on 26 and 29265 DF, p-value: < 2.2e-16
# adjusted R-squared 0.932 with wheelSize (vs. 0.9311 for the base model): drop wheelSize.
# Candidate model: the six base predictors plus `featureCount`.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement + featureCount
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement + featureCount)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70088 -5740 -924 3715 272781
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.839e+06 1.027e+05 -86.079 < 2e-16 ***
## trim350 -3.545e+04 1.481e+03 -23.932 < 2e-16 ***
## trim400 -3.896e+04 1.847e+03 -21.092 < 2e-16 ***
## trim420 4.259e+03 1.254e+03 3.397 0.000681 ***
## trim430 -2.149e+04 1.016e+03 -21.141 < 2e-16 ***
## trim450 -3.283e+04 1.173e+04 -2.798 0.005149 **
## trim500 -1.647e+04 1.119e+03 -14.717 < 2e-16 ***
## trim55 AMG -1.686e+04 1.393e+03 -12.097 < 2e-16 ***
## trim550 -2.535e+04 1.351e+03 -18.763 < 2e-16 ***
## trim600 -5.843e+03 1.436e+03 -4.068 4.76e-05 ***
## trim63 AMG 1.352e+04 1.570e+03 8.612 < 2e-16 ***
## trim65 AMG 3.160e+04 1.681e+03 18.798 < 2e-16 ***
## trimunsp 3.295e+04 2.390e+03 13.786 < 2e-16 ***
## conditionNew 3.684e+04 2.566e+02 143.590 < 2e-16 ***
## conditionUsed -5.185e+03 2.471e+02 -20.984 < 2e-16 ***
## mileage -1.329e-01 3.147e-03 -42.238 < 2e-16 ***
## year 4.447e+03 5.103e+01 87.131 < 2e-16 ***
## fuelGasoline -2.389e+03 1.310e+03 -1.823 0.068300 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.315e+04 1.592e+03 14.546 < 2e-16 ***
## displacement -2.985e+03 2.990e+02 -9.982 < 2e-16 ***
## featureCount -1.436e+01 2.409e+00 -5.962 2.52e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11670 on 29271 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9312, Adjusted R-squared: 0.9312
## F-statistic: 1.982e+04 on 20 and 29271 DF, p-value: < 2.2e-16
# adjusted R-squared 0.9312 with featureCount (vs. 0.9311 for the base model): drop featureCount.
# so best bet is the base model with trim, condition, mileage, year, fuel and
# displacement (adjusted R-squared 0.9311 in the output below).
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70179 -5707 -953 3729 272710
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.779e+06 1.022e+05 -85.857 < 2e-16 ***
## trim350 -3.539e+04 1.482e+03 -23.882 < 2e-16 ***
## trim400 -3.891e+04 1.848e+03 -21.055 < 2e-16 ***
## trim420 4.287e+03 1.254e+03 3.418 0.000632 ***
## trim430 -2.148e+04 1.017e+03 -21.119 < 2e-16 ***
## trim450 -3.279e+04 1.174e+04 -2.793 0.005229 **
## trim500 -1.644e+04 1.120e+03 -14.681 < 2e-16 ***
## trim55 AMG -1.684e+04 1.394e+03 -12.078 < 2e-16 ***
## trim550 -2.528e+04 1.352e+03 -18.701 < 2e-16 ***
## trim600 -5.801e+03 1.437e+03 -4.036 5.45e-05 ***
## trim63 AMG 1.363e+04 1.571e+03 8.674 < 2e-16 ***
## trim65 AMG 3.165e+04 1.682e+03 18.818 < 2e-16 ***
## trimunsp 3.279e+04 2.391e+03 13.714 < 2e-16 ***
## conditionNew 3.705e+04 2.545e+02 145.574 < 2e-16 ***
## conditionUsed -5.083e+03 2.466e+02 -20.608 < 2e-16 ***
## mileage -1.331e-01 3.148e-03 -42.287 < 2e-16 ***
## year 4.416e+03 5.081e+01 86.917 < 2e-16 ***
## fuelGasoline -2.361e+03 1.311e+03 -1.801 0.071727 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.329e+04 1.592e+03 14.629 < 2e-16 ***
## displacement -3.010e+03 2.991e+02 -10.062 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11680 on 29272 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9311, Adjusted R-squared: 0.9311
## F-statistic: 2.083e+04 on 19 and 29272 DF, p-value: < 2.2e-16
# try some quadratic function:
# NOTE(review): the formula below is identical to the linear base model (trim,
# condition, mileage, year, fuel, displacement); no squared term is included,
# so this call just refits the base model. Add the intended quadratic term
# here (which variable to square is not stated) and re-run.
# No `data` argument is given, so the variables are resolved from the search
# path (the file calls attach(car) near the top).
summary(
  lm(
    formula = price ~ trim + condition + mileage + year + fuel +
      displacement
  )
)
##
## Call:
## lm(formula = price ~ trim + condition + mileage + year + fuel +
## displacement)
##
## Residuals:
## Min 1Q Median 3Q Max
## -70179 -5707 -953 3729 272710
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.779e+06 1.022e+05 -85.857 < 2e-16 ***
## trim350 -3.539e+04 1.482e+03 -23.882 < 2e-16 ***
## trim400 -3.891e+04 1.848e+03 -21.055 < 2e-16 ***
## trim420 4.287e+03 1.254e+03 3.418 0.000632 ***
## trim430 -2.148e+04 1.017e+03 -21.119 < 2e-16 ***
## trim450 -3.279e+04 1.174e+04 -2.793 0.005229 **
## trim500 -1.644e+04 1.120e+03 -14.681 < 2e-16 ***
## trim55 AMG -1.684e+04 1.394e+03 -12.078 < 2e-16 ***
## trim550 -2.528e+04 1.352e+03 -18.701 < 2e-16 ***
## trim600 -5.801e+03 1.437e+03 -4.036 5.45e-05 ***
## trim63 AMG 1.363e+04 1.571e+03 8.674 < 2e-16 ***
## trim65 AMG 3.165e+04 1.682e+03 18.818 < 2e-16 ***
## trimunsp 3.279e+04 2.391e+03 13.714 < 2e-16 ***
## conditionNew 3.705e+04 2.545e+02 145.574 < 2e-16 ***
## conditionUsed -5.083e+03 2.466e+02 -20.608 < 2e-16 ***
## mileage -1.331e-01 3.148e-03 -42.287 < 2e-16 ***
## year 4.416e+03 5.081e+01 86.917 < 2e-16 ***
## fuelGasoline -2.361e+03 1.311e+03 -1.801 0.071727 .
## fuelHybrid NA NA NA NA
## fuelunsp 2.329e+04 1.592e+03 14.629 < 2e-16 ***
## displacement -3.010e+03 2.991e+02 -10.062 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11680 on 29272 degrees of freedom
## (174 observations deleted due to missingness)
## Multiple R-squared: 0.9311, Adjusted R-squared: 0.9311
## F-statistic: 2.083e+04 on 19 and 29272 DF, p-value: < 2.2e-16
# Hold-out validation of the base model: fit on a random 90% of the rows of
# `car` and report the mean squared prediction error on the remaining 10%.
set.seed(17)  # fixed seed so the train/test split is reproducible
# k = 10: the held-out 10% is the size of one fold of a 10-way split; this is
# a single split, not a full cross-validation loop.
# nrow(car) replaces the hard-coded 29466 so the split follows the data, and
# floor() makes the integer sample size explicit.
train = sample(nrow(car), floor(nrow(car) * 0.9))
fit = lm(price ~ trim + condition + mileage + year + fuel + displacement,
         data = car, subset = train)
# Test MSE over the rows that are not in `train`.
# - car$price is used instead of the attached `price` so the response comes
#   from the same data frame the model is fitted on and predicted from.
# - na.rm = TRUE: the predictions contain NA (presumably for rows with missing
#   predictor values; the model summaries report observations deleted due to
#   missingness), and without it the mean evaluated to NA.
mean((car$price - predict(fit, car))[-train]^2, na.rm = TRUE)
## Warning in predict.lm(fit, car): prediction from a rank-deficient fit may
## be misleading
## (before na.rm = TRUE was added this printed `[1] NA`; re-run to record the MSE)