car_tree

references: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html http://www.statmethods.net/advstats/cart.html

####### tree

car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)

library(tree)



#--------------------------------------------------
#fit a tree to boston data just using lstat.

#first get a big tree using a small value of mindev
temp = tree(price~year,data=car,mindev=.0001)
cat('first big tree size: \n')

## first big tree size:

print(length(unique(temp$where)))

## [1] 12

#then prune it down to one with 7 leaves
car.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')

## pruned tree size:

print(length(unique(car.tree$where)))

## [1] 7

#--------------------------------------------------
#plot the tree and the fits.
par(mfrow=c(1,2))

#plot the tree
plot(car.tree,type="uniform")
text(car.tree,col="blue",label=c("yval"),cex=.8)

#plot data with fit
car.fit = predict(car.tree) #get training fitted values

plot(year,price,cex=.5,pch=16) #plot data
oo=order(price) # sorting price, decreasing
lines(price[oo],car.fit[oo],col='red',lwd=3) #step function fit

cvals=c(2013.5, 2009.5, 2014.5, 2000.5, 2011.5, 2012.5) #cutpoints from tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2) #cutpoints

rm(list=ls())

##################part 2
library(tree)
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)

## The following objects are masked from car (pos = 4):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year

#--------------------------------------------------
df2=car[,c(6,7,17)] #pick off mileage, year, price
print(names(df2))

## [1] "mileage" "year"    "price"

#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')

## first big tree size:

print(length(unique(temp$where)))

## [1] 30

#--------------------------------------------------
#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')

## pruned tree size:

print(length(unique(boston.tree$where)))

## [1] 7

#--------------------------------------------------
# plot tree and partition in x.
par(mfrow=c(1,2))
plot(boston.tree,type="u")
text(boston.tree,col="blue",label=c("yval"),cex=.8)
partition.tree(boston.tree)

rm(list=ls())

car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)

## The following objects are masked from car (pos = 3):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 5):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year

#--------------------------------------------------
df2=car[,c(6,9,17)] #pick off variables
print(names(df2))

## [1] "mileage"      "displacement" "price"

#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')

## first big tree size:

print(length(unique(temp$where)))

## [1] 40

#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')

## pruned tree size:

print(length(unique(boston.tree$where)))

## [1] 7

#--------------------------------------------------
#get predictions on 2d grid
pv=seq(from=.01,to=.99,by=.05)
# The following 2 sections will not work because of dataset limitations 
#x1q = quantile(df2$displacement,probs=pv,na.rm = TRUE)
#x2q = quantile(df2$mileage,probs=pv,na.rm = TRUE)
#xx = expand.grid(x1q,x2q) #matrix with two columns using all combinations of x1q and x2q
#dfpred = data.frame(mileage=xx[,2],displacement=xx[,1])
#lmedpred = predict(boston.tree,dfpred)

#make perspective plot

#par(mfrow=c(1,1))
# this one failed because 'increasing 'x' and 'y' values expected'
# Price and years do not span a big interval so they don't increase between some quartiles. That's why this problem can't be fixed here. We need a continuously changing variable to make this function work.
#persp(x1q,x2q,matrix(lmedpred,ncol=length(x2q),byrow=T),
 #          theta=150,xlab='dis',ylab='lstat',zlab='medv',
  #         zlim=c(min(df2$price),1.1*max(df2$price)))

library(tree)
library(rpart)
attach(car)

## The following objects are masked from car (pos = 4):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 5):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 7):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year

#--------------------------------------------------
#reduce df to just lmed and lrat
bdf = car[,c(7,17)] #year and price
#--------------------------------------------------
#fit a big tree using rpart.control
big.tree = rpart(price~year,method="anova",data=bdf,
                        control=rpart.control(minsplit=5,cp=.0005))
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')

## size of big tree:  10

#--------------------------------------------------
#look at cross-validation
par(mfrow=c(1,1))
plotcp(big.tree)

#--------------------------------------------------
#show fit from some trees

oo=order(bdf$year)
bestcp=big.tree$cptable[which.min(big.tree$cptable[,"xerror"]),"CP"]
# because relative error doesn't change much when cp >= 0.0025, let's just set bestcp as 0.0025.
bestcp = 0.0025
cat('bestcp: ',bestcp,'\n')

## bestcp:  0.0025

cpvec = c(.0157,bestcp,.004)
par(mfrow=c(3,2))
for(i in 1:3) {
   plot(bdf,pch=16,col='blue',cex=.5)
   ptree = prune(big.tree,cp=cpvec[i])
   pfit = predict(ptree)
   lines(bdf$year[oo],pfit[oo],col='red',lwd=2)
   title(paste('alpha = ',round(cpvec[i],3)))
   plot(ptree,uniform=TRUE)
   text(ptree,digits=3,cex=1)
}

#--------------------------------------------------
#plot best tree

#reference: http://rankexploits.com/musings/2011/margin-control-in-r-oma-and-mar-and-the-vanishing-axis-label/
# http://research.stowers-institute.org/mcm/efg/R/Graphics/Basics/mar-oma/index.htm
# still can't figure out why the edges of the graph are cut off???!!!
par(mfrow=c(1,1), mar=c(1,1,1,1), oma=c(0,0,0,0))
#par(mar=c(5,3,2,2)+0.1)
best.tree = prune(big.tree,cp=bestcp) # best tree is here!
plot(best.tree,uniform=TRUE) # get rid of margin etc to see better
box('figure',lty='solid', col='green')
text(best.tree,digits=2,use.n=TRUE,cex=1) # get rid of fancy parameters!

rm(list=ls())

car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)

## The following objects are masked from car (pos = 3):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 5):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 6):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 8):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year

tree.car = tree(log(price)~.-price-state,car)
summary(tree.car)

## 
## Regression tree:
## tree(formula = log(price) ~ . - price - state, data = car)
## Variables actually used in tree construction:
## [1] "year"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.05981 = 1762 / 29460 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.61000 -0.11140 -0.02175  0.00000  0.12610  2.28200

plot(tree.car)
text(tree.car,pretty =0)

tree.car

## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 29466 24480.0 10.790  
##   2) year < 2009.5 10348  4672.0  9.777  
##     4) year < 2006.5 5078  1114.0  9.208  
##       8) year < 2002.5 2457   422.6  8.915 *
##       9) year > 2002.5 2621   282.0  9.483 *
##     5) year > 2006.5 5270   325.8 10.330 *
##   3) year > 2009.5 19118  3353.0 11.340  
##     6) year < 2013.5 8356   472.4 10.920 *
##     7) year > 2013.5 10762   259.4 11.670 *

train=sample(1:nrow(car), 200)
car.test=car[-train ,]
price.test = car$price[-train]
# I don't think using a boolean variable for the predicted column is necessary...
#High=ifelse(,"No","Yes")
tree.car=tree(log(price)~.-price-state,car,subset=train)
tree.pred=predict(tree.car,car.test)
# this one probably caused stack overflow..literally
#table(tree.pred, price.test)

Regression tree!!! This one works the best!!! the test set MSE associated with the regression tree is 0.05898431!!!!!

# library(tree)
# car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
# set.seed(1)
# train = sample(1:nrow(car), nrow(car))
# # transform price into log
# #car$logprice = log(car$price)
# library(MASS)
# tree.Boston=tree(medv~.-medv,Boston)
# summary(tree.Boston)
# 
# tree.car=tree(price~.-state-subTrim-isOneOwner-color-displacement-state-region-soundSystem,car,subset = train)
# summary(tree.car)
# plot(tree.car)
# text(tree.car,pretty=0)
# # every time MSE is different??!!
# 
# # see whether pruning the tree will improve performance
# cv.car=cv.tree(tree.car)
# plot(cv.car$size ,cv.car$dev ,type='b') # shows that tree size of 5 gives the smallest deviance, which is SSE
# 
# # prune the tree to 5 nodes... though I think it's not actually trimming it because we had 5 nodes to begin with 
# prune.car=prune.tree(tree.car,best=5)
# plot(prune.car)
# text(prune.car,pretty=0)
# 
# 
# # use unpruned tree to predict on test set
# yhat=predict(tree.car,newdata=car[-train ,])
# car.test=car[-train ,"price"]
# plot(yhat,car.test)
# abline (0 ,1)
# MSE = mean((yhat-car.test)^2)
# MSE
# # the test set MSE associated with the regression tree is 0.05898431!!!!!
# RMSE = sqrt(MSE)
# RMSE
# # fit prof's test data
# test_x = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
# 
# set.seed(1)
# #train = sample(1:nrow(test_x), nrow(test_x)/2)
# #model_sample = sample(1:nrow(car), nrow(test_x))
# #tree.car.test = tree.car[model_sample]
# y_test=predict(tree.car,newdata=test_x)
# car.test=car[-train ,"price"]
# plot(y_test,car.test)
# abline (0 ,1)
# MSE = mean((yhat-car.test)^2)
# MSE
# # the test set MSE associated with the regression tree is 0.05898431!!!!!
# RMSE = sqrt(MSE)
# RMSE
# 
# head(exp(y_test), 100)
# 
# 
# typeof(tree.car.test)
# total.test.x[,'price'] = 0
# total.test.x = rbind(test_x, test_x, test_x[1:9466,])
# dim(total.test.x)

# car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
# attach(car)
# library(randomForest)
# car = car[complete.cases(car),]
# car$price = as.numeric(car$price)
# set.seed (1)
# train = sample(1:nrow(car), nrow(car)/2)
# # this line won't work and r studio crashs every time
# bag.car=randomForest(price~.-state-subTrim,data=car,subset=train,
# mtry=6,importance =TRUE)

cars = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
#dis = sapply(displacement, function(x) as.numeric(substr(x, 1, 3)))
#cars$displacement = dis
# add displacement to regression

attach(cars)

## The following objects are masked from car (pos = 3):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 4):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 6):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 7):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year
## 
## The following objects are masked from car (pos = 9):
## 
##     color, condition, displacement, featureCount, fuel,
##     isOneOwner, mileage, price, region, soundSystem, state,
##     subTrim, trim, wheelSize, wheelType, X, year

library(MASS)
library(glmnet)

## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2

# x=model.matrix(price~.-price-state-soundSystem-color,cars)[,-1]
# RMSE: 10530

# RMSE for this one: 10510.63 smallest so far
x=model.matrix(price~.-price-state,cars)[,-1]
y=cars$price
grid=10^seq(10,-2,length=100)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
# convert displacement into continuous

lasso.mod=glmnet(x[train ,],y[train],alpha=1,lambda=grid)
#plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train ,],y[train],alpha=1)
#plot(cv.out)
bestlam2=cv.out$lambda.min
bestlam2

## [1] 47.15771

lasso.pred=predict(lasso.mod,s=bestlam2,newx=x[test,])
MSE_L = mean((lasso.pred-y.test)^2) 
sqrt(MSE_L)

## [1] 10622.79

out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam2)
lasso.coef

## 79 x 1 sparse Matrix of class "dgCMatrix"
##                                        1
## (Intercept)                -7.921718e+06
## X                          -2.185611e-03
## trim350                    -3.724514e+03
## trim400                    -7.157224e+03
## trim420                     2.032887e+04
## trim430                    -1.212261e+03
## trim450                    -1.509760e+04
## trim500                     1.069347e+03
## trim55 AMG                  .           
## trim550                    -7.928914e+02
## trim600                     7.791229e+03
## trim63 AMG                  4.887285e+04
## trim65 AMG                  8.722786e+03
## trimunsp                    2.975153e+04
## subTrimunsp                 6.462886e+01
## conditionNew                3.566064e+04
## conditionUsed              -4.374118e+03
## isOneOwnert                -2.674443e+02
## mileage                    -1.324367e-01
## year                        3.970503e+03
## colorBlack                  .           
## colorBlue                  -1.294565e+02
## colorBronze                 1.387140e+03
## colorBrown                  .           
## colorGold                   1.017949e+03
## colorGray                  -8.791285e+02
## colorGreen                  .           
## colorPurple                 2.510035e+03
## colorRed                    .           
## colorSilver                -6.093285e+02
## colorTurquoise              .           
## colorunsp                   3.152759e+02
## colorWhite                  1.343328e+03
## colorYellow                -1.189119e+03
## displacement3.2 L           1.954720e+04
## displacement3.5 L           .           
## displacement3.7 L          -7.989703e+03
## displacement4.2 L           3.054166e+02
## displacement4.3 L           .           
## displacement4.6 L           .           
## displacement4.7 L           2.157101e+04
## displacement5.0 L           1.897196e+00
## displacement5.4 L           .           
## displacement5.5 L          -4.305265e+03
## displacement5.8 L          -4.028524e+03
## displacement6.0 L           4.280531e+04
## displacement6.3 L          -4.245640e+04
## displacement8.0 L           1.409807e+04
## displacementunsp            4.638631e+03
## fuelGasoline                .           
## fuelHybrid                 -2.579628e+03
## fuelunsp                    1.296575e+04
## regionESC                   .           
## regionMid                  -5.886578e+02
## regionMtn                   1.045707e+03
## regionNew                   .           
## regionPac                   1.258800e+02
## regionSoA                  -1.499476e+02
## regionunsp                 -6.691902e+03
## regionWNC                   1.203215e+03
## regionWSC                   8.982930e+02
## soundSystemBang Olufsen     2.256736e+03
## soundSystemBose            -1.115984e+03
## soundSystemBoston Acoustic  .           
## soundSystemHarman Kardon   -1.958110e+03
## soundSystemPremium          .           
## soundSystemunsp             3.133431e+02
## wheelTypeChrome             .           
## wheelTypePremium            .           
## wheelTypeSteel              1.569900e+04
## wheelTypeunsp               3.434597e+02
## wheelSize17                -3.717901e+03
## wheelSize18                -5.166717e+02
## wheelSize19                -1.974393e+01
## wheelSize20                 4.588211e+03
## wheelSize21                 .           
## wheelSize22                 3.195147e+03
## wheelSizeunsp               .           
## featureCount                .

car_tree

Vicky

July 26, 2015