references:
http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html
http://www.statmethods.net/advstats/cart.html

####### tree

car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)

library(tree)



#--------------------------------------------------
#fit a tree to the car data using just year.

#first get a big tree using a small value of mindev
temp = tree(price~year,data=car,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 12
#then prune it down to one with 7 leaves
car.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(car.tree$where)))
## [1] 7
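#--------------------------------------------------
# instead of hard-coding 7 leaves, cross-validation can suggest a size;
# a minimal sketch using cv.tree (10-fold CV over the pruning sequence):
cvt = cv.tree(temp)
best.size = cvt$size[which.min(cvt$dev)] # size with the lowest CV deviance
car.tree.cv = prune.tree(temp, best=best.size)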
#--------------------------------------------------
#plot the tree and the fits.
par(mfrow=c(1,2))

#plot the tree
plot(car.tree,type="uniform")
text(car.tree,col="blue",label=c("yval"),cex=.8)

#plot data with fit
car.fit = predict(car.tree) #get training fitted values

plot(year,price,cex=.5,pch=16) #plot data
oo=order(year) # sort by year so the fitted step function draws left to right
lines(year[oo],car.fit[oo],col='red',lwd=3) #step function fit

cvals=c(2013.5, 2009.5, 2014.5, 2000.5, 2011.5, 2012.5) #cutpoints from tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2) #cutpoints
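#--------------------------------------------------
# rather than typing the cutpoints by hand, they can be read off the fitted
# tree; a sketch (the tree package stores splits as strings like "<2013.5"):
sp = car.tree$frame$splits[car.tree$frame$var != "<leaf>", "cutleft"]
cvals2 = as.numeric(sub("<", "", sp))
print(sort(cvals2))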

rm(list=ls())

################## part 2
library(tree)
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
#--------------------------------------------------
df2=car[,c(6,7,17)] #pick off mileage, year, price
print(names(df2))
## [1] "mileage" "year"    "price"
#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 30
#--------------------------------------------------
#then prune it down to one with 7 leaves
car.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(car.tree$where)))
## [1] 7
#--------------------------------------------------
# plot tree and partition in x.
par(mfrow=c(1,2))
plot(car.tree,type="u")
text(car.tree,col="blue",label=c("yval"),cex=.8)
partition.tree(car.tree)
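# the partition plot is easiest to read when we know which predictors the
# pruned tree actually splits on; a quick check using the tree's frame:
unique(as.character(car.tree$frame$var[car.tree$frame$var != "<leaf>"]))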

rm(list=ls())
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
#--------------------------------------------------
df2=car[,c(6,9,17)] #pick off variables
print(names(df2))
## [1] "mileage"      "displacement" "price"
#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 40
#then prune it down to one with 7 leaves
car.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(car.tree$where)))
## [1] 7
#--------------------------------------------------
#get predictions on 2d grid
pv=seq(from=.01,to=.99,by=.05)
# The following two sections do not run on this dataset (see the note below):
#x1q = quantile(df2$displacement,probs=pv,na.rm = TRUE)
#x2q = quantile(df2$mileage,probs=pv,na.rm = TRUE)
#xx = expand.grid(x1q,x2q) #matrix with two columns using all combinations of x1q and x2q
#dfpred = data.frame(mileage=xx[,2],displacement=xx[,1])
#gridpred = predict(car.tree,dfpred)

#make perspective plot

#par(mfrow=c(1,1))
# persp() failed with 'increasing 'x' and 'y' values expected': displacement
# takes only a handful of distinct values, so neighboring quantiles are tied
# and the x grid is not strictly increasing; persp() needs strictly increasing
# grids. A workaround with a seq()-based grid is sketched below.
#persp(x1q,x2q,matrix(gridpred,ncol=length(x2q),byrow=T),
#      theta=150,xlab='displacement',ylab='mileage',zlab='price',
#      zlim=c(min(df2$price),1.1*max(df2$price)))
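# a possible workaround (a sketch, assuming displacement and mileage are both
# numeric): build the grid with seq(), which is strictly increasing by
# construction, instead of quantiles that repeat when values are tied
x1g = seq(min(df2$displacement,na.rm=TRUE), max(df2$displacement,na.rm=TRUE), length.out=20)
x2g = seq(min(df2$mileage,na.rm=TRUE), max(df2$mileage,na.rm=TRUE), length.out=20)
gpred = predict(car.tree, expand.grid(displacement=x1g, mileage=x2g))
persp(x1g, x2g, matrix(gpred, nrow=length(x1g)),
      theta=150, xlab='displacement', ylab='mileage', zlab='price',
      zlim=c(min(df2$price), 1.1*max(df2$price)))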
library(tree)
library(rpart)
attach(car)
#--------------------------------------------------
#reduce df to just year and price
bdf = car[,c(7,17)] #year and price
#--------------------------------------------------
#fit a big tree using rpart.control
big.tree = rpart(price~year,method="anova",data=bdf,
                        control=rpart.control(minsplit=5,cp=.0005))
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
## size of big tree:  10
#--------------------------------------------------
#look at cross-validation
par(mfrow=c(1,1))
plotcp(big.tree)
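# the table behind plotcp can also be inspected directly; printcp shows the
# cross-validated error (xerror) at each candidate cp:
printcp(big.tree)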

#--------------------------------------------------
#show fit from some trees

oo=order(bdf$year)
bestcp=big.tree$cptable[which.min(big.tree$cptable[,"xerror"]),"CP"]
# relative error changes little once cp >= 0.0025, so just set bestcp to 0.0025
bestcp = 0.0025
cat('bestcp: ',bestcp,'\n')
## bestcp:  0.0025
cpvec = c(.0157,bestcp,.004)
par(mfrow=c(3,2))
for(i in 1:3) {
   plot(bdf,pch=16,col='blue',cex=.5)
   ptree = prune(big.tree,cp=cpvec[i])
   pfit = predict(ptree)
   lines(bdf$year[oo],pfit[oo],col='red',lwd=2)
   title(paste('alpha = ',round(cpvec[i],3)))
   plot(ptree,uniform=TRUE)
   text(ptree,digits=3,cex=1)
}

#--------------------------------------------------
#plot best tree

#reference: http://rankexploits.com/musings/2011/margin-control-in-r-oma-and-mar-and-the-vanishing-axis-label/
# http://research.stowers-institute.org/mcm/efg/R/Graphics/Basics/mar-oma/index.htm
# still can't figure out why the edges of the graph are cut off
par(mfrow=c(1,1), mar=c(1,1,1,1), oma=c(0,0,0,0))
#par(mar=c(5,3,2,2)+0.1)
best.tree = prune(big.tree,cp=bestcp) # best tree is here!
plot(best.tree,uniform=TRUE) # get rid of margin etc to see better
box('figure',lty='solid', col='green')
text(best.tree,digits=2,use.n=TRUE,cex=1) # get rid of fancy parameters!
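# a possible alternative for the clipped edges: the rpart.plot package
# computes its own margins, which usually avoids cut-off labels (a sketch;
# assumes the package is installed)
library(rpart.plot)
rpart.plot(best.tree, digits=2)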

rm(list=ls())
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
tree.car = tree(log(price)~.-price-state,car) # drop price (the response) and state (likely too many factor levels: tree() cannot split factors with more than 32 levels)
summary(tree.car)
## 
## Regression tree:
## tree(formula = log(price) ~ . - price - state, data = car)
## Variables actually used in tree construction:
## [1] "year"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.05981 = 1762 / 29460 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.61000 -0.11140 -0.02175  0.00000  0.12610  2.28200
plot(tree.car)
text(tree.car,pretty =0)

tree.car
## node), split, n, deviance, yval
##       * denotes terminal node
## 
## 1) root 29466 24480.0 10.790  
##   2) year < 2009.5 10348  4672.0  9.777  
##     4) year < 2006.5 5078  1114.0  9.208  
##       8) year < 2002.5 2457   422.6  8.915 *
##       9) year > 2002.5 2621   282.0  9.483 *
##     5) year > 2006.5 5270   325.8 10.330 *
##   3) year > 2009.5 19118  3353.0 11.340  
##     6) year < 2013.5 8356   472.4 10.920 *
##     7) year > 2013.5 10762   259.4 11.670 *
train=sample(1:nrow(car), 200)
car.test=car[-train ,]
price.test = car$price[-train]
# a boolean response (as in classification-tree examples) isn't needed here:
# price is continuous, so a regression tree is the right fit
#High=ifelse(,"No","Yes")
tree.car=tree(log(price)~.-price-state,car,subset=train)
tree.pred=predict(tree.car,car.test)
# this crashed: table() cross-tabulates every distinct predicted value against
# every distinct test price, which is enormous for a continuous response
#table(tree.pred, price.test)
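# for a continuous response, a summary statistic makes more sense than a
# cross-tabulation; e.g. test-set MSE on the log-price scale:
mean((tree.pred - log(price.test))^2)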

Regression tree: this approach works best here. The test-set MSE is 0.05898431 on the log-price scale, as computed below.

set.seed(1)
train = sample(1:nrow(car), nrow(car)/2)
tree.car=tree(log(price)~.-price-state,car,subset=train)
summary(tree.car)
## 
## Regression tree:
## tree(formula = log(price) ~ . - price - state, data = car, subset = train)
## Variables actually used in tree construction:
## [1] "year"
## Number of terminal nodes:  5 
## Residual mean deviance:  0.06066 = 893.4 / 14730 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.6050 -0.1158 -0.0217  0.0000  0.1274  2.2790
plot(tree.car)
text(tree.car,pretty=0)

# see whether pruning the tree will improve performance
cv.car=cv.tree(tree.car)
plot(cv.car$size,cv.car$dev,type='b') # a tree size of 5 gives the smallest deviance (SSE for a regression tree)
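# the same information numerically: the size minimizing CV deviance
# (a small sketch using the cv.tree output from above)
cv.car$size[which.min(cv.car$dev)]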

# prune the tree... though this doesn't actually trim anything, since the unpruned tree already has 5 terminal nodes
prune.car=prune.tree(tree.car,best=5)
plot(prune.car)
text(prune.car,pretty=0)

# transform price to the log scale so the test response matches the log-scale predictions
car$price = log(car$price)
# use unpruned tree to predict on test set
yhat=predict(tree.car,newdata=car[-train ,])
car.test=car[-train ,"price"]
plot(yhat,car.test)
abline(0,1)

MSE = mean((yhat-car.test)^2)
MSE
## [1] 0.05898431
# the test set MSE associated with the regression tree is 0.05898431
RMSE = sqrt(MSE)
err.factor = exp(RMSE)
err.factor
## [1] 1.274899
# RMSE is about 0.2428669 on the log-price scale, so exp(RMSE) is about 1.27:
# test predictions are typically within a multiplicative factor of about 1.27
# (roughly +/- 27%) of the true car price, not within $1.27 of it.
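# a quick worked check of the multiplicative reading: for a car whose true
# price is $50,000, a typical log-scale error of +/- RMSE corresponds to
50000 * exp(c(-RMSE, RMSE)) # roughly $39,200 to $63,700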