references: http://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html http://www.statmethods.net/advstats/cart.html
####### tree
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
library(tree)
#--------------------------------------------------
#fit a tree to boston data just using lstat.
#first get a big tree using a small value of mindev
temp = tree(price~year,data=car,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 12
#then prune it down to one with 7 leaves
car.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(car.tree$where)))
## [1] 7
#--------------------------------------------------
#plot the tree and the fits.
par(mfrow=c(1,2))
#plot the tree
plot(car.tree,type="uniform")
text(car.tree,col="blue",label=c("yval"),cex=.8)
#plot data with fit
car.fit = predict(car.tree) #get training fitted values
plot(year,price,cex=.5,pch=16) #plot data
oo=order(price) # sorting price, decreasing
lines(price[oo],car.fit[oo],col='red',lwd=3) #step function fit
cvals=c(2013.5, 2009.5, 2014.5, 2000.5, 2011.5, 2012.5) #cutpoints from tree
for(i in 1:length(cvals)) abline(v=cvals[i],col='magenta',lty=2) #cutpoints
rm(list=ls())
##################part 2
library(tree)
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
## The following objects are masked from car (pos = 4):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
#--------------------------------------------------
df2=car[,c(6,7,17)] #pick off mileage, year, price
print(names(df2))
## [1] "mileage" "year" "price"
#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 30
#--------------------------------------------------
#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(boston.tree$where)))
## [1] 7
#--------------------------------------------------
# plot tree and partition in x.
par(mfrow=c(1,2))
plot(boston.tree,type="u")
text(boston.tree,col="blue",label=c("yval"),cex=.8)
partition.tree(boston.tree)
rm(list=ls())
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
## The following objects are masked from car (pos = 3):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 5):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
#--------------------------------------------------
df2=car[,c(6,9,17)] #pick off variables
print(names(df2))
## [1] "mileage" "displacement" "price"
#--------------------------------------------------
#big tree
temp = tree(price~.,df2,mindev=.0001)
cat('first big tree size: \n')
## first big tree size:
print(length(unique(temp$where)))
## [1] 40
#then prune it down to one with 7 leaves
boston.tree=prune.tree(temp,best=7)
cat('pruned tree size: \n')
## pruned tree size:
print(length(unique(boston.tree$where)))
## [1] 7
#--------------------------------------------------
#get predictions on 2d grid
pv=seq(from=.01,to=.99,by=.05)
# The following 2 sections will not work because of dataset limitations
#x1q = quantile(df2$displacement,probs=pv,na.rm = TRUE)
#x2q = quantile(df2$mileage,probs=pv,na.rm = TRUE)
#xx = expand.grid(x1q,x2q) #matrix with two columns using all combinations of x1q and x2q
#dfpred = data.frame(mileage=xx[,2],displacement=xx[,1])
#lmedpred = predict(boston.tree,dfpred)
#make perspective plot
#par(mfrow=c(1,1))
# this one failed because 'increasing 'x' and 'y' values expected'
# Price and years do not span a big interval so they don't increase between some quartiles. That's why this problem can't be fixed here. We need a continuously changing variable to make this function work.
#persp(x1q,x2q,matrix(lmedpred,ncol=length(x2q),byrow=T),
# theta=150,xlab='dis',ylab='lstat',zlab='medv',
# zlim=c(min(df2$price),1.1*max(df2$price)))
library(tree)
library(rpart)
attach(car)
## The following objects are masked from car (pos = 4):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 5):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 7):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
#--------------------------------------------------
#reduce df to just lmed and lrat
bdf = car[,c(7,17)] #year and price
#--------------------------------------------------
#fit a big tree using rpart.control
big.tree = rpart(price~year,method="anova",data=bdf,
control=rpart.control(minsplit=5,cp=.0005))
nbig = length(unique(big.tree$where))
cat('size of big tree: ',nbig,'\n')
## size of big tree: 10
#--------------------------------------------------
#look at cross-validation
par(mfrow=c(1,1))
plotcp(big.tree)
#--------------------------------------------------
#show fit from some trees
oo=order(bdf$year)
bestcp=big.tree$cptable[which.min(big.tree$cptable[,"xerror"]),"CP"]
# because relative error doesn't change much when cp >= 0.0025, let's just set bestcp as 0.0025.
bestcp = 0.0025
cat('bestcp: ',bestcp,'\n')
## bestcp: 0.0025
cpvec = c(.0157,bestcp,.004)
par(mfrow=c(3,2))
for(i in 1:3) {
plot(bdf,pch=16,col='blue',cex=.5)
ptree = prune(big.tree,cp=cpvec[i])
pfit = predict(ptree)
lines(bdf$year[oo],pfit[oo],col='red',lwd=2)
title(paste('alpha = ',round(cpvec[i],3)))
plot(ptree,uniform=TRUE)
text(ptree,digits=3,cex=1)
}
#--------------------------------------------------
#plot best tree
#reference: http://rankexploits.com/musings/2011/margin-control-in-r-oma-and-mar-and-the-vanishing-axis-label/
# http://research.stowers-institute.org/mcm/efg/R/Graphics/Basics/mar-oma/index.htm
# still can't figure out why the edges of the graph are cut off???!!!
par(mfrow=c(1,1), mar=c(1,1,1,1), oma=c(0,0,0,0))
#par(mar=c(5,3,2,2)+0.1)
best.tree = prune(big.tree,cp=bestcp) # best tree is here!
plot(best.tree,uniform=TRUE) # get rid of margin etc to see better
box('figure',lty='solid', col='green')
text(best.tree,digits=2,use.n=TRUE,cex=1) # get rid of fancy parameters!
rm(list=ls())
car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
attach(car)
## The following objects are masked from car (pos = 3):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 5):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 6):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 8):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
tree.car = tree(log(price)~.-price-state,car)
summary(tree.car)
##
## Regression tree:
## tree(formula = log(price) ~ . - price - state, data = car)
## Variables actually used in tree construction:
## [1] "year"
## Number of terminal nodes: 5
## Residual mean deviance: 0.05981 = 1762 / 29460
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.61000 -0.11140 -0.02175 0.00000 0.12610 2.28200
plot(tree.car)
text(tree.car,pretty =0)
tree.car
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 29466 24480.0 10.790
## 2) year < 2009.5 10348 4672.0 9.777
## 4) year < 2006.5 5078 1114.0 9.208
## 8) year < 2002.5 2457 422.6 8.915 *
## 9) year > 2002.5 2621 282.0 9.483 *
## 5) year > 2006.5 5270 325.8 10.330 *
## 3) year > 2009.5 19118 3353.0 11.340
## 6) year < 2013.5 8356 472.4 10.920 *
## 7) year > 2013.5 10762 259.4 11.670 *
train=sample(1:nrow(car), 200)
car.test=car[-train ,]
price.test = car$price[-train]
# I don't think using a boolean variable for the predicted column is necessary...
#High=ifelse(,"No","Yes")
tree.car=tree(log(price)~.-price-state,car,subset=train)
tree.pred=predict(tree.car,car.test)
# this one probably caused stack overflow..literally
#table(tree.pred, price.test)
Regression tree!!! This one works the best!!! the test set MSE associated with the regression tree is 0.05898431!!!!!
# library(tree)
# car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
# set.seed(1)
# train = sample(1:nrow(car), nrow(car))
# # transform price into log
# #car$logprice = log(car$price)
# library(MASS)
# tree.Boston=tree(medv~.-medv,Boston)
# summary(tree.Boston)
#
# tree.car=tree(price~.-state-subTrim-isOneOwner-color-displacement-state-region-soundSystem,car,subset = train)
# summary(tree.car)
# plot(tree.car)
# text(tree.car,pretty=0)
# # every time MSE is different??!!
#
# # see whether pruning the tree will improve performance
# cv.car=cv.tree(tree.car)
# plot(cv.car$size ,cv.car$dev ,type='b') # shows that tree size of 5 gives the smallest deviance, which is SSE
#
# # prune the tree to 5 nodes... though I think it's not actually trimming it because we had 5 nodes to begin with
# prune.car=prune.tree(tree.car,best=5)
# plot(prune.car)
# text(prune.car,pretty=0)
#
#
# # use unpruned tree to predict on test set
# yhat=predict(tree.car,newdata=car[-train ,])
# car.test=car[-train ,"price"]
# plot(yhat,car.test)
# abline (0 ,1)
# MSE = mean((yhat-car.test)^2)
# MSE
# # the test set MSE associated with the regression tree is 0.05898431!!!!!
# RMSE = sqrt(MSE)
# RMSE
# # fit prof's test data
# test_x = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/Cars_X_out.csv", header = TRUE)
#
# set.seed(1)
# #train = sample(1:nrow(test_x), nrow(test_x)/2)
# #model_sample = sample(1:nrow(car), nrow(test_x))
# #tree.car.test = tree.car[model_sample]
# y_test=predict(tree.car,newdata=test_x)
# car.test=car[-train ,"price"]
# plot(y_test,car.test)
# abline (0 ,1)
# MSE = mean((yhat-car.test)^2)
# MSE
# # the test set MSE associated with the regression tree is 0.05898431!!!!!
# RMSE = sqrt(MSE)
# RMSE
#
# head(exp(y_test), 100)
#
#
# typeof(tree.car.test)
# total.test.x[,'price'] = 0
# total.test.x = rbind(test_x, test_x, test_x[1:9466,])
# dim(total.test.x)
# car = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
# attach(car)
# library(randomForest)
# car = car[complete.cases(car),]
# car$price = as.numeric(car$price)
# set.seed (1)
# train = sample(1:nrow(car), nrow(car)/2)
# # this line won't work and r studio crashs every time
# bag.car=randomForest(price~.-state-subTrim,data=car,subset=train,
# mtry=6,importance =TRUE)
cars = read.csv("/Users/vickyzhang/Documents/MSBA/predictive/project/cars.csv", header = TRUE)
#dis = sapply(displacement, function(x) as.numeric(substr(x, 1, 3)))
#cars$displacement = dis
# add displacement to regression
attach(cars)
## The following objects are masked from car (pos = 3):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 4):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 6):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 7):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
##
## The following objects are masked from car (pos = 9):
##
## color, condition, displacement, featureCount, fuel,
## isOneOwner, mileage, price, region, soundSystem, state,
## subTrim, trim, wheelSize, wheelType, X, year
library(MASS)
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
# x=model.matrix(price~.-price-state-soundSystem-color,cars)[,-1]
# RMSE: 10530
# RMSE for this one: 10510.63 smallest so far
x=model.matrix(price~.-price-state,cars)[,-1]
y=cars$price
grid=10^seq(10,-2,length=100)
train=sample(1:nrow(x), nrow(x)/2)
test=(-train)
y.test=y[test]
# convert displacement into continuous
lasso.mod=glmnet(x[train ,],y[train],alpha=1,lambda=grid)
#plot(lasso.mod)
set.seed(1)
cv.out=cv.glmnet(x[train ,],y[train],alpha=1)
#plot(cv.out)
bestlam2=cv.out$lambda.min
bestlam2
## [1] 47.15771
lasso.pred=predict(lasso.mod,s=bestlam2,newx=x[test,])
MSE_L = mean((lasso.pred-y.test)^2)
sqrt(MSE_L)
## [1] 10622.79
out=glmnet(x,y,alpha=1,lambda=grid)
lasso.coef=predict(out,type="coefficients",s=bestlam2)
lasso.coef
## 79 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -7.921718e+06
## X -2.185611e-03
## trim350 -3.724514e+03
## trim400 -7.157224e+03
## trim420 2.032887e+04
## trim430 -1.212261e+03
## trim450 -1.509760e+04
## trim500 1.069347e+03
## trim55 AMG .
## trim550 -7.928914e+02
## trim600 7.791229e+03
## trim63 AMG 4.887285e+04
## trim65 AMG 8.722786e+03
## trimunsp 2.975153e+04
## subTrimunsp 6.462886e+01
## conditionNew 3.566064e+04
## conditionUsed -4.374118e+03
## isOneOwnert -2.674443e+02
## mileage -1.324367e-01
## year 3.970503e+03
## colorBlack .
## colorBlue -1.294565e+02
## colorBronze 1.387140e+03
## colorBrown .
## colorGold 1.017949e+03
## colorGray -8.791285e+02
## colorGreen .
## colorPurple 2.510035e+03
## colorRed .
## colorSilver -6.093285e+02
## colorTurquoise .
## colorunsp 3.152759e+02
## colorWhite 1.343328e+03
## colorYellow -1.189119e+03
## displacement3.2 L 1.954720e+04
## displacement3.5 L .
## displacement3.7 L -7.989703e+03
## displacement4.2 L 3.054166e+02
## displacement4.3 L .
## displacement4.6 L .
## displacement4.7 L 2.157101e+04
## displacement5.0 L 1.897196e+00
## displacement5.4 L .
## displacement5.5 L -4.305265e+03
## displacement5.8 L -4.028524e+03
## displacement6.0 L 4.280531e+04
## displacement6.3 L -4.245640e+04
## displacement8.0 L 1.409807e+04
## displacementunsp 4.638631e+03
## fuelGasoline .
## fuelHybrid -2.579628e+03
## fuelunsp 1.296575e+04
## regionESC .
## regionMid -5.886578e+02
## regionMtn 1.045707e+03
## regionNew .
## regionPac 1.258800e+02
## regionSoA -1.499476e+02
## regionunsp -6.691902e+03
## regionWNC 1.203215e+03
## regionWSC 8.982930e+02
## soundSystemBang Olufsen 2.256736e+03
## soundSystemBose -1.115984e+03
## soundSystemBoston Acoustic .
## soundSystemHarman Kardon -1.958110e+03
## soundSystemPremium .
## soundSystemunsp 3.133431e+02
## wheelTypeChrome .
## wheelTypePremium .
## wheelTypeSteel 1.569900e+04
## wheelTypeunsp 3.434597e+02
## wheelSize17 -3.717901e+03
## wheelSize18 -5.166717e+02
## wheelSize19 -1.974393e+01
## wheelSize20 4.588211e+03
## wheelSize21 .
## wheelSize22 3.195147e+03
## wheelSizeunsp .
## featureCount .