library(tree)
library(ISLR)
# Hitters is made available by library(ISLR). It is deliberately NOT attached:
# attach() puts a copy of the columns on the search path, so a bare `Salary`
# would keep resolving to the raw data even after Hitters_narm$Salary is
# transformed further down. Columns are referenced explicitly instead.
# View(Hitters)
# ?Hitters

# Drop every row that contains at least one missing value.
Hitters_narm <- na.omit(Hitters)

# Distribution of the raw salaries (hist() ignores the missing ones).
# main/xlab are set so the plot is labelled exactly as hist(Salary) was.
hist(Hitters$Salary, main = "Histogram of Salary", xlab = "Salary")

#detach(Hitters)
#attach(Hitters_narm)
# Replace Salary with its natural logarithm so that the column is closer to
# normally distributed than the raw values; the histogram below shows the
# transformed salaries.
Hitters_narm[["Salary"]] <- log(Hitters_narm[["Salary"]])
hist(Hitters_narm$Salary)

# Regression tree for Salary (log scale at this point) using only Hits and
# Years as predictors, fitted on all complete rows.
tree.fit <- tree(
  formula = Salary ~ Hits + Years,
  data = Hitters_narm
)
summary(tree.fit)
##
## Regression tree:
## tree(formula = Salary ~ Hits + Years, data = Hitters_narm)
## Number of terminal nodes: 8
## Residual mean deviance: 0.2708 = 69.06 / 255
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.2400 -0.2980 -0.0365 0.0000 0.3233 2.1520

# plot() draws the tree skeleton; text() then adds the split and leaf labels.
plot(tree.fit)
text(tree.fit)
#install.packages("caret")
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

# Train/test split: p = 0.7 asks createDataPartition() to put 70% of the rows
# in the training set; the remaining rows become the test set.
# The partition is drawn at random, so the seed is fixed to make the split (and
# every result computed from it below) reproducible from run to run.
set.seed(1)
split <- createDataPartition(y = Hitters_narm$Salary, p = 0.7, list = FALSE)
nrow(split)
## [1] 185

# `split` holds row indices: keep them for training, drop them for testing.
train <- Hitters_narm[split, ]
test <- Hitters_narm[-split, ]
nrow(train)
## [1] 185
nrow(test)
## [1] 78

# The data viewer is only useful in an interactive session; skip it when the
# script is run in batch (Rscript, knitting, CI).
if (interactive()) {
  View(test)
}
# Grow a regression tree on the training rows, with every column other than
# Salary offered as a predictor, and draw it with its split labels.
trees <- tree(formula = Salary ~ ., data = train)
plot(trees)
text(trees)

# Cross-validate the fitted tree and plot the resulting object to help choose
# how far to prune.
cv.trees <- cv.tree(object = trees)
plot(cv.trees)

# First candidate: request the subtree of size 3 and inspect it.
prune.trees <- prune.tree(tree = trees, best = 3)
plot(prune.trees)
text(prune.trees)

# Second candidate: request size 4. This overwrites the object above, so the
# size-4 subtree is the one that remains in `prune.trees` afterwards.
prune.trees <- prune.tree(tree = trees, best = 4)
plot(prune.trees)
text(prune.trees)

# Predict Salary for the held-out rows with the pruned tree.
yhat <- predict(prune.trees, test)

# Observed and predicted values side by side. cbind() names the first column
# after the expression it was given, i.e. `test$Salary`; the second is `yhat`.
temp_data <- cbind(test$Salary, data.frame(yhat))

# The data viewer is only useful in an interactive session; skip it when the
# script is run in batch (Rscript, knitting, CI).
if (interactive()) {
  View(data.frame(yhat))
  View(data.frame(test$Salary))
  View(temp_data)
}
#temp_data$`test$Salary`<-exp(test$Salary)
#plot(temp_data$yhat,temp_data$`test$Salary`)

# Predicted vs. observed. Salary was log-transformed on Hitters_narm before the
# split, so both axes are on the log scale. The y = x line marks perfect
# predictions.
plot(
  temp_data$yhat, test$Salary,
  xlab = "Predicted log(Salary)",
  ylab = "Observed log(Salary)"
)
abline(0, 1)

# Test-set mean squared error, on the log-salary scale.
mean((temp_data$yhat - test$Salary)^2)
## [1] 0.3039835