# Read the position/salary table from the working directory and keep only
# its second and third columns, which the rest of the script refers to as
# Level and Salary.
dataset <- read.csv("Position_Salaries.csv")
dataset <- dataset[, 2:3]
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.2
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
# Fix the RNG seed so the randomized forest fit below is repeatable.
set.seed(1234)

# Random forest regression of Salary on the first column of `dataset`,
# using 10 trees.
regressor <- randomForest(x = dataset[1], y = dataset$Salary, ntree = 10)

# Fine grid of Level values (step 0.01) spanning the observed range; the
# fitted curve is evaluated on this grid so its step shape is visible.
xgrid <- seq(from = min(dataset$Level), to = max(dataset$Level), by = 0.01)

# Observed data as red points, forest predictions over the grid as a blue line.
g2 <- ggplot() +
  geom_point(
    data = dataset,
    mapping = aes(x = Level, y = Salary),
    colour = "red"
  ) +
  geom_line(
    data = data.frame(
      Level = xgrid,
      Salary = predict(regressor, newdata = data.frame(Level = xgrid))
    ),
    mapping = aes(x = Level, y = Salary),
    colour = "blue"
  ) +
  labs(title = "Truth or Bluff(Random Forest)", x = "Level", y = "Salary")
g2

# Predicted salary from the 10-tree forest at Level 6.5.
y_forest_pred <- predict(regressor, newdata = data.frame(Level = 6.5))
y_forest_pred
## 1
## 141733.3
# Each interval (step) of the curve is predicted by all 10 trees, and the
# random forest takes the average of the 10 tree predictions for each step.
# More trees don't necessarily mean more steps.
# Same regression as before but with 100 trees. No seed is set in this
# section, so the fit depends on the RNG state at the time it runs.
regressor1 <- randomForest(x = dataset[1], y = dataset$Salary, ntree = 100)

# Fine grid of Level values (step 0.01) spanning the observed range.
xgrid <- seq(from = min(dataset$Level), to = max(dataset$Level), by = 0.01)

# Observed data as purple points, 100-tree predictions over the grid as a
# black line.
g2 <- ggplot() +
  geom_point(
    data = dataset,
    mapping = aes(x = Level, y = Salary),
    colour = "purple"
  ) +
  geom_line(
    data = data.frame(
      Level = xgrid,
      Salary = predict(regressor1, newdata = data.frame(Level = xgrid))
    ),
    mapping = aes(x = Level, y = Salary),
    colour = "black"
  ) +
  labs(
    title = "Truth or Bluff(Random Forest 100 trees)",
    x = "Level",
    y = "Salary"
  )
g2

# Random forest prediction with 100 trees at Level 6.5.
y_forest_pred1 <- predict(regressor1, newdata = data.frame(Level = 6.5))
y_forest_pred1
## 1
## 166281.7
# Using 100 trees doesn't mean more stairs; it means the stairs are better
# placed, so we get a better result. If we try 500 trees, the result is
# even better.