# Import data set
dataset <- read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 2 - Regression\\Section 8 - Decision Tree Regression\\Decision_Tree_Regression\\Position_Salaries.csv")
# We want to use only columns 2 and 3
dataset <- dataset[2:3]
head(dataset)
# Decision Tree Regression
# Taking care of missing values: test each column for NAs
sum(is.na(dataset$Level))
[1] 0
sum(is.na(dataset$Salary))
[1] 0
There are no missing values in the dataset.
While the next step would normally be to split the data into training and test sets, this dataset is small (only 10 observations), so we will use the full dataset.
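For reference, a typical split on a larger dataset could use the caTools package (a minimal sketch, not run here; caTools is one common choice, not required by this example):

# not run: illustrative 80/20 train/test split
# install.packages("caTools")
library(caTools)
set.seed(123)
split <- sample.split(dataset$Salary, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)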
# Feature scaling
# not needed for decision tree regression: splits compare feature values
# against thresholds, so the model is insensitive to feature scale
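If a scale-sensitive model (e.g. SVR) were used instead, scaling would be a one-liner; a sketch only, not applied here:

# not run: standardize both numeric columns
scaled_dataset <- as.data.frame(scale(dataset))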
# Fitting Decision Tree Regression to the dataset
# install.packages("rpart")
library(rpart)
package 'rpart' was built under R version 3.3.3
regressor <- rpart(formula = Salary ~ ., data = dataset)
summary(regressor)
Call:
rpart(formula = Salary ~ ., data = dataset)
n= 10
    CP nsplit rel error xerror xstd
1 0.01      0         1      0    0

Node number 1: 10 observations
  mean=249500, MSE=8.066225e+10
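The nsplit column shows 0: the tree made no splits at all. With only 10 observations, rpart's default minsplit of 20 (the minimum number of observations a node must have before a split is attempted) can never be satisfied, so the whole dataset stays in the root node. The default can be read off the fitted object:

regressor$control$minsplit
[1] 20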
# predicting a new result
y_pred <- predict(regressor, data.frame(Level = 6.5))
y_pred
[1] 249500
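Because every observation falls into that single root node, the prediction for any level, including 6.5, is simply the mean salary of the whole dataset. We can confirm this directly:

mean(dataset$Salary)
[1] 249500

This also explains the plot below: the model draws one flat line at 249500.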
# Plot
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)), colour = "blue") +
  ggtitle("Truth or Bluff (Regression Model)") +
  xlab("Levels") +
  ylab("Salary")
# Refitting with rpart.control so the tree is allowed to split
regressor <- rpart(formula = Salary ~ ., data = dataset, control = rpart.control(minsplit = 1))
summary(regressor)
Call:
rpart(formula = Salary ~ ., data = dataset, control = rpart.control(minsplit = 1))
n= 10
          CP nsplit  rel error   xerror      xstd
1 0.77638626      0 1.00000000 1.234568 0.7835133
2 0.15496716      1 0.22361374 1.148378 0.7931845
3 0.05217357      2 0.06864658 1.120316 0.7968327
4 0.01000000      3 0.01647301 1.120316 0.7968327
Variable importance
Level
100
Node number 1: 10 observations,    complexity param=0.7763863
  mean=249500, MSE=8.066225e+10
  left son=2 (8 obs) right son=3 (2 obs)
  Primary splits:
      Level < 8.5 to the left,  improve=0.7763863, (0 missing)

Node number 2: 8 observations,    complexity param=0.05217357
  mean=124375, MSE=6.921484e+09
  left son=4 (6 obs) right son=5 (2 obs)
  Primary splits:
      Level < 6.5 to the left,  improve=0.7600316, (0 missing)

Node number 3: 2 observations,    complexity param=0.1549672
  mean=750000, MSE=6.25e+10
  left son=6 (1 obs) right son=7 (1 obs)
  Primary splits:
      Level < 9.5 to the left,  improve=1, (0 missing)

Node number 4: 6 observations
  mean=82500, MSE=1.38125e+09

Node number 5: 2 observations
  mean=250000, MSE=2.5e+09

Node number 6: 1 observations
  mean=500000, MSE=0

Node number 7: 1 observations
  mean=1000000, MSE=0
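Beyond summary(), the fitted tree can be inspected graphically with base rpart plotting (a minimal sketch):

plot(regressor)
text(regressor)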
# predicting a new result
y_pred <- predict(regressor, data.frame(Level = 6.5))
y_pred
     1 
250000 
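Level 6.5 now falls down the tree into terminal node 5, the two observations with levels 7 and 8, so the prediction is their average salary rather than the global mean. This matches node 5's mean in the summary above:

mean(dataset$Salary[dataset$Level %in% c(7, 8)])
[1] 250000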
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)), colour = "blue") +
  ggtitle("Truth or Bluff (Regression Model)") +
  xlab("Levels") +
  ylab("Salary")
# visualizing in higher resolution
x_grid <- seq(min(dataset$Level), max(dataset$Level), 0.1)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))), colour = "blue") +
  ggtitle("Truth or Bluff (Regression Model)") +
  xlab("Levels") +
  ylab("Salary")