# Import data set
dataset <- read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 2 - Regression\\Section 6 - Polynomial Regression\\Polynomial_Regression\\Position_Salaries.csv")
head(dataset, n = 10)
# Polynomial Regression
# taking care of missing values
# Test for missing values
sum(is.na(dataset$Position))
[1] 0
sum(is.na(dataset$Level))
[1] 0
sum(is.na(dataset$Salary))
[1] 0
There is no missing values in the dataset.
# We want to use only columns 2 and 3
dataset <- dataset[2:3]
head(dataset)
The while the next step should be to split the dataset, since this dataset is small, we will use the full dataset.
# feature scaling
# in this case for polynomial, no need for feature scaling.
# build the linear model for comparison with polynomial regression
lreg <- lm(formula = Salary ~ Level, data = dataset)
summary(lreg)
Call:
lm(formula = Salary ~ Level, data = dataset)
Residuals:
Min 1Q Median 3Q Max
-170818 -129720 -40379 65856 386545
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -195333 124790 -1.565 0.15615
Level 80879 20112 4.021 0.00383 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 182700 on 8 degrees of freedom
Multiple R-squared: 0.669, Adjusted R-squared: 0.6277
F-statistic: 16.17 on 1 and 8 DF, p-value: 0.003833
# building the polynomial regression model
# add a new column in the dataframe
dataset$Level2 <- dataset$Level^2
dataset$Level3 <- dataset$Level^3
preg <- lm(formula = Salary ~ .,
data = dataset)
summary(preg)
Call:
lm(formula = Salary ~ ., data = dataset)
Residuals:
Min 1Q Median 3Q Max
-75695 -28148 7091 29256 49538
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -121333.3 97544.8 -1.244 0.25994
Level 180664.3 73114.5 2.471 0.04839 *
Level2 -48549.0 15081.0 -3.219 0.01816 *
Level3 4120.0 904.3 4.556 0.00387 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 50260 on 6 degrees of freedom
Multiple R-squared: 0.9812, Adjusted R-squared: 0.9718
F-statistic: 104.4 on 3 and 6 DF, p-value: 1.441e-05
# Visualising the Linear Regression results
# make sure you have installed the ggplot package
library(ggplot2)
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary), colour="red") +
geom_line(aes(x = dataset$Level, y = predict(lreg, newdata = dataset)), colour="blue")+
ggtitle("Truth or Bluff")+
xlab("Levels") +
ylab("Salary")
# visualizing polynomial regression
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
geom_line(aes(x = dataset$Level, y = predict(preg, newdata = dataset)), colour = "blue")+
ggtitle("Truth or Bluff (Polynomial-level3)") +
xlab("Levels") +
ylab("Salary")
# giving it more fit. by adding more levels
dataset$Level4 <- dataset$Level^4
head(dataset)
# recompute preg
preg <- lm(formula = Salary ~ .,
data = dataset)
summary(preg)
Call:
lm(formula = Salary ~ ., data = dataset)
Residuals:
1 2 3 4 5 6 7 8 9 10
-8357 18240 1358 -14633 -11725 6725 15997 10006 -28695 11084
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 184166.7 67768.0 2.718 0.04189 *
Level -211002.3 76382.2 -2.762 0.03972 *
Level2 94765.4 26454.2 3.582 0.01584 *
Level3 -15463.3 3535.0 -4.374 0.00719 **
Level4 890.2 159.8 5.570 0.00257 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 20510 on 5 degrees of freedom
Multiple R-squared: 0.9974, Adjusted R-squared: 0.9953
F-statistic: 478.1 on 4 and 5 DF, p-value: 1.213e-06
ggplot() +
geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
geom_line(aes(x = dataset$Level, y = predict(preg, newdata = dataset)), colour = "blue")+
ggtitle("Truth or Bluff (Polynomial-level4)") +
xlab("Levels") +
ylab("Salary")
# Predicting a new result with Linear Regression
# make a prediction for a single level
# so syntax is going to change
y_pred <- predict(lreg, data.frame(Level = 6.5))
y_pred
1
330378.8
# predicting a result with polynomial Regression
y_pred <- predict(preg, data.frame(Level = 6.5, Level2 = 6.5^2, Level3 = 6.5^3, Level4= 6.5^4))
y_pred
1
158862.5
So the 160K that the person was asking for is not far from what we predicted.