# Import data set
dataset <-  read.csv("G:\\RStudio\\udemy\\ml\\Machine Learning AZ\\Part 2 - Regression\\Section 6 - Polynomial Regression\\Polynomial_Regression\\Position_Salaries.csv")
head(dataset, n = 10)
# Polynomial Regression
# taking care of missing values
# Test for missing values
sum(is.na(dataset$Position))
[1] 0
sum(is.na(dataset$Level))
[1] 0
sum(is.na(dataset$Salary))
[1] 0

There is no missing values in the dataset.

# We want to use only columns 2 and 3
dataset <-  dataset[2:3]
head(dataset)

The while the next step should be to split the dataset, since this dataset is small, we will use the full dataset.

# feature scaling
# in this case for polynomial, no need for feature scaling. 
# build the linear model for comparison with polynomial regression
lreg <-  lm(formula = Salary ~ Level, data = dataset)
summary(lreg)

Call:
lm(formula = Salary ~ Level, data = dataset)

Residuals:
    Min      1Q  Median      3Q     Max 
-170818 -129720  -40379   65856  386545 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  -195333     124790  -1.565  0.15615   
Level          80879      20112   4.021  0.00383 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 182700 on 8 degrees of freedom
Multiple R-squared:  0.669, Adjusted R-squared:  0.6277 
F-statistic: 16.17 on 1 and 8 DF,  p-value: 0.003833
# building the polynomial regression model
# add a new column in the dataframe
dataset$Level2 <-  dataset$Level^2
dataset$Level3 <-  dataset$Level^3
preg <-  lm(formula = Salary ~ .,
            data = dataset)
summary(preg)

Call:
lm(formula = Salary ~ ., data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-75695 -28148   7091  29256  49538 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept) -121333.3    97544.8  -1.244  0.25994   
Level        180664.3    73114.5   2.471  0.04839 * 
Level2       -48549.0    15081.0  -3.219  0.01816 * 
Level3         4120.0      904.3   4.556  0.00387 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 50260 on 6 degrees of freedom
Multiple R-squared:  0.9812,    Adjusted R-squared:  0.9718 
F-statistic: 104.4 on 3 and 6 DF,  p-value: 1.441e-05
# Visualising the Linear Regression results
# make sure you have installed the ggplot package
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour="red") +
  geom_line(aes(x = dataset$Level, y = predict(lreg, newdata = dataset)), colour="blue")+
  ggtitle("Truth or Bluff")+
  xlab("Levels") +
  ylab("Salary")

# visualizing polynomial regression
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
  geom_line(aes(x = dataset$Level, y = predict(preg, newdata = dataset)), colour = "blue")+
  ggtitle("Truth or Bluff (Polynomial-level3)") +
  xlab("Levels") +
  ylab("Salary")

# giving it more fit. by adding more levels 
dataset$Level4 <-  dataset$Level^4
head(dataset)
# recompute preg
preg <-  lm(formula = Salary ~ .,
            data = dataset)
summary(preg)

Call:
lm(formula = Salary ~ ., data = dataset)

Residuals:
     1      2      3      4      5      6      7      8      9     10 
 -8357  18240   1358 -14633 -11725   6725  15997  10006 -28695  11084 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  184166.7    67768.0   2.718  0.04189 * 
Level       -211002.3    76382.2  -2.762  0.03972 * 
Level2        94765.4    26454.2   3.582  0.01584 * 
Level3       -15463.3     3535.0  -4.374  0.00719 **
Level4          890.2      159.8   5.570  0.00257 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 20510 on 5 degrees of freedom
Multiple R-squared:  0.9974,    Adjusted R-squared:  0.9953 
F-statistic: 478.1 on 4 and 5 DF,  p-value: 1.213e-06
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), colour = "red") +
  geom_line(aes(x = dataset$Level, y = predict(preg, newdata = dataset)), colour = "blue")+
  ggtitle("Truth or Bluff (Polynomial-level4)") +
  xlab("Levels") +
  ylab("Salary")

# Predicting a new result with Linear Regression
# make a prediction for a single level 
# so syntax is going to change
y_pred <-  predict(lreg, data.frame(Level = 6.5))
y_pred
       1 
330378.8 
# predicting a result with polynomial Regression
y_pred <-  predict(preg, data.frame(Level = 6.5, Level2 = 6.5^2, Level3 = 6.5^3, Level4= 6.5^4))
y_pred
       1 
158862.5 

So the 160K that the person was asking for is not far from what we predicted.

LS0tDQp0aXRsZTogIlIgTm90ZWJvb2siDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KYGBge3J9DQojIEltcG9ydCBkYXRhIHNldA0KZGF0YXNldCA8LSAgcmVhZC5jc3YoIkc6XFxSU3R1ZGlvXFx1ZGVteVxcbWxcXE1hY2hpbmUgTGVhcm5pbmcgQVpcXFBhcnQgMiAtIFJlZ3Jlc3Npb25cXFNlY3Rpb24gNiAtIFBvbHlub21pYWwgUmVncmVzc2lvblxcUG9seW5vbWlhbF9SZWdyZXNzaW9uXFxQb3NpdGlvbl9TYWxhcmllcy5jc3YiKQ0KaGVhZChkYXRhc2V0LCBuID0gMTApDQpgYGANCg0KYGBge3J9DQojIFBvbHlub21pYWwgUmVncmVzc2lvbg0KIyB0YWtpbmcgY2FyZSBvZiBtaXNzaW5nIHZhbHVlcw0KIyBUZXN0IGZvciBtaXNzaW5nIHZhbHVlcw0Kc3VtKGlzLm5hKGRhdGFzZXQkUG9zaXRpb24pKQ0Kc3VtKGlzLm5hKGRhdGFzZXQkTGV2ZWwpKQ0Kc3VtKGlzLm5hKGRhdGFzZXQkU2FsYXJ5KSkNCmBgYA0KDQpUaGVyZSBpcyBubyBtaXNzaW5nIHZhbHVlcyBpbiB0aGUgZGF0YXNldC4gDQoNCmBgYHtyfQ0KIyBXZSB3YW50IHRvIHVzZSBvbmx5IGNvbHVtbnMgMiBhbmQgMw0KZGF0YXNldCA8LSAgZGF0YXNldFsyOjNdDQpoZWFkKGRhdGFzZXQpDQoNCmBgYA0KDQpUaGUgd2hpbGUgdGhlIG5leHQgc3RlcCBzaG91bGQgYmUgdG8gc3BsaXQgdGhlIGRhdGFzZXQsIHNpbmNlIHRoaXMgZGF0YXNldCBpcyBzbWFsbCwgd2Ugd2lsbCB1c2UgdGhlIGZ1bGwgZGF0YXNldC4gDQoNCmBgYHtyfQ0KIyBmZWF0dXJlIHNjYWxpbmcNCiMgaW4gdGhpcyBjYXNlIGZvciBwb2x5bm9taWFsLCBubyBuZWVkIGZvciBmZWF0dXJlIHNjYWxpbmcuIA0KDQpgYGANCmBgYHtyfQ0KIyBidWlsZCB0aGUgbGluZWFyIG1vZGVsIGZvciBjb21wYXJpc29uIHdpdGggcG9seW5vbWlhbCByZWdyZXNzaW9uDQpscmVnIDwtICBsbShmb3JtdWxhID0gU2FsYXJ5IH4gTGV2ZWwsIGRhdGEgPSBkYXRhc2V0KQ0Kc3VtbWFyeShscmVnKQ0KDQpgYGANCg0KYGBge3J9DQojIGJ1aWxkaW5nIHRoZSBwb2x5bm9taWFsIHJlZ3Jlc3Npb24gbW9kZWwNCiMgYWRkIGEgbmV3IGNvbHVtbiBpbiB0aGUgZGF0YWZyYW1lDQpkYXRhc2V0JExldmVsMiA8LSAgZGF0YXNldCRMZXZlbF4yDQpkYXRhc2V0JExldmVsMyA8LSAgZGF0YXNldCRMZXZlbF4zDQpwcmVnIDwtICBsbShmb3JtdWxhID0gU2FsYXJ5IH4gLiwNCiAgICAgICAgICAgIGRhdGEgPSBkYXRhc2V0KQ0Kc3VtbWFyeShwcmVnKQ0KYGBgDQoNCmBgYHtyfQ0KIyBWaXN1YWxpc2luZyB0aGUgTGluZWFyIFJlZ3Jlc3Npb24gcmVzdWx0cw0KIyBtYWtlIHN1cmUgeW91IGhhdmUgaW5zdGFsbGVkIHRoZSBnZ3Bsb3QgcGFja2FnZQ0KbGlicmFyeShnZ3Bsb3QyKQ0KZ2dwbG90KCkgKw0KICBnZW9tX3BvaW50KGFlcyh4ID0gZGF0YXNldCRMZXZlbCwgeSA9IGRhdGFzZXQkU2FsYXJ5KSwgY29sb3VyPSJyZWQiKSArDQogIGdlb21fbGluZShhZXMoeCA9IGRhdGFzZXQkTGV2ZWwsIHkgPSBwcmVkaWN0KGxyZWcsIG5ld2RhdGEgPSBkYXRhc2V0KSksIGNvbG91cj0iYmx1ZSIpKw0KICBnZ3RpdGxlKCJUcnV0aCBvciBCbHVmZiIpKw0KICB4bGFiKCJMZXZlbHMiKSArDQogIHlsYWIoIlNhbGFyeSIpDQoNCmBgYA0KDQpgYGB7cn0NCiMgdmlzdWFsaXppbmcgcG9seW5vbWlhbCByZWdyZXNzaW9uDQpnZ3Bsb3QoKSArDQogIGdlb21fcG9pbnQoYWVzKHggPSBkYXRhc2V0JExldmVsLCB5ID0gZGF0YXNldCRTYWxhcnkpLCBjb2xvdXIgPSAicmVkIikgKw0KICBnZW9tX2xpbmUoYWVzKHggPSBkYXRhc2V0JExldmVsLCB5ID0gcHJlZGljdChwcmVnLCBuZXdkYXRhID0gZGF0YXNldCkpLCBjb2xvdXIgPSAiYmx1ZSIpKw0KICBnZ3RpdGxlKCJUcnV0aCBvciBCbHVmZiAoUG9seW5vbWlhbC1sZXZlbDMpIikgKw0KICB4bGFiKCJMZXZlbHMiKSArDQogIHlsYWIoIlNhbGFyeSIpDQpgYGANCg0KYGBge3J9DQojIGdpdmluZyBpdCBtb3JlIGZpdC4gYnkgYWRkaW5nIG1vcmUgbGV2ZWxzIA0KZGF0YXNldCRMZXZlbDQgPC0gIGRhdGFzZXQkTGV2ZWxeNA0KaGVhZChkYXRhc2V0KQ0KYGBgDQoNCmBgYHtyfQ0KIyByZWNvbXB1dGUgcHJlZw0KcHJlZyA8LSAgbG0oZm9ybXVsYSA9IFNhbGFyeSB+IC4sDQogICAgICAgICAgICBkYXRhID0gZGF0YXNldCkNCnN1bW1hcnkocHJlZykNCg0KYGBgDQoNCmBgYHtyfQ0KZ2dwbG90KCkgKw0KICBnZW9tX3BvaW50KGFlcyh4ID0gZGF0YXNldCRMZXZlbCwgeSA9IGRhdGFzZXQkU2FsYXJ5KSwgY29sb3VyID0gInJlZCIpICsNCiAgZ2VvbV9saW5lKGFlcyh4ID0gZGF0YXNldCRMZXZlbCwgeSA9IHByZWRpY3QocHJlZywgbmV3ZGF0YSA9IGRhdGFzZXQpKSwgY29sb3VyID0gImJsdWUiKSsNCiAgZ2d0aXRsZSgiVHJ1dGggb3IgQmx1ZmYgKFBvbHlub21pYWwtbGV2ZWw0KSIpICsNCiAgeGxhYigiTGV2ZWxzIikgKw0KICB5bGFiKCJTYWxhcnkiKQ0KYGBgDQoNCmBgYHtyfQ0KIyBQcmVkaWN0aW5nIGEgbmV3IHJlc3VsdCB3aXRoIExpbmVhciBSZWdyZXNzaW9uDQojIG1ha2UgYSBwcmVkaWN0aW9uIGZvciBhIHNpbmdsZSBsZXZlbCANCiMgc28gc3ludGF4IGlzIGdvaW5nIHRvIGNoYW5nZQ0KDQp5X3ByZWQgPC0gIHByZWRpY3QobHJlZywgZGF0YS5mcmFtZShMZXZlbCA9IDYuNSkpDQp5X3ByZWQNCg0KIyBwcmVkaWN0aW5nIGEgcmVzdWx0IHdpdGggcG9seW5vbWlhbCBSZWdyZXNzaW9uDQp5X3ByZWQgPC0gIHByZWRpY3QocHJlZywgZGF0YS5mcmFtZShMZXZlbCA9IDYuNSwgTGV2ZWwyID0gNi41XjIsIExldmVsMyA9IDYuNV4zLCBMZXZlbDQ9IDYuNV40KSkNCnlfcHJlZA0KYGBgDQoNClNvIHRoZSAxNjBLIHRoYXQgdGhlIHBlcnNvbiB3YXMgYXNraW5nIGZvciBpcyBub3QgZmFyIGZyb20gd2hhdCB3ZSBwcmVkaWN0ZWQuIA0KDQo=