Polynomial Regression
The formula is \(y = b_{0} + b_{1}x_{1} + b_{2}x_{1}^{2} + \dots + b_{n}x_{1}^{n}\).
used in Healthcare/ Epidemiology data
Position Salary dataset
# Load the position/salary data and keep only columns 2 and 3
# (the level and the salary).
salaries_path <- './Part 2 - Regression/Section 6 - Polynomial Regression/R/Position_Salaries.csv'
dataset <- read.csv(salaries_path)[2:3]
# The data set is small, so it is neither split into training/test sets
# nor feature-scaled.
# Baseline for comparison: simple linear regression of Salary on every
# other column currently in `dataset`.
lin_reg <- lm(formula = Salary ~ ., data = dataset)
summary(lin_reg)
Call:
lm(formula = Salary ~ ., data = dataset)
Residuals:
Min 1Q Median 3Q Max
-170818 -129720 -40379 65856 386545
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -195333 124790 -1.565 0.15615
Level 80879 20112 4.021 0.00383 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 182700 on 8 degrees of freedom
Multiple R-squared: 0.669, Adjusted R-squared: 0.6277
F-statistic: 16.17 on 1 and 8 DF, p-value: 0.003833
# Fitting Polynomial Regression to the dataset
# Build the polynomial features of the single predictor as extra columns
# (the same pattern extends to any degree). `^` works element-wise, so each
# new column holds the power of every Level value.
dataset[['Level2']] <- dataset[['Level']]^2
dataset[['Level3']] <- dataset[['Level']]^3
dataset[['Level4']] <- dataset[['Level']]^4
# `Salary ~ .` now picks up Level, Level2, Level3 and Level4 as predictors.
poly_reg <- lm(formula = Salary ~ ., data = dataset)
summary(poly_reg)
Call:
lm(formula = Salary ~ ., data = dataset)
Residuals:
1 2 3 4 5 6 7 8 9 10
-8357 18240 1358 -14633 -11725 6725 15997 10006 -28695 11084
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 184166.7 67768.0 2.718 0.04189 *
Level -211002.3 76382.2 -2.762 0.03972 *
Level2 94765.4 26454.2 3.582 0.01584 *
Level3 -15463.3 3535.0 -4.374 0.00719 **
Level4 890.2 159.8 5.570 0.00257 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 20510 on 5 degrees of freedom
Multiple R-squared: 0.9974, Adjusted R-squared: 0.9953
F-statistic: 478.1 on 4 and 5 DF, p-value: 1.213e-06
#---------------- Visualizing the Linear Regression results
# install.packages('ggplot2')
library(ggplot2)
# The data frame is handed to ggplot() once and columns are mapped by bare
# name; writing `dataset$...` inside aes() bypasses ggplot2's data masking
# and is discouraged.
ggplot(data = dataset, mapping = aes(x = Level, y = Salary)) + # x = indep, y = dep var
  geom_point(colour = 'red') + # observed salaries
  geom_line(aes(y = predict(lin_reg, newdata = dataset)), # salaries predicted by lin_reg
            colour = 'blue') +
  ggtitle('Truth or Bluff (Linear Regression)') +
  xlab('Level') +
  ylab('Salary')

# the linear regression line does not fit the real data points,
# this is clearly a polynomial problem
# predicted salaries are linear but real data points are polynomial
# real salary at level 5 = $110,000 vs linear prediction of about $209,000
#-------------- Visualizing the Polynomial Regression results
#
library(ggplot2)
# Same plot as for the linear model, but the line comes from poly_reg.
# Columns are mapped by bare name from `dataset` instead of `dataset$...`
# inside aes(), which ggplot2 discourages.
ggplot(data = dataset, mapping = aes(x = Level, y = Salary)) +
  geom_point(colour = 'red') + # observed salaries
  geom_line(aes(y = predict(poly_reg, newdata = dataset)), # salaries predicted by poly_reg
            colour = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')

# the predicted line fits better with the data points, curved line
#------------ Visualizing the Regression Model results
# (for higher resolution and smoother curve)
#
library(ggplot2)
# Fine grid of levels (step 0.1) between the smallest and largest observed level
x_grid <- seq(min(dataset$Level), max(dataset$Level), 0.1)
# The model was fitted on Level..Level4, so the grid needs the same columns
grid_data <- data.frame(Level = x_grid,
                        Level2 = x_grid^2,
                        Level3 = x_grid^3,
                        Level4 = x_grid^4)
grid_data$Salary <- predict(poly_reg, newdata = grid_data)
# Each layer gets its own data frame; aesthetics are mapped by column name
# rather than through `dataset$...` / loose vectors inside aes().
ggplot(mapping = aes(x = Level, y = Salary)) +
  geom_point(data = dataset, colour = 'red') + # observed salaries
  geom_line(data = grid_data, colour = 'blue') + # fitted curve on the grid
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')

# Predicting a new result with Linear Regression
# Wrap the new observation (Level 6.5) in a one-row data frame and score it
# with the baseline model. To store the value instead of printing it:
# y_pred.2 = predict(lin_reg, data.frame(Level = 6.5))
predict(object = lin_reg, newdata = data.frame(Level = 6.5))
1
330378.8
# Predicting a new result with Polynomial Regression
# The new row must carry the same polynomial columns the model was fitted on.
new_level <- 6.5
new_position <- data.frame(Level = new_level,
                           Level2 = new_level^2,
                           Level3 = new_level^3,
                           Level4 = new_level^4)
y_pred.3 <- predict(poly_reg, newdata = new_position)
y_pred.3
1
158862.5
Classification
Logistic Regression
Example: the relationship between age and whether an email action was taken (Y/N, coded 1/0). Linear regression is not the right model for a binary outcome, although it does show the trend between the variables. The sigmoid function forces the predicted values into the range 0 to 1, which gives the S-shaped curve that best fits the data. It is used for predicting a probability, \(\hat{p}\) (p_hat).
Logistic Regression formula: \(\ln\left(\frac{p}{1 - p}\right) = b_{0} + b_{1}x\)
Example: the x-axis shows 4 age values picked at random and the y-axis is \(\hat{p}\), read off the S-curve. A person aged 20 has a probability of 0.7% (\(\hat{p} = 0.007\)) of taking action on an email; a person aged 40 has \(\hat{p} = 85\%\). To turn the probability into a prediction \(\hat{y}\), any value below 50% is pushed down to 0 and any value above 50% is pushed up to 1, so the result is a binary 0/1 outcome.
# Importing the dataset: keep columns 3 to 5 only
ads_path <- './Part 3 - Classification/Section 14 - Logistic Regression/R/Social_Network_Ads.csv'
dataset <- read.csv(ads_path)[3:5]
# Encode the target as a factor with levels 0 and 1
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Split into training and test sets with caTools (SplitRatio = 0.75 on Purchased)
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Feature Scaling is best practice for Logistic Regression.
# The negative index [-3] selects every column EXCEPT the third (Purchased),
# so only the two predictors are standardised.
# NOTE(review): the test set is standardised with its own mean/sd rather than
# the training set's; kept unchanged so the recorded results stay reproducible.
predictor_cols <- -3
training_set[predictor_cols] <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(test_set[predictor_cols])
# Fitting Logistic Regression to the Training set
# glm() fits a generalized linear model; with family = binomial this is
# logistic regression. Purchased is modelled on all other columns
# (age and estimated salary).
classifier <- glm(formula = Purchased ~ ., family = binomial, data = training_set)
# Predicting the Test set results: type = 'response' asks for probabilities
prob_pred <- predict(classifier, newdata = test_set[-3], type = 'response')
# Threshold the probabilities at 0.5 to obtain 0/1 class predictions
y_pred.logr <- ifelse(prob_pred > 0.5, 1, 0)
# Making the Confusion Matrix (rows: actual class, columns: prediction).
# y_pred.logr is already 0/1; the extra `> 0.5` converts it to FALSE/TRUE,
# which is why the columns are labelled FALSE and TRUE.
cm <- table(test_set[, 3], y_pred.logr > 0.5)
cm
FALSE TRUE
0 57 7
1 10 26
# First predicted probability; the printed name is presumably the row label of that test observation
prob_pred[1]
2
0.01623954
# Corresponding 0/1 class prediction for the same observation
y_pred.logr[1]
2
0
The 1st predicted probability is 0.0162 and the 1st value in test_set is 0, which means user #2 is unlikely to purchase. Accordingly, y_pred.logr shows that the model predicted that user #2 will not purchase an item.
The model classified 83 test observations correctly (57 + 26) and 17 incorrectly (7 + 10).
# Visualising the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Classify every grid point: predicted probability thresholded at 0.5
prob_set <- predict(classifier, newdata = grid_set, type = 'response')
y_grid <- ifelse(prob_set > 0.5, 1, 0)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Logistic Regression (Training set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Classify every grid point: predicted probability thresholded at 0.5
prob_set <- predict(classifier, newdata = grid_set, type = 'response')
y_grid <- ifelse(prob_set > 0.5, 1, 0)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Logistic Regression (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

NA
NA
NA
Training set observations: the red points are observations where the dependent variable Purchased = 0 and the green points are observations where Purchased = 1. The red zone is the region where the classifier predicts no purchase, and the green zone is where it predicts a purchase. The classifier predicts that the higher the age and the estimated salary, the more likely the user is to purchase the item. The decision boundary is a straight line because logistic regression is a linear model. Focus on the colour of each dot and the zone it falls in.
K-Nearest Neighbor
KNN process:
- choose the number k of neighbors
- take the KNNs of the new data point, according to Euclidean distance
- among these KNNs, count the number of data points in each category
- assign the new data point to the category where you counted the most neighbors
- model is done
# Importing the dataset: keep columns 3 to 5 only
ads_path <- './Part 3 - Classification/Section 15 - K-Nearest Neighbors (K-NN)/R/Social_Network_Ads.csv'
dataset <- read.csv(ads_path)[3:5]
# Encode the target as a factor with levels 0 and 1
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Split into training and test sets with caTools (SplitRatio = 0.75 on Purchased)
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Feature Scaling: [-3] selects every column except the third (Purchased).
# NOTE(review): the test set is standardised with its own mean/sd rather than
# the training set's; kept unchanged so the recorded results stay reproducible.
predictor_cols <- -3
training_set[predictor_cols] <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(test_set[predictor_cols])
# Fitting classifier to the Training set
# build a KNN classifier
library(class)
# knn() fits and predicts in a single call: it takes the training predictors
# (third column removed), the test predictors, the training labels (third
# column) and k, and returns one predicted class per test row.
y_pred.KNN <- knn(
  train = training_set[, -3],
  test = test_set[, -3],
  cl = training_set[, 3],
  k = 5
)
# y_pred.KNN[1:5]
# Predicting the Test set results
# no separate predict() step is needed for KNN, so this stays commented out:
# y_pred = predict(classifier, newdata = test_set[-3])
# Making the Confusion Matrix (rows: actual class, columns: prediction)
cm <- table(test_set[, 3], y_pred.KNN)
cm
y_pred.KNN
0 1
0 59 5
1 6 30
#============ Visualizing the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# y_grid = predict(classifier, newdata = grid_set)
# For KNN the predict() call of the template is replaced by knn() itself,
# with the grid taking the place of the test set.
y_grid <- knn(
  train = training_set[, -3],
  test = grid_set,
  cl = training_set[, 3],
  k = 5
)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'KNN Classifier (Training set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# For KNN the predict() call of the template is replaced by knn() itself,
# with the grid taking the place of the test set (training data still
# supplies the neighbours).
y_grid <- knn(
  train = training_set[, -3],
  test = grid_set,
  cl = training_set[, 3],
  k = 5
)
# y_grid = predict(classifier, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'KNN Classifier (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

NA
NA
NA
Support Vector Machines
SVMs were first developed in the 1960s and refined in the 1990s. They separate the data points on a plot and classify them. Goal: find the best decision boundary, the maximum-margin hyperplane, i.e. the line whose margin (the distance between the line and the nearest points of each class) is as large as possible. The two lines that bound the margin on either side of the boundary are called the positive and the negative hyperplane.
classify apples and oranges, training on best examples of each fruit
# Importing the dataset: keep columns 3 to 5 only
ads_path <- './Part 3 - Classification/Section 16 - Support Vector Machine (SVM)/R/Social_Network_Ads.csv'
dataset <- read.csv(ads_path)[3:5]
# Encode the target as a factor with levels 0 and 1
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Split into training and test sets with caTools (SplitRatio = 0.75 on Purchased)
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Feature Scaling: [-3] selects every column except the third (Purchased).
# NOTE(review): the test set is standardised with its own mean/sd rather than
# the training set's; kept unchanged so the recorded results stay reproducible.
predictor_cols <- -3
training_set[predictor_cols] <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(test_set[predictor_cols])
# Fitting classifier to the Training set
# library e1071 provides svm(); see its documentation for the options
library(e1071)
# Linear-kernel SVM used as a classifier: Purchased modelled on all other columns
classifier.SVM <- svm(
  formula = Purchased ~ .,
  data = training_set,
  type = 'C-classification',
  kernel = 'linear'
)
# Predicting the Test set results (predictors only, third column dropped)
y_pred.SVM <- predict(classifier.SVM, newdata = test_set[-3])
# Making the Confusion Matrix (rows: actual class, columns: prediction)
cm <- table(test_set[, 3], y_pred.SVM)
cm
y_pred.SVM
0 1
0 57 7
1 13 23
#============= Visualizing the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.SVM, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'SVM Classifier (Training set)',
  xlab = 'Age',
  ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#============ Visualizing the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.SVM, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'SVM Classifier (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

NA
NA
Mapping the data points to a higher-dimensional space (by passing them through an algebraic mapping function) can make them separable by a hyperplane, but computing this mapping explicitly is very compute-intensive and not practical.
Kernel SVM
Decision boundaries for when data points are clustered in circles and is not linear. Gaussian RBF Kernel.
# Importing the dataset: keep columns 3 to 5 only
ads_path <- './Part 3 - Classification/Section 17 - Kernel SVM/R/Social_Network_Ads.csv'
dataset <- read.csv(ads_path)[3:5]
# Encode the target as a factor with levels 0 and 1
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Split into training and test sets with caTools (SplitRatio = 0.75 on Purchased)
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- dataset[split, ]
test_set <- dataset[!split, ]
# Feature Scaling: [-3] selects every column except the third (Purchased).
# NOTE(review): the test set is standardised with its own mean/sd rather than
# the training set's; kept unchanged so the recorded results stay reproducible.
predictor_cols <- -3
training_set[predictor_cols] <- scale(training_set[predictor_cols])
test_set[predictor_cols] <- scale(test_set[predictor_cols])
# Fitting classifier to the Training set
#=========== Kernel SVM
library(e1071)
# Same svm() call as the linear SVM, but with the radial (Gaussian) kernel
classifier.SVM <- svm(
  formula = Purchased ~ .,
  data = training_set,
  type = "C-classification",
  kernel = 'radial'
)
# Predicting the Test set results (predictors only, third column dropped)
y_pred <- predict(classifier.SVM, newdata = test_set[-3])
# Peek at the first five predictions
y_pred[1:5]
2 4 5 9 12
0 0 0 0 0
Levels: 0 1
# Confusion matrix: rows are the actual classes (third column of the test
# set), columns are the predicted classes
cm <- table(test_set[, 3], y_pred)
cm
y_pred
0 1
0 58 6
1 4 32
#============== Visualizing the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.SVM, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Kernel SVM Classifier (Training set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#================== Visualizing the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.SVM, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Kernel SVM Classifier (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Confusion matrix shows 10 incorrect results and 90 correct results
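The kernel SVM separates the classes as if the data had been mapped to a higher-dimensional (3D) space, which gives a curved boundary on the 2D plot. The green zone is where users are predicted to purchase the item and the red zone is where they are predicted not to purchase it.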
Naive Bayes Theorem Classification
Bayes Theorem of probability formula \(P(A|B)\) = \(P(B|A) * P(A) \div P(B)\)
Example: machine.1 makes 30 items/hr and machine.2 makes 20 items/hr. Out of all items made, 1 % is defective, and out of all defective items 50% came from machine.1 and 50% from machine.2. What is the probability that a item made by machine.2 is defective?
- P(Machine.2) = 20/50
- P(Defect) = 1%
- P(Machine.2 | Defect)= 50%
- P(Defect | Machine.2) = ?
P(Defect | Machine.2) = P(Machine.2 | Defect) * P(Defect) / P(Machine.2)
P(Defect | Machine.2) = 0.5 * 0.01 / 0.4 = 0.0125 (1.25%)
Example 2: Naive Bayes (assumes independence of the variables). x = age, y = salary, and the data points are grouped into "walks to work" or "drives to work". X = the features of a new data point.
P(Walks | X) = P(X | Walks) * P(Walks) / P(X)
P(Drives | X) = P(X | Drives) * P(Drives) / P(X)
repeat steps for both Walks and Drives class:
- prior probability: P(Walks)
- marginal likelihood: P(X)
- likelihood: P(X | Walks)
- posterior probability: P(Walks | X)
step 1: we have no data, so calculate the points in the walks group from a plot. - P(Walks) = number of walkers / total datapoints
step 2: select a radius on a plot, a circle to contain datapoints, look at features (age and salary) and these points will be similar. the probability of a new datapoint would fall into this radius. Count the number of points inside circle. - P(X) = number of observations / total observations
step 3: use the radius circle for similar features of datapoints, what is the likelihood of features of walkers given that person who walks (ignore the drivers). Count the number of datapoints for walkers inside the circle. - P(X | Walkers) = number of similar points for walkers / total number of walkers
step 4: P(Walks | X) = (3/10) * (10/30) / (4/30) == 0.75 (75% likelihood of datapoint being a Walker)
repeat for driver
step 4: P(Drives | X)= (1/20) * (20/30) / (4/30) = 0.25 (25%)
# Importing the dataset: keep columns 3 to 5 only
dataset <- read.csv('./Part 3 - Classification/Section 18 - Naive Bayes/R/Social_Network_Ads.csv')
dataset <- dataset[3:5]
# Encoding the target feature as factor
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Feature Scaling
# [-3] selects every column except the third (Purchased).
# The test set is standardised with the TRAINING set's centre and scale, so
# both sets share one scale and no test-set statistics enter preprocessing
# (scaling the test set on its own would shift it relative to the model).
train_scaled <- scale(training_set[-3])
training_set[-3] <- train_scaled
test_set[-3] <- scale(test_set[-3],
                      center = attr(train_scaled, 'scaled:center'),
                      scale = attr(train_scaled, 'scaled:scale'))
# Fitting classifier to the Training set
# === Bayes Classifier
library(e1071)
# press F1 for documentation when mouse is on function name
# naiveBayes() takes the predictors (x) and the class labels (y) separately
classifier.Bayes <- naiveBayes(
  x = training_set[-3],
  y = training_set$Purchased
)
# Predicting the Test set results (predictors only, third column dropped)
y_pred <- predict(classifier.Bayes, newdata = test_set[-3])
# Making the Confusion Matrix (rows: actual class, columns: prediction)
cm <- table(test_set[, 3], y_pred)
#============== Visualizing the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.Bayes, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Naive Bayes Classifier (Training set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#========== Visualizing the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier.Bayes, newdata = grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Naive Bayes Classifier (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

NA
NA
Decision Tree Classification

The Decision Tree Classifier first condition is age < 44.5 and salary < $90,000, our model classifies person as will not buy the item, if salary is > $90,000 person will buy the item . This was made by not running the code of feature scaling and visualizations, but the classifier and plotting function.
Random Forest Classification
Ensemble Learning = take multiple algorithms to make powerful algorithm
step 1: pick at random k data points from training set
step 2: build decision tree associated to these k data points
step 3: choose the number Ntree of trees you want to build and repeat steps 1 & 2
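step 4: for a new data point, make each one of your Ntree trees predict the category of the data point in question, and assign the new data point to the category that wins the majority vote (averaging the predicted Y values across the trees is the rule for random forest regression, not classification)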
This model is used for controller-free motion sensing on gaming consoles (Microsoft Kinect)
# Importing the dataset: keep columns 3 to 5 only
dataset <- read.csv('./Part 3 - Classification/Section 20 - Random Forest Classification/R/Social_Network_Ads.csv')
dataset <- dataset[3:5]
# Encoding the target feature as factor
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Feature Scaling
# [-3] selects every column except the third (Purchased).
# The test set is standardised with the TRAINING set's centre and scale, so
# both sets share one scale and no test-set statistics enter preprocessing
# (scaling the test set on its own would shift it relative to the model).
train_scaled <- scale(training_set[-3])
training_set[-3] <- train_scaled
test_set[-3] <- scale(test_set[-3],
                      center = attr(train_scaled, 'scaled:center'),
                      scale = attr(train_scaled, 'scaled:scale'))
# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
# Seed set right before fitting so the forest is reproducible
set.seed(123)
# 500 trees grown on the predictors (x) against the class labels (y)
classifier <- randomForest(
  x = training_set[-3],
  y = training_set$Purchased,
  ntree = 500
)
# Predicting the Test set results (predictors only, third column dropped)
y_pred <- predict(classifier, newdata = test_set[-3])
# Making the Confusion Matrix (rows: actual class, columns: prediction)
cm <- table(test_set[, 3], y_pred)
#========== Visualizing the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier, grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Random Forest Classification (Training set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#======== Visualizing the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both predictors: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('Age', 'EstimatedSalary')
# Predicted class for every grid point
y_grid <- predict(classifier, grid_set)
# Scatter frame of the two predictors
plot(
  set[, -3],
  main = 'Random Forest Classification (Test set)',
  xlab = 'Age', ylab = 'Estimated Salary',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundary between the two predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: green where class 1 is predicted, red otherwise
points(grid_set, pch = '.',
       col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
# Observations, filled by their actual class
points(set, pch = 21,
       bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Choosing the number of trees
# plot() on the fitted forest; presumably draws the error rate against the
# number of trees, which is how a suitable ntree is read off (confirm)
plot(classifier)

Association Rule
“people who bought X also bought Y”
Apriori algorithm
movie recommendation:
support(M) = # user watchlist containing M / # user watchlists
movie recommendation:
confidence(M1 -> M2) = # user watchlist containing M1, M2 / # user watchlists containing M1
movie recommendation:
lift(M1 -> M2) = confidence M1, M2 / support M2
market basket optimization:
support(J) = # transactions containing J / # transactions
market basket optimization:
confidence(J1 -> J2) = # transactions containing J1, J2 / # transactions containing J1
market basket optimization:
{what is the random likelihood that person likes this item? Lift is the improvement recommendation}
lift(J1 -> J2) = confidence J1, J2 / support J2
Process:
- set a min support and confidence
- take all the subsets in transactions having higher support than min support
- take all the rules of these subsets having higher confidence than min confidence
- sort the rules by decreasing lift, highest lift is the value you want
# Apriori
# Data Preprocessing
# install.packages('arules')
library(arules)
basket_path <- './Part 5 - Association Rule Learning/Section 28 - Apriori/R/Market_Basket_Optimisation.csv'
# Plain data-frame view of the file, read without a header row; it is
# replaced by the transactions object on the next statement.
dataset <- read.csv(basket_path, header = FALSE)
# sparse matrix: the same file read as transactions, items separated by
# commas, with rm.duplicates = TRUE
dataset <- read.transactions(basket_path, sep = ',', rm.duplicates = TRUE)
distribution of transactions with duplicates:
1
5
# Overview of the transactions object: size, density, most frequent items
# and the distribution of basket lengths
summary(dataset)
transactions as itemMatrix in sparse format with
7501 rows (elements/itemsets/transactions) and
119 columns (items) and a density of 0.03288973
most frequent items:
mineral water eggs spaghetti french fries chocolate (Other)
1788 1348 1306 1282 1229 22405
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18 19 20
1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4 1 2 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 3.914 5.000 20.000
includes extended item information - examples:
# Item-frequency plot restricted to the top 30 items (topN)
itemFrequencyPlot(dataset, topN = 30)

# Training Apriori on the dataset
# minimum support for items bought 3 times a day: (3 per day * 7 days) / 7500 transactions = 0.0028 => 0.003
# minimum support for items bought 4 times a day: (4 per day * 7 days) / 7500 transactions = 0.0037 => 0.004
# confidence value is arbitrary choice
#
# Thresholds for rule mining; smaller values let more rules through
apriori_params <- list(support = 0.004, confidence = 0.2)
rules <- apriori(data = dataset, parameter = apriori_params)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 30
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
sorting and recoding items ... [114 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [811 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Visualizing the results
# Order the rules by lift and show the first ten of that ordering
rules_by_lift <- sort(rules, by = 'lift')
inspect(rules_by_lift[1:10])
NA
the rules show that people who bought {light cream} will buy {chicken} 29% [Confidence] of the cases with a lift value of 4.84
Eclat
This algorithm is similar to Apriori above, but it works on sets of items and uses only the support measure.
simple results of items commonly purchased together using the support parameter
# Data Preprocessing
# install.packages('arules')
library(arules)
basket_path <- './Part 5 - Association Rule Learning/Section 29 - Eclat/R/Market_Basket_Optimisation.csv'
# The file is read without a header row (as in the Apriori section): with the
# default header = TRUE the first basket would be consumed as column names.
# This data frame is only a raw view; it is replaced on the next statement.
dataset <- read.csv(basket_path, header = FALSE)
# Same file read as a transactions object, items separated by commas,
# with rm.duplicates = TRUE
dataset <- read.transactions(basket_path, sep = ',', rm.duplicates = TRUE)
distribution of transactions with duplicates:
1
5
# Overview of the transactions object: size, density, most frequent items
# and the distribution of basket lengths
summary(dataset)
transactions as itemMatrix in sparse format with
7501 rows (elements/itemsets/transactions) and
119 columns (items) and a density of 0.03288973
most frequent items:
mineral water eggs spaghetti french fries chocolate (Other)
1788 1348 1306 1282 1229 22405
element (itemset/transaction) length distribution:
sizes
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 18 19 20
1754 1358 1044 816 667 493 391 324 259 139 102 67 40 22 17 4 1 2 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 3.000 3.914 5.000 20.000
includes extended item information - examples:
# Item-frequency plot restricted to the top 10 items (topN)
itemFrequencyPlot(dataset, topN = 10)

# Training Eclat on the dataset
# Only a support threshold is given; minlen = 2 asks for sets of at least two items
eclat_params <- list(support = 0.003, minlen = 2)
rules <- eclat(data = dataset, parameter = eclat_params)
Eclat
parameter specification:
algorithmic control:
Absolute minimum support count: 22
create itemset ...
set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
sorting and recoding items ... [115 item(s)] done [0.00s].
creating sparse bit matrix ... [115 row(s), 7501 column(s)] done [0.00s].
writing ... [1328 set(s)] done [0.01s].
Creating S4 object ... done [0.00s].
# Visualising the results
# Order the item sets by support and show the first ten of that ordering
sets_by_support <- sort(rules, by = 'support')
inspect(sets_by_support[1:10])
NA
NA
End of section
Principal Component Analysis (PCA)
Unsupervised algorithm for : noise filtering, visualization, feature extraction, time series predictions, gene data analysis.
Goal: identify patterns in data, detect the correlation between variables by reducing the dimensions
# Importing the dataset
dataset <- read.csv('./Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)
# Feature Scaling
# [-14] selects every column except the 14th, which is left unscaled.
# The test set is standardised with the TRAINING set's centre and scale, so
# both sets share one scale and no test-set statistics enter preprocessing
# (scaling the test set on its own would shift it relative to the model).
train_scaled <- scale(training_set[-14])
training_set[-14] <- train_scaled
test_set[-14] <- scale(test_set[-14],
                       center = attr(train_scaled, 'scaled:center'),
                       scale = attr(train_scaled, 'scaled:scale'))
# Applying PCA
# install.packages('caret')
library(caret)
Loading required package: lattice
Loading required package: ggplot2
Attaching package: ‘ggplot2’
The following object is masked from ‘package:randomForest’:
margin
Registered S3 method overwritten by 'data.table':
method from
print.data.table
# install.packages('e1071')
library(e1071)
# Learn a 2-component PCA from the training predictors only (column 14 excluded)
pca <- preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
# Apply the same transformation to both sets, then reorder the three
# resulting columns with c(2, 3, 1) so the former first column ends up last
# (later code treats columns 1-2 as PC1/PC2 and column 3 as the class).
training_set <- predict(pca, training_set)[c(2, 3, 1)]
test_set <- predict(pca, test_set)[c(2, 3, 1)]
# Fitting SVM to the Training set
# install.packages('e1071')
library(e1071)
# Linear-kernel SVM classifier: Customer_Segment modelled on the other columns
classifier <- svm(
  formula = Customer_Segment ~ .,
  data = training_set,
  type = 'C-classification',
  kernel = 'linear'
)
# Predicting the Test set results (third column dropped)
y_pred <- predict(classifier, newdata = test_set[-3])
# Making the Confusion Matrix (rows: actual class, columns: prediction)
cm <- table(test_set[, 3], y_pred)
# Visualising the Training set results
library(ElemStatLearn)
set <- training_set
# Grid over both components: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('PC1', 'PC2')
# Predicted class for every grid point
y_grid <- predict(classifier, newdata = grid_set)
# Scatter frame of the two components
plot(
  set[, -3],
  main = 'SVM (Training set)',
  xlab = 'PC1', ylab = 'PC2',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundaries between the predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: blue for class 2, green for class 1, red for anything else
points(grid_set, pch = '.',
       col = ifelse(y_grid == 2, 'deepskyblue',
                    ifelse(y_grid == 1, 'springgreen3', 'tomato')))
# Observations, filled by their actual class with the same colour scheme
points(set, pch = 21,
       bg = ifelse(set[, 3] == 2, 'blue3',
                   ifelse(set[, 3] == 1, 'green4', 'red3')))

# Visualising the Test set results
library(ElemStatLearn)
set <- test_set
# Grid over both components: observed range padded by 1, step 0.01
X1 <- seq(from = min(set[, 1]) - 1, to = max(set[, 1]) + 1, by = 0.01)
X2 <- seq(from = min(set[, 2]) - 1, to = max(set[, 2]) + 1, by = 0.01)
grid_set <- expand.grid(X1, X2)
names(grid_set) <- c('PC1', 'PC2')
# Predicted class for every grid point
y_grid <- predict(classifier, newdata = grid_set)
# Scatter frame of the two components
plot(
  set[, -3],
  main = 'SVM (Test set)',
  xlab = 'PC1', ylab = 'PC2',
  xlim = range(X1), ylim = range(X2)
)
# Decision boundaries between the predicted classes
contour(X1, X2,
        matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
        add = TRUE)
# Background: blue for class 2, green for class 1, red for anything else
points(grid_set, pch = '.',
       col = ifelse(y_grid == 2, 'deepskyblue',
                    ifelse(y_grid == 1, 'springgreen3', 'tomato')))
# Observations, filled by their actual class with the same colour scheme
points(set, pch = 21,
       bg = ifelse(set[, 3] == 2, 'blue3',
                   ifelse(set[, 3] == 1, 'green4', 'red3')))

NA
NA
