Machine Learning in R by Udemy

step 1 - import dataset

dataset = read.csv('Data.csv')

step 2 - missing data

Datasets often have missing data. A common practice is to remove rows with missing values, but this costs observations. A better practice is to replace missing values with the mean of their column.

  • to select a dataframe column, use $, combined with the ifelse() function
  • is.na() checks a column for missing (NA) values
  • ave() computes the average; its FUN argument takes a function, here one that computes the mean while ignoring NAs (na.rm = TRUE)
  • the same pattern fills in both the Age and Salary columns

dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

dataset$Age = ifelse(is.na(dataset$Age),
                     ave(dataset$Age, 
                         FUN = function(x) mean(x, na.rm = TRUE)),
                     dataset$Age) # return column
                
dataset$Salary = ifelse(is.na(dataset$Salary),
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
                        dataset$Salary)
dataset                       

Categorical data: categorical variables need to be encoded into a numerical data type. For this dataset, Country and Purchased are the categorical columns.

Step 3 - Encoding categorical data

Use the factor() function, which takes a categorical variable and stores it as levels (this reduces redundancy and memory usage). Its levels and labels arguments take c() vectors.

Example:

coffee.size = c("Small", "Medium", "Large", "XLarge")
coffee.size.factor = factor(coffee.size,
                            levels = c("Small", "Medium", "Large", "XLarge"),
                            ordered = TRUE,
                            labels = c(1, 2, 3, 4))
coffee.size.factor

dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

# want to convert countries to number representation 
dataset$Country = factor(dataset$Country,
                         levels = c('France', 'Spain', 'Germany'),
                         labels = c(1, 2, 3))

dataset$Purchased = factor(dataset$Purchased,
                           levels = c('No', 'Yes'),
                           labels = c(0, 1))
dataset                   

step 4 - Train/Test split of data

We need a library that makes the train/test split easy: caTools.

dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

# install.packages('caTools')

# load library
library(caTools)

set.seed(123)

#------ Train / Test split
#                    dataset$DependentVariable
split = sample.split(dataset$Purchased, 
                     SplitRatio = 0.8) # returns T/F, 

# True => Training set, False => Test set
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

dataset

step 5 - Feature Scaling

Many models rely on the Euclidean distance between data points (their x and y coordinates). When one variable has a much larger range than the others, its squared differences dominate the distance calculation. This is why variables need to be brought into the same scale of values.

Two feature scaling methods: standardization, (x - mean(x)) / sd(x), and normalization (min-max scaling), (x - min(x)) / (max(x) - min(x)).

Not all R libraries require feature scaling to be done beforehand; some handle it for you.

dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')
#------ Feature Scaling
# watch out for errors from categorical factors, factors are not numerical
# need to exclude categorical columns
# grab just the numerical, use slicing for columns 2 and 3 Age and Salary
# slice notation [, 2:3]

training_set[, 2:3] = scale(training_set[, 2:3])
test_set[, 2:3] = scale(test_set[, 2:3])   
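For intuition, here is the difference between the two scaling methods on a toy vector (illustrative only; scale() is what the code above uses):

x = c(10, 20, 30, 40, 50)

# standardization: (x - mean) / sd -- what scale() does by default
standardized = (x - mean(x)) / sd(x)

# normalization (min-max scaling): rescales values into [0, 1]
normalized = (x - min(x)) / (max(x) - min(x))

standardized
normalized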

End of Data Preprocessing


Simple Linear Regression

(dependent var) = (constant) + (coefficient, unit changes) * (Indep. var)

The formula: y = \(b_{0}\) + \(b_{1}\) * \(x_{1}\)

The method for finding the line of best fit is Ordinary Least Squares (OLS): choose \(b_{0}\) and \(b_{1}\) to minimise the sum of squared residuals between the observed and predicted values.
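As a sanity check with made-up numbers, lm()'s coefficients match the closed-form OLS solution \(b_{1}\) = cov(x, y) / var(x) and \(b_{0}\) = mean(y) - \(b_{1}\) * mean(x):

x = c(1, 2, 3, 4, 5)
y = c(2, 4, 5, 4, 6)
b1 = cov(x, y) / var(x)      # slope
b0 = mean(y) - b1 * mean(x)  # intercept
c(b0, b1)
coef(lm(y ~ x))              # same values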

Fitting Simple Linear Regression to the Training set

# step 1
dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

# preprocessing step 4: train/test split (no categorical data to encode here)
library(caTools)
set.seed(123)
split = sample.split(dataset$Salary, SplitRatio = 2/3)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# no feature scaling required here; lm() takes care of it for simple linear regression

#-------- Linear Regression lm() 
# the '~' means "as a function of"
# ~ separates dependent variable from the indep. var
# (dep. var) ~ (indep. var)
# how it will be plotted: (y-axis var) ~ (x-axis var)
regressor = lm(formula = Salary ~ YearsExperience,
               data = training_set)

#---- Predicting the Test set results
# predict salaries based on Years Experience
y_pred = predict(regressor, newdata = test_set)

summary(regressor)

Call:
lm(formula = Salary ~ YearsExperience, data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
 -7580  -4472  -1390   3586  12154 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      26547.8     2838.2   9.354 2.47e-08 ***
YearsExperience   9206.4      464.4  19.824 1.12e-13 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6098 on 18 degrees of freedom
Multiple R-squared:  0.9562,    Adjusted R-squared:  0.9538 
F-statistic:   393 on 1 and 18 DF,  p-value: 1.122e-13

The summary(regressor) output shows, in the Coefficients section, that YearsExperience has three stars, meaning it is highly statistically significant. The lower the p-value, the more significant: when p < 0.05, the independent variable (YearsExperience) is statistically significant.

Look at the salary for the first row of the test set and compare it with the predicted value:

Visualising the Training set results

# shift + Cmd + c   is shortcut for comment line
dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

library(ggplot2)
ggplot() +
  geom_point(aes(x = training_set$YearsExperience, # make sure you select correct dataset!
                 y = training_set$Salary),
             colour = 'red') + 
  geom_line(aes(x = training_set$YearsExperience, 
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Training set)') +
  xlab('Years of experience') +
  ylab('Salary')


# Red dots are the Real data
# blue line is the model prediction

Visualising the Test set results

dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

library(ggplot2)
ggplot() +
  geom_point(aes(x = test_set$YearsExperience, 
                 y = test_set$Salary),
             colour = 'red') +
  geom_line(aes(x = training_set$YearsExperience, 
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Test set)') +
  xlab('Years of experience') +
  ylab('Salary')           

Multiple Linear Regression Intuition

The formula is: y = \(b_{0}\) + \(b_{1}x_{1}\) + \(b_{2}x_{2}\) + ... + \(b_{n}x_{n}\)

Linear Regression Assumptions:

  • linearity
  • homoscedasticity
  • multivariate normality
  • independence of errors
  • lack of multicollinearity

Using the dataset 50_Startups

P-value

Is the result statistically significant?

  • \(H_{0}\) = Null hypothesis (default)
  • \(H_{1}\) = Alt hypothesis

Coin flip: the probability of landing tails n times in a row halves with each flip.

| Outcome | # flips | Probability |
|---------|---------|-------------|
| T       | 1       | 0.50        |
| T       | 2       | 0.25        |
| T       | 3       | 0.12        |
| T       | 4       | 0.06 (p < 0.05 = significant) |
| T       | 5       | 0.03        |
| T       | 6       | 0.01        |
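A one-liner to verify the probabilities in the table:

0.5 ^ (1:6)
# 0.5 0.25 0.125 0.0625 0.03125 0.015625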

Multiple Linear Regression Model Methods

Multiple Linear Regression

                   
# Importing the dataset
dataset = read.csv('./Part 2 - Regression/Section 5 - Multiple Linear Regression/R/50_Startups.csv')

# Encoding categorical data
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3)) # encoding 

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)

set.seed(123)

split = sample.split(dataset$Profit, SplitRatio = 0.8)

# ---- Train/Test split
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)                  
                   
#------ caTools library does not need feature scaling
# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)
# ------

#---- Fitting Multiple Linear Regression to the Training set

# (profit) ~ (column1 + column2 + column3 + column4)
# << shortcut >> is '.'
# (profit) ~ (all other columns)
# (profit) ~ .
regressor = lm(formula = Profit ~ .,
               data = training_set)


#---- Predicting the Test set results
y_pred = predict(regressor, 
                 newdata = test_set)                   

# see the summary statistics 
summary(regressor)

Call:
lm(formula = Profit ~ ., data = training_set)

Residuals:
   Min     1Q Median     3Q    Max 
-33128  -4865      5   6098  18065 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)      4.965e+04  7.637e+03   6.501 1.94e-07 ***
R.D.Spend        7.986e-01  5.604e-02  14.251 6.70e-16 ***
Administration  -2.942e-02  5.828e-02  -0.505    0.617    
Marketing.Spend  3.268e-02  2.127e-02   1.537    0.134    
State2           1.213e+02  3.751e+03   0.032    0.974    
State3           2.376e+02  4.127e+03   0.058    0.954    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9908 on 34 degrees of freedom
Multiple R-squared:  0.9499,    Adjusted R-squared:  0.9425 
F-statistic:   129 on 5 and 34 DF,  p-value: < 2.2e-16
# see the predictions
y_pred
        4         5         8        11        16        20        21        24        31 
173981.09 172655.64 160250.02 135513.90 146059.36 114151.03 117081.62 110671.31  98975.29 
       32 
 96867.03 

The summary statistics include State2 and State3, dummy variables R created from the factor encoded above. The p-values show the significance level of each independent variable; the lower the p-value, the more significant.

For the data above, R.D.Spend has a highly statistically significant (three stars ***) effect on Profit (the dependent variable). Takeaway: look at companies that spend money on R&D, as it plays the biggest role in profits.

Comparing real vs predicted profits for row 4: 182901.99 actual vs 173981.09 predicted.

Backward Elimination model

reuse the code above


dataset = read.csv('./Part 2 - Regression/Section 5 - Multiple Linear Regression/R/50_Startups.csv')


# Encoding categorical data
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3)) # encoding 

set.seed(123)

split = sample.split(dataset$Profit, SplitRatio = 0.8)

# ---- Train/Test split
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)   


# Goal: remove the non-statistically-significant independent variables.
#       Replace the '.' in the formula and iterate, removing one variable at a time.
# R replaces spaces/special characters in column names with dots (R&D Spend => R.D.Spend)

#  original:
#     Profit ~ R&D.Spend + Administration + Marketing.Spend + State

# step 1: choose a significance level to stay in the model, alpha = 0.05
# step 2: fit the model with all possible predictors, then refit after each
#         removal; the model below is what remains after removing State and
#         Administration (see steps 3-4 below)
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
               data = dataset)

# step 3: find the predictor with the highest p-value, under Pr(>|t|)
summary(regressor)

Call:
lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33645  -4632   -414   6484  17097 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared:  0.9505,    Adjusted R-squared:  0.9483 
F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
# step 4: remove the predictor with the highest p-value, then return to step 3
# earlier iterations found:
#       State2 p = 0.990 (> 0.05) ==> remove
#       State3 p = 0.943 (> 0.05) ==> remove
#       Admin  p = 0.602 (> 0.05) ==> remove

# back to step 2: refit without State, then without Administration

# The summary stats now show that Marketing.Spend has p = 0.06 with a '.',
# i.e. only weakly significant (just above the 0.05 threshold)

# step 5: optionally remove Marketing.Spend too, keeping only the single
#         highly significant variable (R.D.Spend); the model is then ready

Automate Backward Elimination in R

# automatic backward elimination
# note: the State factor expands into two dummy rows in the coefficient table,
# so the column index j only lines up approximately with the data frame;
# this is the course-provided helper, kept as-is apart from comments
backwardElimination <- function(x, sl) {
  numVars = length(x)
  for (i in c(1:numVars)) {
    regressor = lm(formula = Profit ~ ., data = x)
    # highest p-value among the predictors (rows 2..numVars skip the intercept)
    maxVar = max(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"])
    if (maxVar > sl) {
      # drop the column whose predictor has that highest p-value
      j = which(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"] == maxVar)
      x = x[, -j]
    }
    numVars = numVars - 1
  }
  return(summary(regressor))
}
  
SL = 0.05
dataset = dataset[, c(1,2,3,4,5)]
backwardElimination(training_set, SL)
                   

Polynomial Regression

The formula is y = \(b_{0}\) + \(b_{1}x_{1}\) + \(b_{2}x^{2}_{1}\) + ... + \(b_{n}x^{n}_{1}\).

Used, for example, in healthcare/epidemiology data (growth and spread curves).

Position Salary dataset

# Importing the dataset
dataset = read.csv('./Part 2 - Regression/Section 6 - Polynomial Regression/R/Position_Salaries.csv')

# keep only the columns Level and Salary
dataset = dataset[2:3]                    

# no data splitting due to small dataset
# no feature scaling needed

# for a baseline comparison, use Simple Linear Regression
lin_reg = lm(formula = Salary ~ .,
             data = dataset)

summary(lin_reg)

Call:
lm(formula = Salary ~ ., data = dataset)

Residuals:
    Min      1Q  Median      3Q     Max 
-170818 -129720  -40379   65856  386545 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)   
(Intercept)  -195333     124790  -1.565  0.15615   
Level          80879      20112   4.021  0.00383 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 182700 on 8 degrees of freedom
Multiple R-squared:  0.669, Adjusted R-squared:  0.6277 
F-statistic: 16.17 on 1 and 8 DF,  p-value: 0.003833
# Fitting Polynomial Regression to the dataset
# polynomial features of indep variables (to any degree you want)
# 1 indep + dep. vars
# add column using $ and name it

#   dataset$column^2 returns squared column for all level column values
dataset$Level2 = dataset$Level^2
#   dataset$column^3 returns cubed column for all level column values
dataset$Level3 = dataset$Level^3
dataset$Level4 = dataset$Level^4

poly_reg = lm(formula = Salary ~ .,
              data = dataset)

summary(poly_reg)

Call:
lm(formula = Salary ~ ., data = dataset)

Residuals:
     1      2      3      4      5      6      7      8      9     10 
 -8357  18240   1358 -14633 -11725   6725  15997  10006 -28695  11084 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)   
(Intercept)  184166.7    67768.0   2.718  0.04189 * 
Level       -211002.3    76382.2  -2.762  0.03972 * 
Level2        94765.4    26454.2   3.582  0.01584 * 
Level3       -15463.3     3535.0  -4.374  0.00719 **
Level4          890.2      159.8   5.570  0.00257 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 20510 on 5 degrees of freedom
Multiple R-squared:  0.9974,    Adjusted R-squared:  0.9953 
F-statistic: 478.1 on 4 and 5 DF,  p-value: 1.213e-06
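As a side note, the same fit can be produced without manually creating the Level2..Level4 columns, using base R's poly() helper with raw = TRUE (raw rather than orthogonal polynomials, so the coefficients match the manual columns):

poly_reg2 = lm(formula = Salary ~ poly(Level, 4, raw = TRUE),
               data = dataset)
summary(poly_reg2)  # same fit as poly_reg, different coefficient names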
#---------------- Visualizing the Linear Regression results
# install.packages('ggplot2')
library(ggplot2)

ggplot() + # x= indep  y= dep var
  geom_point(aes(x = dataset$Level, y = dataset$Salary), # real points
             colour = 'red') +       # predict function
  geom_line(aes(x = dataset$Level, y = predict(lin_reg, newdata = dataset)), # predicted
            colour = 'blue') +
  ggtitle('Truth or Bluff (Linear Regression)') +
  xlab('Level') +
  ylab('Salary')


# the linear regression line does not fit the real data points, 
# this is clearly a polynomial problem
# predicted salaries are linear but real data points are polynomial
# real salary level 5= $125,000 vs predicted= $240,000


#-------------- Visualizing the Polynomial Regression results
# 
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), # real
             colour = 'red') +      # predict function, change to poly_reg
  geom_line(aes(x = dataset$Level, y = predict(poly_reg, newdata = dataset)), # predicted
            colour = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')


# the predicted line fits better with the data points, curved line



#------------ Visualizing the Regression Model results 
# (for higher resolution and smoother curve)
# 
library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)

ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(poly_reg,
                                        newdata = data.frame(Level = x_grid,
                                                             Level2 = x_grid^2,
                                                             Level3 = x_grid^3,
                                                             Level4 = x_grid^4))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')


# Predicting a new result with Linear Regression
# make a prediction based on level 6.5
# make a new dataframe row
# y_pred.2 = predict(lin_reg, data.frame(Level = 6.5))

predict(lin_reg, data.frame(Level = 6.5))
       1 
330378.8 
# Predicting a new result with Polynomial Regression
# add polynomial features for each level column
y_pred.3 = predict(poly_reg, data.frame(Level = 6.5,
                             Level2 = 6.5^2,
                             Level3 = 6.5^3,
                             Level4 = 6.5^4))

y_pred.3
       1 
158862.5 

Support Vector Regression

# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 7 - Support Vector Regression (SVR)/R/Position_Salaries.csv')
dataset = dataset[2:3]

# Fitting Support Vector Regression to the dataset
# install.packages('e1071')
library(e1071)

regressor = svm(formula = Salary ~ .,
                data = dataset,
                type = 'eps-regression', # VERY IMPORTANT, options for type (see docs)
                kernel = 'radial')


# Predicting a new result (the value appears in RStudio's Environment panel)
y_pred = predict(regressor, data.frame(Level = 6.5))

#----------- Visualizing the SVR results
# install.packages('ggplot2')
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') + # real data points
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),
            colour = 'blue') + # predicted points
  ggtitle('Truth or Bluff (SVR 1)') +
  xlab('Level') +
  ylab('Salary')


#----------- Visualizing the SVR results 
# (for higher resolution and smoother curve)
# install.packages('ggplot2')
library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (SVR 2)') +
  xlab('Level') +
  ylab('Salary')

Decision Tree Regression

Decision Trees have 2 types (CART = Classification And Regression Trees): Classification Trees and Regression Trees (the more complex of the two).

Intuition: predicting a 3rd variable (y) from x1 and x2 by recursively splitting the (x1, x2) plane into regions and averaging y within each region.

# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 8 - Decision Tree Regression/R/Position_Salaries.csv')
dataset = dataset[2:3]

# Fitting Decision Tree Regression to the dataset
# install.packages('rpart')
library(rpart)

# RPART = Recursive Partitioning 
regressor = rpart(formula = Salary ~ .,
                  data = dataset,
                  control = rpart.control(minsplit = 1)) # minsplit = 1 allows splits with very few observations


# Predicting a new result with Decision Tree Regression
y_pred = predict(regressor, data.frame(Level = 6.5))

#--------------- Visualizing the Decision Tree Regression 
# results (higher resolution)
# install.packages('ggplot2')

library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)

ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Decision Tree Regression)') +
  xlab('Level') +
  ylab('Salary')


The blue line shows that the predicted salary (the average within the leaf) for level 6.5 is $250,000.

# Plotting the tree
plot(regressor)
text(regressor)

Random Forest Regression

Ensemble Learning = combining multiple algorithms (or many instances of one algorithm) to make a more powerful one.

# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 9 - Random Forest Regression/R/Position_Salaries.csv')
dataset = dataset[2:3]

#----- Fitting Random Forest Regression to the dataset
# install.packages('randomForest')
library(randomForest)

set.seed(1234) # common random seed in R

# -- build Random Forest Model
regressor = randomForest(x = dataset[-2],
                         y = dataset$Salary,
                         ntree = 500) # tune this value; 500 trees were used here

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))
# y_pred = 160908 for ntree= 500

#----------- Visualizing the Random Forest Regression Model 
# results (for higher resolution and smoother curve)
# install.packages('ggplot2')
library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Random Forest Regression)') +
  xlab('Level') +
  ylab('Salary')


The blue line shows the predicted salaries (averaged over the 500 trees); the model predicts a salary of $160,908 for level 6.5.

R Squared

\(R^2\) compares the sum of squared residuals of the fitted line against the total sum of squares around the average: \(R^2\) = 1 - SS_res / SS_tot. It answers: how good is your line compared to simply predicting the average? The closer \(R^2\) is to 1, the better!

adjusted R squared is used for multiple regression.

  • p = number of regressors
  • n= sample size
  • adj. \(R^2\) = 1 - (1 - \(R^2\)) (n - 1)/ (n - p - 1)

\(R^2\) measures how well the model has been fitted, and closer to 1 is better, but it is biased under the OLS method: \(R^2\) never decreases when variables are added, it only grows. Adjusted \(R^2\) penalizes additional variables, so it can decrease when an unhelpful variable is added. Dropping unhelpful columns moves the adjusted \(R^2\) closer to 1, which is what we want.
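Both values can be read off any fitted lm object; a minimal sketch, re-reading the 50_Startups file used earlier:

dataset_50 = read.csv('./Part 2 - Regression/Section 5 - Multiple Linear Regression/R/50_Startups.csv')
reg = lm(Profit ~ R.D.Spend + Marketing.Spend, data = dataset_50)
s = summary(reg)
s$r.squared       # multiple R-squared
s$adj.r.squared   # adjusted R-squared

# reproduce the adjusted value manually from the formula above
n = nrow(dataset_50)       # sample size
p = length(coef(reg)) - 1  # number of regressors (excluding intercept)
1 - (1 - s$r.squared) * (n - 1) / (n - p - 1)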

So for the models above, the 3rd and 4th models' adjusted \(R^2\) values:

  • lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset): adjusted \(R^2\) = 0.9483
  • lm(formula = Profit ~ R.D.Spend, data = dataset): adjusted \(R^2\) = 0.9454

Comparing the two: 0.9454 - 0.9483 = -0.0029, so the last model did slightly worse than the 3rd; removing Marketing.Spend went one step too far.

Understanding the coefficients

lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33645  -4632   -414   6484  17097 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared:  0.9505,    Adjusted R-squared:  0.9483 
F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16

A positive coefficient means a positive correlation: an increase in R.D.Spend comes with an increase in Profit, and vice versa.

The magnitude is the Estimate column, in units of the independent variable. R.D.Spend's estimate of 7.966e-01 means R&D spend has a greater per-unit impact on profit than marketing spend: for every unit ($1) increase in R&D spend, profit increases by about $0.80. Each dollar of Marketing.Spend adds about 3 cents of profit.

End of Part 2 – Regression

Classification

Logistic Regression

Consider the correlation between age and the action taken on an email (Y/N | 1/0). Linear regression is not the right model for this, although it does show a trend between the variables. The sigmoid function forces values between 0 and 1, an S-shaped curve on the plot that fits the variables much better. It is used for predicting a probability, \(\hat{p}\) (p_hat).

Logistic Regression formula: \(\ln(p / (1 - p))\) = \(b_{0}\) + \(b_{1}x\)

Example: take 4 age values at random on the x-axis; the y-axis is \(\hat{p}\) along the S-curve on the plot. A person of age 20 has a probability \(\hat{p}\) = 0.7 (70%) of taking action on an email; a person of age 40 has \(\hat{p}\) = 0.85 (85%). For classification, any probability below 50% is pushed down towards 0 and any value above 50% is pushed up towards 1, which yields a binary 0/1 outcome.
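A quick sketch of that S-curve, with illustrative (not fitted) coefficients b0 and b1:

b0 = -8   # illustrative intercept, not a fitted value
b1 = 0.2  # illustrative slope
age = seq(0, 80, by = 0.5)
p_hat = 1 / (1 + exp(-(b0 + b1 * age)))  # sigmoid keeps p_hat between 0 and 1
plot(age, p_hat, type = 'l', xlab = 'Age', ylab = 'p_hat')
abline(h = 0.5, lty = 2)  # the 0.5 cut-off used to binarise predictions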


# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 14 - Logistic Regression/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling is best practice for Logistic Regression
#  [-3] drops the 3rd column (Purchased), so only Age and EstimatedSalary are scaled
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting Logistic Regression to the Training set
# glm (generalized linear model) builds the logistic regression via family = binomial
# predict the dependent variable Purchased from the indep. variables: Age, EstimatedSalary
classifier = glm(formula = Purchased ~ ., 
                 family = binomial,
                 data = training_set)

# Predicting the Test set results
prob_pred = predict(classifier, 
                    type = 'response', 
                    newdata = test_set[-3])

y_pred.logr = ifelse(prob_pred > 0.5, 1, 0)

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.logr > 0.5)
cm
   
    FALSE TRUE
  0    57    7
  1    10   26
prob_pred[1]
         2 
0.01623954 
y_pred.logr[1]
2 
0 

The first predicted probability = 0.016 and the corresponding test-set value is 0: user #2 is unlikely to purchase. Using the y_pred.logr variable, the model predicted that user #2 will not purchase an item.

The model made (57 + 26) = 83 correct predictions and (10 + 7) = 17 incorrect ones.

# Visualising the Training set results
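# note: ElemStatLearn is no longer on CRAN; if install.packages() fails,
# it may need to be installed from the CRAN archive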
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(classifier, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
     main = 'Logistic Regression (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
prob_set = predict(classifier, type = 'response', newdata = grid_set)
y_grid = ifelse(prob_set > 0.5, 1, 0)
plot(set[, -3],
     main = 'Logistic Regression (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


Training set observations: the red points are observations where the dependent variable Purchased = 0 and the green points are observations where Purchased = 1. The red zone is the prediction region for non-purchase, the green zone for purchase. The classifier predicts that the higher the age and estimated salary, the more likely the user is to purchase the item. For linear models the classifier boundary is a straight line. Focus on each dot's colour and the zone it falls in.

K-Nearest Neighbor

KNN process:

  1. choose the number k of neighbors
  2. take the k nearest neighbors of the new data point, according to the Euclidean distance (see the snippet after this list)
  3. among these k neighbors, count the number of data points in each category
  4. assign the new data point to the category where you counted the most neighbors
  5. the model is done
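Step 2's Euclidean distance is straightforward to compute directly; a small sketch with made-up (age, salary) points:

p1 = c(35, 60000)  # (age, salary), illustrative values
p2 = c(40, 58000)
sqrt(sum((p1 - p2)^2))

# the built-in dist() gives the same value
dist(rbind(p1, p2))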

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 15 - K-Nearest Neighbors (K-NN)/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# build a KNN classifier
library(class)
# fit a KNN to Training set and Predict the Test set
# remove last column of training set
y_pred.KNN = knn(train= training_set[,-3],
                 test= test_set[, -3],
                 cl= training_set[, 3],
                 k= 5) 

# y_pred.KNN[1:5]


# Predicting the Test set results
# for KNN comment out 
# y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.KNN)
cm
   y_pred.KNN
     0  1
  0 59  5
  1  6 30
#============ Visualizing the Training set results
library(ElemStatLearn)
set = training_set

X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

# y_grid = predict(classifier, newdata = grid_set)

# for KNN replace the predict and its arguments with KNN arguments
y_grid = knn(train= training_set[,-3],
                 test= grid_set, # replace test_set
                 cl= training_set[, 3],
                 k= 5)

plot(set[, -3],
     main = 'KNN Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')

# for KNN replace the predict and its arguments with KNN arguments
y_grid = knn(train= training_set[,-3],
                 test= grid_set, # replace test_set
                 cl= training_set[, 3],
                 k= 5)
# y_grid = predict(classifier, newdata = grid_set)

plot(set[, -3], main = 'KNN Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


Support Vector Machines

SVMs were developed starting in the 1960s and refined in the 1990s. They separate data points on a plot and classify them. Goal: find the best decision boundary, the maximum-margin hyperplane: the line whose margin (the distance from the boundary to the nearest point on each side) is as large as possible. The points on the margin lines are the support vectors, and the margin lines themselves are the positive and negative hyperplanes.

Intuition for classifying apples and oranges: the boundary is determined by the most borderline examples of each fruit, not the typical ones.


# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 16 - Support Vector Machine (SVM)/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)

set.seed(123)

split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# library e1071 for SVM
library(e1071)

# read the documentation
classifier.SVM = svm(formula= Purchased ~ .,
                     data= training_set,
                     type= 'C-classification', # classification
                     kernel= 'linear'
                     )


# Predicting the Test set results
y_pred.SVM = predict(classifier.SVM, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.SVM)
cm
   y_pred.SVM
     0  1
  0 57  7
  1 13 23
#============= Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3],
     main = 'SVM Classifier (Training set)',
     xlab = 'Age', 
     ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))



#============ Visualizing the Test set results

library(ElemStatLearn)

set = test_set

X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

y_grid = predict(classifier.SVM, 
                 newdata = grid_set)
plot(set[, -3], main = 'SVM Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


Mapping data points to a higher-dimensional space (through some function) can make them linearly separable by a hyperplane. However, computing this mapping explicitly is very compute-intensive and not practical; the kernel trick (next section) achieves the same separation without the explicit mapping.

Kernel SVM

Kernel SVM produces non-linear decision boundaries, for example when one class's points sit in a circular cluster inside the other's. A common choice is the Gaussian RBF kernel.

              
 # Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 17 - Kernel SVM/R/Social_Network_Ads.csv')
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])


# Fitting classifier to the Training set
# Create your classifier here

#=========== Kernel SVM
library(e1071)

# Gaussian classifier, radial
classifier.SVM = svm(formula= Purchased ~ .,
                 data = training_set,
                 type = "C-classification",
                 kernel= 'radial')


# Predicting the Test set results
y_pred = predict(classifier.SVM, newdata = test_set[-3])
y_pred[1:5]
 2  4  5  9 12 
 0  0  0  0  0 
Levels: 0 1
# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)
cm
   y_pred
     0  1
  0 58  6
  1  4 32
#============== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3],
     main = 'Kernel SVM Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


#================== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3], main = 'Kernel SVM Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Confusion matrix shows 10 incorrect results and 90 correct results

The kernel SVM implicitly maps the data to a higher-dimensional space; the green zone is users predicted to purchase the item and the red zone users predicted not to purchase.

Naive Bayes Theorem Classification

Bayes Theorem of probability formula \(P(A|B)\) = \(P(B|A) * P(A) \div P(B)\)

Example: machine.1 makes 30 items/hr and machine.2 makes 20 items/hr. Out of all items made, 1% are defective, and of all defective items, 50% came from machine.1 and 50% from machine.2. What is the probability that an item made by machine.2 is defective?

  • P(Machine.2) = 20/50
  • P(Defect) = 1%
  • P(Machine.2 | Defect)= 50%
  • P(Defect | Machine.2) = ?

P(Defect | Machine.2) = P(Machine.2 | Defect) * P(Defect) / P(Machine.2) = 0.5 * 0.01 / 0.4 = 0.0125 (1.25%)

Example 2, Naive Bayes (which assumes independence of the variables): x = age, y = salary, with data points grouped into two classes, walks to work or drives to work. With X = the features of a new data point:

P(Walks | X) = P(X | Walks) * P(Walks) / P(X)
P(Drives | X) = P(X | Drives) * P(Drives) / P(X)

repeat steps for both Walks and Drives class:

  1. prior probability: P(Walks)
  2. marginal likelihood: P(X)
  3. likelihood: P(X | Walks)
  4. posterior probability: P(Walks | X)

step 1: count the data points in the walks group on the plot. - P(Walks) = number of walkers / total data points

step 2: choose a radius on the plot, a circle containing data points whose features (age and salary) are similar. P(X) is the probability that a new data point falls inside this radius. Count the points inside the circle. - P(X) = number of observations inside the circle / total observations

step 3: using the same circle, what is the likelihood of these features given that the person walks (ignore the drivers)? Count the walkers inside the circle. - P(X | Walks) = number of walkers inside the circle / total number of walkers

step 4: P(Walks | X) = (3/10) * (10/30) / (4/30) = 0.75 (75% likelihood of the data point being a walker)

repeat for the drivers class:
step 4: P(Drives | X)= (1/20) * (20/30) / (4/30) = 0.25 (25%)
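The posterior calculations above are plain arithmetic and can be checked directly in R:

# machine defect example
(0.5 * 0.01) / 0.4         # P(Defect | Machine.2) = 0.0125

# walks vs drives example
(3/10) * (10/30) / (4/30)  # P(Walks | X)  = 0.75
(1/20) * (20/30) / (4/30)  # P(Drives | X) = 0.25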

              
# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 18 - Naive Bayes/R/Social_Network_Ads.csv')
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# === Bayes Classifier
library(e1071)

# press F1 for documentation when mouse is on function name
classifier.Bayes = naiveBayes(x= training_set[-3],
                              y= training_set$Purchased)



# Predicting the Test set results
y_pred = predict(classifier.Bayes, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

#============== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.Bayes, newdata = grid_set)
plot(set[, -3],
     main = 'Naive Bayes Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


#========== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.Bayes, newdata = grid_set)
plot(set[, -3], main = 'Naive Bayes Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))            


Decision Tree Classification

The Decision Tree classifier's first conditions are age < 44.5 and salary < $90,000: in that case the model classifies the person as not buying the item; if salary is > $90,000, the person is classified as buying it. This tree plot was made by skipping the feature-scaling and grid-visualization code and running only the classifier and the plotting functions.
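The notes for this section do not include the code itself; a minimal sketch, assuming the same Social_Network_Ads preprocessing used in the other classification sections (the exact course code may differ):

# assumes dataset / training_set / test_set prepared as in the sections above,
# but WITHOUT feature scaling, so plot() shows splits in the original units
library(rpart)

classifier = rpart(formula = Purchased ~ .,
                   data = training_set,
                   method = 'class')  # classification (not regression) tree

# predicted classes for the test set
y_pred = predict(classifier, newdata = test_set[-3], type = 'class')

# confusion matrix
cm = table(test_set[, 3], y_pred)

# plotting the tree itself
plot(classifier)
text(classifier)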

Random Forest Classification

Ensemble Learning = combining multiple algorithms (or many instances of one algorithm) to make a more powerful one.

step 1: pick at random k data points from training set

step 2: build decision tree associated to these k data points

step 3: choose the number Ntree of trees you want to build and repeat steps 1 & 2

step 4: for a new data point, make each one of your Ntree trees predict the category of the data point in question, and assign the new data point to the category that wins the majority vote among the Ntree predictions

This model is used in controller-free gaming consoles (Microsoft Kinect).

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 20 - Random Forest Classification/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)
set.seed(123)
classifier = randomForest(x = training_set[-3],
                          y = training_set$Purchased,
                          ntree = 500)

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

#========== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, grid_set)

plot(set[, -3],
     main = 'Random Forest Classification (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


#======== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, grid_set)
plot(set[, -3], main = 'Random Forest Classification (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))




# Choosing the number of trees
plot(classifier)

Classification Model Evaluations

False Positives & False Negatives

False Positive (Type I error): a false alarm, the model predicts an event that does not happen. False Negative (Type II error): the more dangerous case, the model says there is nothing to see, but the event does happen.

Confusion Matrix

| Actual \ Pred. | 0  | 1  |
|----------------|----|----|
| 0              | 56 | 8  |
| 1              | 7  | 29 |

Accuracy Rate = correct / total = (56 + 29) / 100 = 85%
Error Rate = wrong / total = (8 + 7) / 100 = 15%
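Given a confusion matrix cm like the ones built throughout this part, both rates are one-liners:

accuracy = sum(diag(cm)) / sum(cm)  # correct predictions / total
error_rate = 1 - accuracy           # wrong predictions / total
accuracy
error_rate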

Cumulative Accuracy Profile (CAP)

The CAP curve is the curved line above the straight diagonal of the random model; it is also known as a Gain Chart. CAP analysis compares the area between the model's curve and the random line with the area between the perfect model's line and the random line.

ROC, the Receiver Operating Characteristic curve, is a different tool and not the same thing as CAP.

End of Part 3 - Classification


Clustering

In Clustering you don’t know what you are looking for; you are trying to identify segments or clusters in your data. When you use clustering algorithms on your dataset, unexpected things can pop up: structures, clusters, and groupings you would never have thought of otherwise.

K-Means Clustering

This finds the clusters for you.

Process:

  • step 1. choose the number K of clusters
  • step 2. select K points at random, the initial centroids
  • step 3. assign each data point to the closest centroid => forms K clusters
  • step 4. compute and place the new centroid of each cluster
  • step 5. reassign each data point to its new closest centroid; if any reassignment took place, go back to step 4, otherwise the model is finished

Select K using the "elbow method": plot WCSS against the number of clusters and pick the K at the bend of the curve (K = 3 or 4 based on the plot line).


# Importing the dataset
dataset = read.csv('./Part 4 - Clustering/Section 24 - K-Means Clustering/R/Mall_Customers.csv')
dataset = dataset[4:5]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Using the elbow method to find the optimal number of clusters
set.seed(6)
wcss = vector()
for (i in 1:10) wcss[i] = sum(kmeans(dataset, i)$withinss)
plot(1:10,
     wcss,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'WCSS')


# Fitting K-Means to the dataset
set.seed(29)
kmeans = kmeans(x = dataset, centers = 5)
y_kmeans = kmeans$cluster

# Visualising the clusters
library(cluster)
clusplot(dataset,
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of customers'),
         xlab = 'Annual Income',
         ylab = 'Spending Score')

Hierarchical Clustering

Agglomerative (bottom-up) and Divisive (top-down)

Dendrograms (section 27: video 178, 179)


# Importing the dataset
dataset = read.csv('./Part 4 - Clustering/Section 25 - Hierarchical Clustering/R/Mall_Customers.csv')
dataset = dataset[4:5]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Using the dendrogram to find the optimal number of clusters
dendrogram = hclust(d = dist(dataset, method = 'euclidean'), method = 'ward.D')
plot(dendrogram,
     main = paste('Dendrogram'),
     xlab = 'Customers',
     ylab = 'Euclidean distances')


# Fitting Hierarchical Clustering to the dataset
hc = hclust(d = dist(dataset, method = 'euclidean'), method = 'ward.D')
y_hc = cutree(hc, 5)

# Visualising the clusters
library(cluster)
clusplot(dataset,
         y_hc,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels= 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of customers'),
         xlab = 'Annual Income',
         ylab = 'Spending Score')


Association Rule

“people who bought X also bought Y”

Apriori algorithm


movie recommendation:
  support(M) = # user watchlists containing M / # user watchlists

movie recommendation:
  confidence(M1 -> M2) = # user watchlists containing M1 and M2 / # user watchlists containing M1

movie recommendation:
  lift(M1 -> M2) = confidence(M1 -> M2) / support(M2)


market basket optimization:
  support(J) = # transactions containing J / # transactions

market basket optimization:
  confidence(J1 -> J2) = # transactions containing J1 and J2 / # transactions containing J1

market basket optimization:
  {lift measures the improvement over the baseline likelihood that a person buys the item}
  lift(J1 -> J2) = confidence(J1 -> J2) / support(J2)

Process:

  1. set a min support and confidence
  2. take all the subsets in transactions having higher support than min support
  3. take all the rules of these subsets having higher confidence than min confidence
  4. sort the rules by decreasing lift, highest lift is the value you want
# Apriori

# Data Preprocessing
# install.packages('arules')
library(arules)

dataset = read.csv('./Part 5 - Association Rule Learning/Section 28 - Apriori/R/Market_Basket_Optimisation.csv', header = FALSE)

# sparse matrix
dataset = read.transactions('./Part 5 - Association Rule Learning/Section 28 - Apriori/R/Market_Basket_Optimisation.csv', sep = ',', rm.duplicates = TRUE)
distribution of transactions with duplicates:
1 
5 
summary(dataset)
transactions as itemMatrix in sparse format with
 7501 rows (elements/itemsets/transactions) and
 119 columns (items) and a density of 0.03288973 

most frequent items:
mineral water          eggs     spaghetti  french fries     chocolate       (Other) 
         1788          1348          1306          1282          1229         22405 

element (itemset/transaction) length distribution:
sizes
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   18   19   20 
1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17    4    1    2    1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   3.000   3.914   5.000  20.000 

includes extended item information - examples:
itemFrequencyPlot(dataset, topN = 30)


# Training Apriori on the dataset

# support for items bought 3x a day: (3 * 7) / 7500 = 0.0028 => round to 0.003
# support for items bought 4x a day: (4 * 7) / 7500 = 0.0037 => round to 0.004
# confidence value is arbitrary choice 
# 
rules = apriori(data = dataset, 
                parameter = list(support = 0.004,  
                                 confidence = 0.2)) # use small values for more rules
Apriori

Parameter specification:

Algorithmic control:

Absolute minimum support count: 30 

set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[119 item(s), 7501 transaction(s)] done [0.00s].
sorting and recoding items ... [114 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 done [0.00s].
writing ... [811 rule(s)] done [0.00s].
creating S4 object  ... done [0.00s].
# Visualizing the results
# get the highest rules by lift
inspect(sort(rules, by = 'lift')[1:10])

The rules show that people who bought {light cream} also buy {chicken} in 29% of cases [confidence], with a lift value of 4.84.

Eclat

This algorithm is similar to Apriori, but it uses only the support measure, computed on itemsets.

It gives simple results: sets of items commonly purchased together, ranked by the support parameter.


# Data Preprocessing
# install.packages('arules')
library(arules)
dataset = read.csv('./Part 5 - Association Rule Learning/Section 29 - Eclat/R/Market_Basket_Optimisation.csv')
dataset = read.transactions('./Part 5 - Association Rule Learning/Section 29 - Eclat/R/Market_Basket_Optimisation.csv', sep = ',', rm.duplicates = TRUE)
distribution of transactions with duplicates:
1 
5 
summary(dataset)
transactions as itemMatrix in sparse format with
 7501 rows (elements/itemsets/transactions) and
 119 columns (items) and a density of 0.03288973 

most frequent items:
mineral water          eggs     spaghetti  french fries     chocolate       (Other) 
         1788          1348          1306          1282          1229         22405 

element (itemset/transaction) length distribution:
sizes
   1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   18   19   20 
1754 1358 1044  816  667  493  391  324  259  139  102   67   40   22   17    4    1    2    1 

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.000   2.000   3.000   3.914   5.000  20.000 

includes extended item information - examples:
itemFrequencyPlot(dataset, topN = 10)


# Training Eclat on the dataset
rules = eclat(data = dataset, parameter = list(support = 0.003, minlen = 2))
Eclat

parameter specification:

algorithmic control:

Absolute minimum support count: 22 

create itemset ... 
set transactions ...[119 item(s), 7501 transaction(s)] done [0.01s].
sorting and recoding items ... [115 item(s)] done [0.00s].
creating sparse bit matrix ... [115 row(s), 7501 column(s)] done [0.00s].
writing  ... [1328 set(s)] done [0.01s].
Creating S4 object  ... done [0.00s].
# Visualising the results
inspect(sort(rules, by = 'support')[1:10])
End of section

Principal Component Analysis (PCA)

An unsupervised algorithm used for: noise filtering, visualization, feature extraction, time-series prediction, and gene data analysis.

Goal: identify patterns in the data and detect correlation between variables by reducing the number of dimensions.

# Importing the dataset
dataset = read.csv('./Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-14] = scale(training_set[-14])
test_set[-14] = scale(test_set[-14])

# Applying PCA
# install.packages('caret')
library(caret)
# install.packages('e1071')
library(e1071)
pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2)
training_set = predict(pca, training_set)
training_set = training_set[c(2, 3, 1)]
test_set = predict(pca, test_set)
test_set = test_set[c(2, 3, 1)]
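As an aside, base R's prcomp() performs the same kind of decomposition; a quick sketch to inspect how much variance the leading components explain (column 14 of the Wine data is the class label):

pca_base = prcomp(dataset[-14], scale. = TRUE)  # center and scale the 13 features
summary(pca_base)  # proportion of variance explained by each component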

# Fitting SVM to the Training set
# install.packages('e1071')
library(e1071)
classifier = svm(formula = Customer_Segment ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

# Visualising the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))


# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))     

NA
NA
---
title: "**Learning R Notebook**"
author: "Zane Dax"
output: 
  html_notebook: 
    toc: yes
---
<style>
body{font-size: 16px;}
h1 {color: #9900cc;}
</style>

### Machine Learning in R by **Udemy**

# step 1 - import dataset 
``dataset = read.csv('Data.csv')``

# step 2 - missing data
Often datasets have missing data, and the common practice is to remove missing data rows but it has a negative impact on observations. 
Best practice is to take the means of each columns with missing data. 

- to select the dataframe columns, use ``$``, if-else function
- ``is.na`` is a function that checks for NaNs in columns.
- to get the average use ``ave()``
- create a function, here it's named FUN, to get the mean of column Age and return True
- get the average of column Salary and get the function to check for missing values and return True

```{r}
dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

dataset$Age = ifelse(is.na(dataset$Age),
                     ave(dataset$Age, 
                         FUN = function(x) mean(x, na.rm = TRUE)),
                     dataset$Age) # return column
                
dataset$Salary = ifelse(is.na(dataset$Salary),
                        ave(dataset$Salary, FUN = function(x) mean(x, na.rm = TRUE)),
                        dataset$Salary)
dataset                       
```                     
                        
                        
Categorical data
Need to encode categorical data into numerical data type

- for this dataset Country is categorical, need to encode

# Step 3 - Encoding categorical data
Need the ``factor()`` function which takes categorical variables and stores it in levels (helps reduce redundancy and CPU memory). Requires ``c()`` vectors. 

Example:

- ``coffee.size = c("Small","Medium","Large","XLarge")``
- ``coffee.size.factor = factor(coffee.size)``
- ``coffee.size.factor = factor(levels= coffee.size, ordered = T, labels = c(1,2,3,4))``
- ``coffee.size.factor``


```{r factor categorical data}
dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

# want to convert countries to number representation 
dataset$Country = factor(dataset$Country,
                         levels = c('France', 'Spain', 'Germany'),
                         labels = c(1, 2, 3))

dataset$Purchased = factor(dataset$Purchased,
                           levels = c('No', 'Yes'),
                           labels = c(0, 1))
dataset                   
```

# step 4 - Train/Test split of data
Need to import a library that makes Train Test split easy.

```{r train-test-split}
dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')

# install.packages('caTools')

# load library
library(caTools)

set.seed(123)

#------ Train / Test split
#                    dataset$DependentVariable
split = sample.split(dataset$Purchased, 
                     SplitRatio = 0.8) # returns T/F, 

# True => Training set, False => Test set
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

dataset
```                       


# step 5 - Feature Scaling
Euclidean Distance between coordinates == x and y values, when one variables has more variance it becomes dominant in data when they are squared. This is why variables need to be in the same scale value range.

Two Feature Scaling methods:

- **Standardization** : $X_{stand}$ = $x$ -  mean($x$) $\div$ std($x$)
- **Normalization** : $X_{norm}$ = $x$ -  min($x$) $\div$ max($x$) - min($x$)                   

Not all R libraries require feature scaling to be done beforehand, some deal with it for you.

```{r Feature Scaling}    
dataset = read.csv('./Part 1 Data Preprocessing/R/Data.csv')
#------ Feature Scaling
# watch out for errors from categorical factors, factors are not numerical
# need to exclude categorical columns
# grab just the numerical, use slicing for columns 2 and 3 Age and Salary
# slice notation [, 2:3]

training_set[, 2:3] = scale(training_set[, 2:3])
test_set[, 2:3] = scale(test_set[, 2:3])   



```                      
                        
End of Data Preprocessing

<hr>

# Simple **Linear Regression**
(dependent var) = (constant) + (coefficient, unit changes) * (Indep. var)

The formula: **y** = $b_{0}$ + $b_{1}$  * $x_{1}$ 

Example:

- Experience (x)
- Salary (y)
- Query: does salary factor in with experience ?
- Salary = (constant) + (coefficient) * Experience    
- plot it, the constant of 0 says that person with 0 experience will have salary of $30k, the coefficient shows that 1 year of experience increases salary by $10k

Find the line of best fit is **Ordinary Least Squares**

- on the plot, the dot above the linear regression line is where person is with salary and experience ($y_{i}$) and the line is where the model says where the person should be in regards to salary ($y^{-}_{i}$) - the model evaluation value
- sum( $y_{i}$ - $y^{-}_{i}$)^2, then find the minimum value 

Fitting Simple Linear Regression to the Training set

- must understand which is the dependent var and which is the indep. variable
- indep. var = experience in years
- dep. var = salary in $

```{r Linear Regression}
# step 1
dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

# step 3, 4
split = sample.split(dataset$Salary, SplitRatio = 2/3)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# no feature scaling is required with this caTools library

#-------- Linear Regression lm() 
# the '~' means "as a function of"
# ~ separates dependent variable from the indep. var
# (dep. var) ~ (indep. var)
# how it will be plotted: (y-axis var) ~ (x-axis var)
regressor = lm(formula = Salary ~ YearsExperience,
               data = training_set)

#---- Predicting the Test set results
# predict salaries based on Years Experience
y_pred = predict(regressor, newdata = test_set)

summary(regressor)
```
                        
the ``summary(regressor)`` output shows, in the Coefficients section, that ``YearsExperience`` has 3 stars, meaning it is highly statistically significant. The lower the p-value, the more significant the variable; when p < 0.05, the indep. var (``YearsExperience``) is statistically significant.

Look at the salary for the first row and compare it with the model's predicted value: 

- Salary_Data : 1.3 YearsExperience, Salary 46205
- Linear Model prediction: Salary 37766
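
To eyeball several rows at once, a small sketch (assuming ``regressor`` and ``test_set`` from the chunk above):

```{r actual-vs-predicted}
# Side-by-side comparison of actual and predicted salaries on the test set
# (assumes `regressor` and `test_set` exist from the chunk above)
comparison = data.frame(YearsExperience = test_set$YearsExperience,
                        Actual = test_set$Salary,
                        Predicted = predict(regressor, newdata = test_set))
head(comparison)
```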



## Visualising the Training set results

```{r}
# shift + Cmd + c   is shortcut for comment line
dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

library(ggplot2)
ggplot() +
  geom_point(aes(x = training_set$YearsExperience, # make sure you select correct dataset!
                 y = training_set$Salary),
             colour = 'red') + 
  geom_line(aes(x = training_set$YearsExperience, 
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Training set)') +
  xlab('Years of experience') +
  ylab('Salary')

# Red dots are the Real data
# blue line is the model prediction
```                    
                        
## Visualising the Test set results
```{r}
dataset = read.csv('./Part 2 - Regression/Section 4 - Simple Linear Regression/R/Salary_Data.csv')

library(ggplot2)
ggplot() +
  geom_point(aes(x = test_set$YearsExperience, 
                 y = test_set$Salary),
             colour = 'red') +
  geom_line(aes(x = training_set$YearsExperience, 
                y = predict(regressor, newdata = training_set)),
            colour = 'blue') +
  ggtitle('Salary vs Experience (Test set)') +
  xlab('Years of experience') +
  ylab('Salary')           

```                     
                        

# Multiple Linear Regression Intuition
The formula is:

- dependent var ($y$), constant ($b_{0}$), coefficients ($b_{1}, b_{2}, \dots$) on the indep. vars ($x_{1}, x_{2}, \dots$)
- **y** = $b_{0}$ + $b_{1} * x_{1}$ + $b_{2} * x_{2}$ + ...

Linear Regression Assumptions:

- linearity
- homoscedasticity
- multivariate normality
- independence of errors
- lack of multicollinearity

Using the dataset ``50_Startups``

- the State column holds categorical data, so we need dummy variables that replace the strings with integer values. 
- Make a new column for each category; in the New York column, every New York row gets a 1. 
- (profit) = constant + (R&D coefficient) * (R&D Spend values) + (Admin coefficient) * (Admin values) + (Marketing coefficient) * (Marketing values) + (dummy coefficient) * (dummy values)
- **WARNING**: always omit one dummy variable from each categorical set ($n$ categories => $n - 1$ dummies) to avoid the dummy variable trap.

## P-value
Is the result statistically significant? 

- $H_{0}$ = Null hypothesis (default)
- $H_{1}$ = Alt hypothesis

Coin flip (probability of getting that many tails in a row): 

|[H or T] | # flips | Probability |
|---------|---------|-------------|
| T       | 1       | 0.50 |
| T       | 2       | 0.25 |
| T       | 3       | 0.12 |
| T       | 4       | 0.06 |
| T       | 5       | 0.03   **p < 0.05 = significant**|
| T       | 6       | 0.01 |
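
The probabilities in the table are just $0.5^k$; a quick sketch to reproduce them (values rounded, ``binom.test`` is base R):

```{r coin-flip-p}
# Probability of k tails in a row with a fair coin is 0.5^k
k = 1:6
data.frame(flips = k, probability = round(0.5^k, 2))

# the same idea as a formal test: two-sided p-value for 6 tails out of 6 flips
binom.test(x = 6, n = 6, p = 0.5)$p.value
```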

               
               
# Multiple Linear Regression Model Methods

- step 0: select what columns to include {garbage in = garbage out}
  and you will need to be able to explain your model, so keep what is essential
- **5 Methods of building models** :

  1. all-in = putting in all variables {don't do unless you need to}
  2. **backward elimination** :
  
    - 2.1. select a significance level ($\alpha$ = 0.05)
    - 2.2. fit the full model with all possible predictors
    - 2.3. consider the predictor with highest p-value, if p > $\alpha$ then go to step 2.4, else: break, model is ready
    - 2.4 remove the predictor
    - 2.5 fit the model without this variable, go to step 2.3
    
  3. **forward selection** :
  
    - 3.1 select a significance level ($\alpha$ = 0.05)
    - 3.2 fit all simple regression models y ~ $x_{n}$, select the one with lowest p-value
    - 3.3 keep this variable and fit all possible models with one extra predictor added to the one(s) you already have
    - 3.4 consider the predictor with lowest p-value, if p < $\alpha$ go to step 3.3, else: break, model is ready (keep previous model)
    
  4. **bidirectional elimination (stepwise regression)**:
  
    - 4.1 select significance level to enter and stay in the model $\alpha$ = 0.05
    - 4.2  do forward selection, new variables must be p < $\alpha$ to enter
    - 4.3 do all steps of backward elimination, old variables must be p < $\alpha$ to stay
    - 4.4 no new variables can enter and no old variables can exit, model is ready
  
  5. **score comparison** :
  
    - 5.1 select a criterion of goodness of fit (Akaike criterion)
    - 5.2 construct all possible regression models 
    - 5.3 select the model with the best criterion value, and your model is ready 
    
    *Example: 10 columns means $2^{10}-1$ = 1023 possible models*
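
Where 1023 comes from: each of the 10 candidate columns is either in or out of the model, minus the empty model. A one-line check:

```{r model-count}
# 10 candidate columns: every subset of columns is one model, minus the empty one
sum(choose(10, 1:10))  # 1023
2^10 - 1               # same count
```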

                        
                        
# **Multiple** Linear Regression                     

```{r Multiple Linear Regression}                        
                   
# Importing the dataset
dataset = read.csv('./Part 2 - Regression/Section 5 - Multiple Linear Regression/R/50_Startups.csv')

# Encoding categorical data
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3)) # encoding 

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)

set.seed(123)

split = sample.split(dataset$Profit, SplitRatio = 0.8)

# ---- Train/Test split
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)                  
                   
#------ lm() works on the original scale, so no feature scaling is needed here
# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)
# ------

#---- Fitting Multiple Linear Regression to the Training set

# (profit) ~ (column1 + column2 + column3 + column4)
# << shortcut >> is '.'
# (profit) ~ (all other columns)
# (profit) ~ .
regressor = lm(formula = Profit ~ .,
               data = training_set)


#---- Predicting the Test set results
y_pred = predict(regressor, 
                 newdata = test_set)                   

# see the summary statistics 
summary(regressor)

# see the predictions
y_pred
```                
                   
The summary statistics include State2 and State3, the dummy variables R created from the factor encoding above. The p-values show the significance level of the independent variables;
the lower the p-value, the more significant the variable. 

For the data above, R&D Spend has a strongly statistically significant (3 stars ***) effect on Profit (the dependent variable). Takeaway: look at companies that spend money on R&D, as it plays the biggest role in profits. 

Real vs predicted profit for row 4: 182901.99 actual vs 173981.09 predicted.
                
                   
## Backward Elimination model
reuse the code above

```{r Backward Elim}

dataset = read.csv('./Part 2 - Regression/Section 5 - Multiple Linear Regression/R/50_Startups.csv')


# Encoding categorical data
dataset$State = factor(dataset$State,
                       levels = c('New York', 'California', 'Florida'),
                       labels = c(1, 2, 3)) # encoding 

library(caTools)
set.seed(123)

split = sample.split(dataset$Profit, SplitRatio = 0.8)

# ---- Train/Test split
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)   


# Goal: remove the non-statistically-significant indep. variables one at a time,
#       iterating through the steps and removing columns (indep. variables)
# column names with spaces or symbols => R replaces them with dots (R&D Spend becomes R.D.Spend)

#  original:
#     Profit ~ R&D.Spend + Administration + Marketing.Spend + State

# step 1  alpha = 0.05
# step 2
regressor = lm(formula = Profit ~ R.D.Spend + Marketing.Spend,
               data = dataset)

# step 3
# find the values with highest p-values, look for the stars under Pr(>|t|)
summary(regressor)

# steps 3 & 4: remove the predictor with the highest p-value, refit, and repeat:
#       State2 p = 0.990 (> 0.05) ==> remove State
#       State3 p = 0.943 (> 0.05) ==> remove State
#       Admin  p = 0.602 (> 0.05) ==> remove Administration

# after removing State and Administration we get the model fitted above;
# its summary shows Marketing.Spend at p = 0.06 with a '.', only weakly significant

# optionally remove Marketing.Spend as well, keeping the one highly significant variable
# step 5: the model is ready
```                   
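
For reference, the intermediate fits those comments walk through, written out model by model; a sketch assuming ``dataset`` is the 50_Startups data loaded above:

```{r backward-elim-steps}
# Backward elimination, one explicit model per step
# (assumes `dataset` = 50_Startups as loaded above)
summary(lm(Profit ~ R.D.Spend + Administration + Marketing.Spend + State,
           data = dataset))  # full model: the State dummies have the highest p-values
summary(lm(Profit ~ R.D.Spend + Administration + Marketing.Spend,
           data = dataset))  # State removed; Administration is now the weakest
summary(lm(Profit ~ R.D.Spend + Marketing.Spend,
           data = dataset))  # Administration removed: the model fitted above
```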
                   
## Automate Backward Elimination in R
```{r Backward Elim Auto}
# automatic backward elimination
# x = data frame with Profit as the last column, sl = significance level
backwardElimination <- function(x, sl) {
    numVars = length(x)
    for (i in c(1:numVars)){
      # refit on the columns still left in x
      regressor = lm(formula = Profit ~ ., data = x)
      # highest p-value among the predictors (row 1 of coef() is the intercept)
      maxVar = max(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"])
      if (maxVar > sl){
        # drop the predictor column with that p-value
        j = which(coef(summary(regressor))[c(2:numVars), "Pr(>|t|)"] == maxVar)
        x = x[, -j]
      }
      numVars = numVars - 1
    }
    return(summary(regressor))
  }

SL = 0.05  # significance level alpha
dataset = dataset[, c(1,2,3,4,5)]  # all 5 columns of 50_Startups
backwardElimination(training_set, SL)
# note: the index arithmetic assumes one coefficient row per column, so it is only
# reliable once the State factor (two dummy rows, one column) has been removed
```                 
                   
                   
# Polynomial Regression                    
The formula is **y** = $b_{0}$ + $b_{1}x_{1}$ + $b_{2}x^{2}_{1}$ + $\dots$ + $b_{n}x^{n}_{1}$.


Used in healthcare/epidemiology data    

Position_Salaries dataset
                   
```{r}                      
# Importing the dataset
dataset = read.csv('./Part 2 - Regression/Section 6 - Polynomial Regression/R/Position_Salaries.csv')

# use the columns level and salary
dataset = dataset[2:3]                    

# no data splitting due to small dataset
# no feature scaling needed

# for a baseline comparison, use Simple Linear Regression
lin_reg = lm(formula = Salary ~ .,
             data = dataset)

summary(lin_reg)


# Fitting Polynomial Regression to the dataset
# polynomial features of indep variables (to any degree you want)
# 1 indep + dep. vars
# add column using $ and name it

#   dataset$column^2 returns squared column for all level column values
dataset$Level2 = dataset$Level^2
#   dataset$column^3 returns cubed column for all level column values
dataset$Level3 = dataset$Level^3
dataset$Level4 = dataset$Level^4

poly_reg = lm(formula = Salary ~ .,
              data = dataset)

summary(poly_reg)



#---------------- Visualizing the Linear Regression results
# install.packages('ggplot2')
library(ggplot2)

ggplot() + # x= indep  y= dep var
  geom_point(aes(x = dataset$Level, y = dataset$Salary), # real points
             colour = 'red') +       # predict function
  geom_line(aes(x = dataset$Level, y = predict(lin_reg, newdata = dataset)), # predicted
            colour = 'blue') +
  ggtitle('Truth or Bluff (Linear Regression)') +
  xlab('Level') +
  ylab('Salary')

# the linear regression line does not fit the real data points, 
# this is clearly a polynomial problem
# predicted salaries are linear but real data points are polynomial
# real salary level 5= $125,000 vs predicted= $240,000


#-------------- Visualizing the Polynomial Regression results
# 
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary), # real
             colour = 'red') +      # predict function, change to poly_reg
  geom_line(aes(x = dataset$Level, y = predict(poly_reg, newdata = dataset)), # predicted
            colour = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')

# the predicted line fits better with the data points, curved line



#------------ Visualizing the Regression Model results 
# (for higher resolution and smoother curve)
# 
library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)

ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(poly_reg,
                                        newdata = data.frame(Level = x_grid,
                                                             Level2 = x_grid^2,
                                                             Level3 = x_grid^3,
                                                             Level4 = x_grid^4))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Polynomial Regression)') +
  xlab('Level') +
  ylab('Salary')

# Predicting a new result with Linear Regression
# make a prediction on based on level 6.5 
# make a new dataframe row
# y_pred.2 = predict(lin_reg, data.frame(Level = 6.5))

predict(lin_reg, data.frame(Level = 6.5))

# Predicting a new result with Polynomial Regression
# add polynomial features for each level column
y_pred.3 = predict(poly_reg, data.frame(Level = 6.5,
                             Level2 = 6.5^2,
                             Level3 = 6.5^3,
                             Level4 = 6.5^4))

y_pred.3
```
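
As an aside, base R's ``poly()`` builds the power columns for you; a minimal sketch that should match the hand-made version above (``dataset2`` and ``poly_reg2`` are names chosen here to avoid clobbering the variables above):

```{r poly-alternative}
# Same 4th-degree fit without creating Level2..Level4 by hand
# (raw = TRUE uses the plain powers Level, Level^2, ..., Level^4)
dataset2 = read.csv('./Part 2 - Regression/Section 6 - Polynomial Regression/R/Position_Salaries.csv')[2:3]
poly_reg2 = lm(Salary ~ poly(Level, 4, raw = TRUE), data = dataset2)
predict(poly_reg2, data.frame(Level = 6.5))  # same prediction as y_pred.3
```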



# Support Vector Regression

```{r Support Vector Machine}
# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 7 - Support Vector Regression (SVR)/R/Position_Salaries.csv')
dataset = dataset[2:3]

# Fitting Support Vector Regression to the dataset
# install.packages('e1071')
library(e1071)

regressor = svm(formula = Salary ~ .,
                data = dataset,
                type = 'eps-regression', # VERY IMPORTANT: 'eps-regression' selects SVR; see ?svm for the other types
                kernel = 'radial')


# Predicting a new result, this is shown in Data environment right panel
y_pred = predict(regressor, data.frame(Level = 6.5))

#----------- Visualizing the SVR results
# install.packages('ggplot2')
library(ggplot2)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') + # real data points
  geom_line(aes(x = dataset$Level, y = predict(regressor, newdata = dataset)),
            colour = 'blue') + # predicted points
  ggtitle('Truth or Bluff (SVR 1)') +
  xlab('Level') +
  ylab('Salary')

#----------- Visualizing the SVR results 
# (for higher resolution and smoother curve)
# install.packages('ggplot2')
library(ggplot2)
x_grid = seq(min(dataset$Level), max(dataset$Level), 0.1)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (SVR 2)') +
  xlab('Level') +
  ylab('Salary')

```




# Decision Tree Regression
Decision Trees come in 2 types, together known as CART (Classification And Regression Trees): **Classification Trees** and **Regression Trees** (the more complex of the two).

Here: predicting a 3rd variable (y) using x1 and x2

```{r Decision Tree Regression}
# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 8 - Decision Tree Regression/R/Position_Salaries.csv')
dataset = dataset[2:3]


# Fitting Decision Tree Regression to the dataset
# install.packages('rpart')
library(rpart)

# RPART = Recursive Partitioning 
regressor = rpart(formula = Salary ~ .,
                  data = dataset,
                  control = rpart.control(minsplit = 1)) # allow splits down to single observations


# Predicting a new result with Decision Tree Regression
y_pred = predict(regressor, data.frame(Level = 6.5))

#--------------- Visualizing the Decision Tree Regression 
# results (higher resolution)
# install.packages('ggplot2')

library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)

ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Decision Tree Regression)') +
  xlab('Level') +
  ylab('Salary')



```

the blue line shows that the predicted salary (the average for the interval containing level 6.5) is $250,000

```{r}
# Plotting the tree
plot(regressor)
text(regressor)
```


# Random Forest Regression 
Ensemble Learning = combining multiple algorithms (here, many decision trees) into one more powerful algorithm

  - step 1: pick at random k data points from training set
  - step 2: build decision tree associated to these k data points
  - step 3: choose the number *N*tree of trees you want to build and repeat steps 1 & 2
  - step 4: for a new data point, make each one of your *N*tree trees predict the value of Y for the data point in question, and assign the new data point the average across all of the predicted Y values

```{r Random Forest Regression}
# Importing the dataset
dataset = read.csv('Part 2 - Regression/Section 9 - Random Forest Regression/R/Position_Salaries.csv')
dataset = dataset[2:3]

#----- Fitting Random Forest Regression to the dataset
# install.packages('randomForest')
library(randomForest)

set.seed(1234) # common random seed in R

# -- build Random Forest Model
regressor = randomForest(x = dataset[-2],
                         y = dataset$Salary,
                         ntree = 500) # change values to find best value, 500

# Predicting a new result
y_pred = predict(regressor, data.frame(Level = 6.5))
# y_pred = 160908 for ntree= 500

#----------- Visualizing the Random Forest Regression Model 
# results (for higher resolution and smoother curve)
# install.packages('ggplot2')
library(ggplot2)

x_grid = seq(min(dataset$Level), max(dataset$Level), 0.01)
ggplot() +
  geom_point(aes(x = dataset$Level, y = dataset$Salary),
             colour = 'red') +
  geom_line(aes(x = x_grid, y = predict(regressor, newdata = data.frame(Level = x_grid))),
            colour = 'blue') +
  ggtitle('Truth or Bluff (Random Forest Regression)') +
  xlab('Level') +
  ylab('Salary')



```
The blue line is the predicted average salary; with 500 trees, the model predicts a salary of $160,908 for level 6.5.



# R Squared

- **sum of squared residuals**: $SS_{res}$ = sum( ($y_{i}$ - $\hat{y}_{i}$)$^2$ )
- **total sum of squares** (distance from the average line on the plot): $SS_{tot}$ = sum( ($y_{i}$ - $y_{avg}$)$^2$ )
- $R^{2}$ = 1 - $SS_{res}$ $\div$ $SS_{tot}$ 

Fitting the regression line minimizes the sum of squared residuals; $R^2$ then asks *how good is your line compared to the average line?* The closer $R^2$ is to 1, the better! 


## **adjusted R squared** is used for multiple regression. 

- p = number of regressors
- n= sample size
- adj. $R^2$ = 1 - (1 - $R^2$) (n - 1)/ (n - p - 1)


Adjusted $R^2$ also measures how well the model fits (closer to 1 is better), but plain $R^2$ is biased for model selection: under OLS it never decreases, so adding more variables always makes it grow. Adjusted $R^2$ penalizes extra variables, so it can decrease when a variable adds little. Dropping unhelpful columns moves the adjusted $R^2$ closer to 1, which is exactly what we want. 
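
A minimal sketch computing both measures by hand, assuming ``dataset`` is the 50_Startups data; the results should match what ``summary()`` reports:

```{r r-squared-by-hand}
# R^2 and adjusted R^2 from the formulas above
fit    = lm(Profit ~ R.D.Spend + Marketing.Spend, data = dataset)
y      = dataset$Profit
ss_res = sum((y - fitted(fit))^2)   # sum of squared residuals
ss_tot = sum((y - mean(y))^2)       # total sum of squares
r2     = 1 - ss_res / ss_tot
n = nrow(dataset); p = 2            # p = number of regressors
adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
c(r2 = r2, adj_r2 = adj_r2)         # compare with summary(fit)
```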

So for the 3rd and 4th models above, the adjusted $R^2$ values:

- ``lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)``: adj. $R^2$ = 0.9483
- ``lm(formula = Profit ~ R.D.Spend, data = dataset)``: adj. $R^2$ = 0.9454

Comparing the two: ``0.9454 - 0.9483 = -0.0029``, so the last model did worse than the 3rd model.



## coefficients 
Understanding the coefficients
```
lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = dataset)

Residuals:
   Min     1Q Median     3Q    Max 
-33645  -4632   -414   6484  17097 

Coefficients:
                 Estimate Std. Error t value Pr(>|t|)    
(Intercept)     4.698e+04  2.690e+03  17.464   <2e-16 ***
R.D.Spend       7.966e-01  4.135e-02  19.266   <2e-16 ***
Marketing.Spend 2.991e-02  1.552e-02   1.927     0.06 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 9161 on 47 degrees of freedom
Multiple R-squared:  0.9505,	Adjusted R-squared:  0.9483 
F-statistic: 450.8 on 2 and 47 DF,  p-value: < 2.2e-16
```

A positive coefficient means a positive correlation: an increase in R.D.Spend means an increase in Profit, and vice versa.


Magnitude is the Estimate column, expressed in the units of the variables: R.D.Spend is ``7.966e-01``. It is correct to say that R&D Spend has a greater impact on Profit per unit spent than Marketing Spend. *For every $1 increase in R&D Spend, profit increases by about 80 cents (0.7966)*; every $1 of Marketing Spend adds about 3 cents (0.0299) of profit.
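
Scaling those estimates to more readable amounts (numbers taken from the summary output above):

```{r coefficient-units}
# Effect of an extra $1,000 of spend on Profit, from the Estimate column
0.7966 * 1000   # R&D Spend:       about $797 more profit
0.02991 * 1000  # Marketing Spend: about $30 more profit
```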


End of Part 2 -- Regression
<hr>

# **Classification**

## Logistic Regression
Example: correlating age with whether the user took action on an email (Y/N, 1/0). Linear regression is not the right model here, although it does show a trend between the variables. The sigmoid function forces the output between 0 and 1, giving the S-shaped curve on the plot that makes the best fitting line for these variables. **Used for predicting probability ($\hat{p}$) (p_hat)**

Logistic Regression formula: $\ln(p \div (1 - p))$ = $b_{0}$ + $b_{1} x_{1}$ 

Example: take 4 age values on the x-axis, with p_hat on the y-axis along the S-curve. A person of age 20 has a probability $\hat{p}$ = 0.007 (0.7%) of taking action on an email; a person of age 40 has $\hat{p}$ = 0.85 (85%). Any predicted probability below 50% gets its $\hat{y}$ projected down to 0, and any value above 50% is projected up to 1, so the result is a binary 0/1 outcome. 
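
A minimal sketch of the sigmoid itself, in plain base R (independent of the course code):

```{r sigmoid-curve}
# The sigmoid squashes b0 + b1*x into a probability between 0 and 1
sigmoid = function(z) 1 / (1 + exp(-z))
curve(sigmoid, from = -6, to = 6,
      xlab = 'b0 + b1*x', ylab = 'p_hat', main = 'Sigmoid function')
abline(h = 0.5, lty = 2)  # the 0.5 cutoff that projects p_hat to 0 or 1
```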

```{r Logistic Regression}

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 14 - Logistic Regression/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling is best practice for Logistic Regression
#  [-3] excludes the 3rd column (Purchased), so only Age and EstimatedSalary are scaled
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting Logistic Regression to the Training set
# glm (generalized linear model) with family = binomial builds the logistic regression
# predict the dependent variable Purchased from the indep. variables: age, estimated salary
classifier = glm(formula = Purchased ~ ., 
                 family = binomial,
                 data = training_set)

# Predicting the Test set results
prob_pred = predict(classifier, 
                    type = 'response', 
                    newdata = test_set[-3])

y_pred.logr = ifelse(prob_pred > 0.5, 1, 0)

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.logr)
cm

prob_pred[1]
y_pred.logr[1]
```
The 1st predicted probability = 0.162, while the test set's 1st actual value is 0; this means user #2 is unlikely to purchase. Via the y_pred.logr variable, the model predicted that user #2 will not purchase an item.


The model predicted correctly (57 + 26) 83 times and made 17 incorrect predictions.


```{r}
# Visualizing the Training set results
library(ElemStatLearn)

set = training_set

# the ranges of the observations, padded by 1 so points aren't squeezed at the plot edges
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01) # age
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01) # salary

# make grid matrix using the variables above
grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

prob_set = predict(classifier, 
                   type = 'response', 
                   newdata = grid_set)

y_grid = ifelse(prob_set > 0.5, 1, 0)



#============== Visualizing the Training set
plot(set[, -3],
     main = 'Logistic Regression Classifier (Training set)',
     xlab = 'Age', 
     ylab = 'Estimated Salary',
     xlim = range(X1), 
     ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#============== Visualizing the Test set results
library(ElemStatLearn)

set = test_set

X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

prob_set = predict(classifier, type = 'response', newdata = grid_set)

y_grid = ifelse(prob_set > 0.5, 1, 0)

plot(set[, -3],
     main = 'Logistic Regression Classifier (Test set)',
     xlab = 'Age', 
     ylab = 'Estimated Salary',
     xlim = range(X1), 
     ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))



```

Training set data points: red points are training observations where the dependent variable Purchased = 0, and green points are observations where Purchased = 1. The red zone is the predicted non-purchase region, the green zone the predicted purchase region. The classifier predicts that the higher the age and estimated salary, the more likely the user is to purchase the item. For a linear model the decision boundary is a straight line. Focus on each dot's colour and the zone it falls in. 



## K-Nearest Neighbor 

KNN process:

1. choose the number k of neighbors
2. take the K nearest neighbors of the new data point, according to Euclidean distance (see the distance sketch after this list)
3. among these KNNs, count the number of data points in each category
4. assign the new data point to the category where you counted the most neighbors
5. model is done
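
The distance in step 2 is plain Euclidean distance; a tiny sketch:

```{r knn-distance}
# Euclidean distance between two feature vectors (after feature scaling!)
euclid = function(a, b) sqrt(sum((a - b)^2))
euclid(c(1, 2), c(4, 6))  # 5: the 3-4-5 triangle
```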

```{r KNN Classifier}

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 15 - K-Nearest Neighbors (K-NN)/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# build a KNN classifier
library(class)
# fit a KNN to Training set and Predict the Test set
# remove last column of training set
y_pred.KNN = knn(train= training_set[,-3],
                 test= test_set[, -3],
                 cl= training_set[, 3],
                 k= 5) 

# y_pred.KNN[1:5]


# Predicting the Test set results
# for KNN comment out 
# y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.KNN)
cm

#============ Visualizing the Training set results
library(ElemStatLearn)
set = training_set

X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

# y_grid = predict(classifier, newdata = grid_set)

# for KNN replace the predict and its arguments with KNN arguments
y_grid = knn(train= training_set[,-3],
                 test= grid_set, # replace test_set
                 cl= training_set[, 3],
                 k= 5)

plot(set[, -3],
     main = 'KNN Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')

# for KNN replace the predict and its arguments with KNN arguments
y_grid = knn(train= training_set[,-3],
                 test= grid_set, # replace test_set
                 cl= training_set[, 3],
                 k= 5)
# y_grid = predict(classifier, newdata = grid_set)

plot(set[, -3], main = 'KNN Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))



```


## Support Vector Machines
Originating in work from the 1960s and popularized in the 1990s. SVMs separate data points on a plot and classify them. Goal: find the best decision boundary, a maximum-margin hyperplane: the boundary whose margin (the distance between the line and the closest point on each side) is as large as possible. The closest points, which sit on the margin lines, are the support vectors; the two margin lines are the positive and negative hyperplanes. 

Example: classifying apples vs oranges. The support vectors are the boundary cases of each fruit (the apple that looks most like an orange, and vice versa), not the most typical examples.

```{r SVM Classification}

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 16 - Support Vector Machine (SVM)/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)

set.seed(123)

split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# library e1071 for SVM
library(e1071)

# read the documentation
classifier.SVM = svm(formula= Purchased ~ .,
                     data= training_set,
                     type= 'C-classification', # classification
                     kernel= 'linear'
                     )


# Predicting the Test set results
y_pred.SVM = predict(classifier.SVM, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred.SVM)
cm



#============= Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3],
     main = 'SVM Classifier (Training set)',
     xlab = 'Age', 
     ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


#============ Visualizing the Test set results

library(ElemStatLearn)

set = test_set

X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)

grid_set = expand.grid(X1, X2)

colnames(grid_set) = c('Age', 'EstimatedSalary')

y_grid = predict(classifier.SVM, 
                 newdata = grid_set)
plot(set[, -3], main = 'SVM Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))

contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))


```             
              
Mapping data points into a higher-dimensional space can make them separable by a hyperplane, but computing that mapping explicitly is very compute-intensive and not practical; the kernel trick achieves the same separation implicitly, without ever building the higher-dimensional space.
             
            
##  Kernel SVM
Builds decision boundaries for data that is not linearly separable, e.g. points clustered in circles, using the Gaussian RBF kernel: $K(\vec{x}, \vec{l})$ = $e^{-\|\vec{x} - \vec{l}\|^2 / (2\sigma^2)}$, where $\vec{l}$ is a landmark point and $\sigma$ controls the kernel's width.
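
A sketch of that kernel as a plain R function (the sigma and the sample points are made-up values):

```{r rbf-kernel}
# Gaussian RBF kernel between a point x and a landmark l
rbf = function(x, l, sigma = 1) exp(-sum((x - l)^2) / (2 * sigma^2))
rbf(c(0, 0), c(0, 0))  # identical points -> 1
rbf(c(0, 0), c(1, 1))  # squared distance 2 -> exp(-1), about 0.37
rbf(c(0, 0), c(5, 5))  # far away -> essentially 0
```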

```{r Kernel SVM}
              
 # Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 17 - Kernel SVM/R/Social_Network_Ads.csv')
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])


# Fitting classifier to the Training set
# Create your classifier here

#=========== Kernel SVM
library(e1071)

# Gaussian classifier, radial
classifier.SVM = svm(formula= Purchased ~ .,
                 data = training_set,
                 type = "C-classification",
                 kernel= 'radial')


# Predicting the Test set results
y_pred = predict(classifier.SVM, newdata = test_set[-3])
y_pred[1:5]

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)
cm


#============== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3],
     main = 'Kernel SVM Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#================== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.SVM, newdata = grid_set)
plot(set[, -3], main = 'Kernel SVM Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

```

The confusion matrix shows 10 incorrect results and 90 correct results.

The kernel SVM effectively mapped the data to a higher-dimensional space: the green zone is users predicted to purchase the item, the red zone users predicted not to purchase.
              
              
## Naive Bayes Classification
Bayes' Theorem of probability: $P(A|B)$ = $P(B|A) * P(A) \div P(B)$ 
           
Example: machine 1 makes 30 items/hr and machine 2 makes 20 items/hr (50 items/hr in total). Of all items made, 1% are defective, and of all defective items, 50% came from machine 1 and 50% from machine 2. *What is the probability that an item made by machine 2 is defective?*         

- P(Machine.2) = 20/50 = 0.4
- P(Defect) = 1%
- P(Machine.2 | Defect) = 50%
- P(Defect | Machine.2) = ?

P(Defect | Machine.2) = P(Machine.2 | Defect) * P(Defect) / P(Machine.2)
= 0.5 * 0.01 / 0.4 = 0.0125 (1.25%)            

Example 2, Naive Bayes (assumes independence of the variables):
x = age, y = salary; data points grouped into two classes, walks to work or drives to work. X = the features of a new data point.

- P(Walks | X) = P(X | Walks) * P(Walks) / P(X)
- P(Drives | X) = P(X | Drives) * P(Drives) / P(X)

repeat steps for both Walks and Drives class:

1. prior probability: P(Walks)
2. marginal likelihood: P(X)
3. likelihood: P(X | Walks)
4. posterior probability: P(Walks | X)
              
              
step 1: count the points in the Walks group on the plot.

- P(Walks) = number of walkers / total data points = 10/30
  
step 2: pick a radius around the new point; the circle contains data points whose features (age and salary) are similar. P(X) is the probability that a point falls inside this radius. Count the points inside the circle.

- P(X) = observations inside the circle / total observations = 4/30
  
step 3: using the same circle, ask how likely these features are for someone who walks (ignore the drivers). Count the walkers inside the circle. 

- P(X | Walks) = walkers inside the circle / total walkers = 3/10
  
step 4: P(Walks | X) = (3/10) * (10/30) / (4/30) = 0.75    (75% likelihood the new point is a Walker)       
              
Repeat for drivers:           

step 4: P(Drives | X) = (1/20) * (20/30) / (4/30) = 0.25  (25%)             
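
The same arithmetic in R, just to check the numbers:

```{r bayes-check}
# Walkers example from above
likelihood = 3 / 10   # P(X | Walks)
prior      = 10 / 30  # P(Walks)
marginal   = 4 / 30   # P(X)
likelihood * prior / marginal  # P(Walks | X) = 0.75
```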
              
```{r Bayes Theorem}            
              
# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 18 - Naive Bayes/R/Social_Network_Ads.csv')
dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting classifier to the Training set
# Create your classifier here

# === Bayes Classifier
library(e1071)

# press F1 for documentation when mouse is on function name
classifier.Bayes = naiveBayes(x= training_set[-3],
                              y= training_set$Purchased)



# Predicting the Test set results
y_pred = predict(classifier.Bayes, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

#============== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.Bayes, newdata = grid_set)
plot(set[, -3],
     main = 'Naive Bayes Classifier (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#========== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier.Bayes, newdata = grid_set)
plot(set[, -3], main = 'Naive Bayes Classifier (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))            
              
              
```         
              
              
              
## Decision Tree Classification              
```{r Decision Tree}          

# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 19 - Decision Tree Classification/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

#=========== Fitting Decision Tree Classification to the Training set
# install.packages('rpart')
library(rpart)
classifier = rpart(formula = Purchased ~ .,
                   data = training_set)

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3], type = 'class')

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

# ========== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3],
     main = 'Decision Tree Classification (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#============ Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, newdata = grid_set, type = 'class')
plot(set[, -3], main = 'Decision Tree Classification (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

# Plotting the tree
plot(classifier)
text(classifier)

```
              
              
The decision tree's first split is Age < 44.5. For those users, if EstimatedSalary < $90,000 the model classifies the person as not buying the item; if the salary is >= $90,000, as buying it. *This tree was produced by skipping the feature-scaling and visualization code and running only the classifier fit and the plotting functions, so the split values stay in the original units.*          
              
## Random Forest Classification

Ensemble Learning = combining multiple algorithms (here, many decision trees) into one more powerful algorithm

step 1: pick at random k data points from the training set

step 2: build a decision tree associated to these k data points

step 3: choose the number Ntree of trees you want to build and repeat steps 1 & 2

step 4: for a new data point, make each one of your Ntree trees predict the category of the data point in question, and assign the new data point to the category that wins the majority vote across the Ntree predictions
              
This model family is used for controller-free gaming consoles (Microsoft Kinect)              
              
```{r Random Forest}              
# Importing the dataset
dataset = read.csv('./Part 3 - Classification/Section 20 - Random Forest Classification/R/Social_Network_Ads.csv')

dataset = dataset[3:5]

# Encoding the target feature as factor
dataset$Purchased = factor(dataset$Purchased, levels = c(0, 1))

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-3] = scale(training_set[-3])
test_set[-3] = scale(test_set[-3])

# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)
set.seed(123)
classifier = randomForest(x = training_set[-3],
                          y = training_set$Purchased,
                          ntree = 500)

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)



#========== Visualizing the Training set results
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, grid_set)

plot(set[, -3],
     main = 'Random Forest Classification (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

#======== Visualizing the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = predict(classifier, grid_set)
plot(set[, -3], main = 'Random Forest Classification (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))



# Choosing the number of trees
plot(classifier)

```
              
              
              
# Classification Model Evaluations            

## False Positives & False Negatives
A False Positive (Type I error) is a false alarm: the model predicts an event that does not happen. A False Negative (Type II error) is the more dangerous case: the model predicts nothing will happen, and the disaster happens anyway.
              
## Confusion Matrix

|           | pred. 0 | pred. 1 |
|-----------|---------|---------|
| actual 0  |   56    |    8    |
| actual 1  |    7    |   29    |

Accuracy Rate = correct / total = (56 + 29) / 100 = 85%

Error Rate = wrong / total = (8 + 7) / 100 = 15%              
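
The same rates computed from the matrix in R:

```{r cm-rates}
# Rebuild the confusion matrix above and compute both rates
cm = matrix(c(56, 7, 8, 29), nrow = 2,
            dimnames = list(actual = c('0', '1'), predicted = c('0', '1')))
accuracy = sum(diag(cm)) / sum(cm)  # (56 + 29) / 100 = 0.85
error    = 1 - accuracy             # 0.15
c(accuracy = accuracy, error = error)
```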
              
## Cumulative Accuracy Profile (CAP)
The CAP, also called the Gain Chart, is the curve rising above the random-model diagonal on the plot.          
CAP analysis: compare the area between the model's curve and the random line with the area between the perfect model's line and the random line; the closer their ratio is to 1, the better the model. 
              
ROC, the Receiver Operating Characteristic, is a different curve (not the same as the CAP)             
              
End of Part 3 - Classification

<hr>

# **Clustering**
In Clustering you don’t know in advance what you are looking for; you are trying to identify segments or clusters in your data. When you run clustering algorithms on a dataset, unexpected things can pop up: structures, clusters and groupings you would never have thought of otherwise

              
## K-Means Clustering
This finds the clusters for you.

Process:

- step 1. choose the number of K of clusters
- step 2. select at random k points, the centroids
- step 3. assign each data point to the closest centroid => forms clusters
- step 4. compute and place the new centroid of each cluster
- step 5. reassign each data point to the new closest centroid; if any reassignment took place, go back to step 4, otherwise the model is ready


Select K using the "elbow method": plot WCSS (the within-cluster sum of squares) against the number of clusters and pick the K at the bend ("elbow") of the curve; for the mall dataset below, the elbow is at K = 5.

```{r K Means}

# Importing the dataset
dataset = read.csv('./Part 4 - Clustering/Section 24 - K-Means Clustering/R/Mall_Customers.csv')
dataset = dataset[4:5]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Using the elbow method to find the optimal number of clusters
set.seed(6)
wcss = vector()
for (i in 1:10) wcss[i] = sum(kmeans(dataset, i)$withinss)
plot(1:10,
     wcss,
     type = 'b',
     main = paste('The Elbow Method'),
     xlab = 'Number of clusters',
     ylab = 'WCSS')

# Fitting K-Means to the dataset
set.seed(29)
kmeans = kmeans(x = dataset, centers = 5) # centers = the number of clusters K (from the elbow plot)
y_kmeans = kmeans$cluster

# Visualising the clusters
library(cluster)
clusplot(dataset,   
         y_kmeans,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels = 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of customers'),
         xlab = 'Annual Income',
         ylab = 'Spending Score')

```
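
A few useful pieces of the fitted object (assuming the ``kmeans`` object and ``y_kmeans`` from the chunk above):

```{r kmeans-inspect}
# Inspecting the k-means fit
kmeans$centers       # the 5 cluster centroids
kmeans$tot.withinss  # total WCSS for K = 5
table(y_kmeans)      # how many customers fall in each cluster
```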


## Hierarchical Clustering 
Agglomerative (bottom-up) and Divisive (top-down)

Dendrograms (section 27: video 178, 179)

```{r Hierarchical Clustering}

# Importing the dataset
dataset = read.csv('./Part 4 - Clustering/Section 25 - Hierarchical Clustering/R/Mall_Customers.csv')
dataset = dataset[4:5]

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
# library(caTools)
# set.seed(123)
# split = sample.split(dataset$DependentVariable, SplitRatio = 0.8)
# training_set = subset(dataset, split == TRUE)
# test_set = subset(dataset, split == FALSE)

# Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Using the dendrogram to find the optimal number of clusters
dendrogram = hclust(d = dist(dataset, method = 'euclidean'), method = 'ward.D')
plot(dendrogram,
     main = paste('Dendrogram'),
     xlab = 'Customers',
     ylab = 'Euclidean distances')

# Fitting Hierarchical Clustering to the dataset
hc = hclust(d = dist(dataset, method = 'euclidean'), method = 'ward.D')
y_hc = cutree(hc, 5)

# Visualising the clusters
library(cluster)
clusplot(dataset,
         y_hc,
         lines = 0,
         shade = TRUE,
         color = TRUE,
         labels= 2,
         plotchar = FALSE,
         span = TRUE,
         main = paste('Clusters of customers'),
         xlab = 'Annual Income',
         ylab = 'Spending Score')


```
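A side note beyond the course material: `hclust` has two Ward variants, and their input expectations differ.

```{r ward.D2 variant}
# 'ward.D2' squares the dissimilarities internally, implementing Ward's
# original criterion for plain Euclidean distances; 'ward.D' expects
# distances that are already squared
hc2 = hclust(d = dist(dataset, method = 'euclidean'), method = 'ward.D2')
plot(hc2, main = 'Dendrogram (ward.D2)', xlab = 'Customers', ylab = 'Euclidean distances')
```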

# **Association Rule Learning**
"people who bought X also bought Y"

Apriori algorithm: three measures, shown for movie recommendation (user watchlists) and market basket optimization (transactions).
```{}
movie recommendation:
  support(M) = (# user watchlists containing M) / (# user watchlists)
  confidence(M1 -> M2) = (# user watchlists containing M1 and M2) / (# user watchlists containing M1)
  lift(M1 -> M2) = confidence(M1 -> M2) / support(M2)

market basket optimization:
  support(J) = (# transactions containing J) / (# transactions)
  confidence(J1 -> J2) = (# transactions containing J1 and J2) / (# transactions containing J1)
  lift(J1 -> J2) = confidence(J1 -> J2) / support(J2)

lift answers: how much more likely is J2 given J1 than J2 picked at random?
a lift > 1 means the rule improves on a random recommendation.
```
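A toy check of the three measures with made-up numbers (not from the course): 100 watchlists, 40 contain M1, 10 contain M2, 8 contain both.

```{r lift toy example}
# made-up counts purely to illustrate support, confidence and lift
n_watchlists = 100
n_M1         = 40 # watchlists containing M1
n_M2         = 10 # watchlists containing M2
n_both       = 8  # watchlists containing both M1 and M2

support_M2    = n_M2 / n_watchlists        # 0.10
confidence_12 = n_both / n_M1              # 0.20
lift_12       = confidence_12 / support_M2 # 2.0: M1 viewers are twice as likely to watch M2
```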

Process:

1. set a minimum support and a minimum confidence
2. take all the subsets in transactions having higher support than the minimum support
3. take all the rules of these subsets having higher confidence than the minimum confidence
4. sort the rules by decreasing lift; the rules with the highest lift are the strongest

```{r Association Rule}
# Apriori

# Data Preprocessing
# install.packages('arules')
library(arules)

# read once as a plain data.frame just to eyeball the raw data
dataset = read.csv('./Part 5 - Association Rule Learning/Section 28 - Apriori/R/Market_Basket_Optimisation.csv', header = FALSE)

# arules needs a sparse matrix of transactions, so re-read the file
dataset = read.transactions('./Part 5 - Association Rule Learning/Section 28 - Apriori/R/Market_Basket_Optimisation.csv', sep = ',', rm.duplicates = TRUE)

summary(dataset)
itemFrequencyPlot(dataset, topN = 30)

# Training Apriori on the dataset

# support for items bought at least 3 times a day over the week:
#   3 * 7 / 7500 transactions = 0.0028 => round to 0.003
# support for items bought at least 4 times a day over the week:
#   4 * 7 / 7500 transactions = 0.0037 => round to 0.004
# the confidence value is an arbitrary choice
rules = apriori(data = dataset, 
                parameter = list(support = 0.004,  
                                 confidence = 0.2)) # use small values for more rules

# Visualizing the results
# get the highest rules by lift
inspect(sort(rules, by = 'lift')[1:10])

```

The top rules show, for example, that people who bought {light cream} also bought {chicken} in 29% of cases (confidence = 0.29), with a lift of 4.84.
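A hedged follow-up, not course code: `arules` can filter a rules object by its quality measures before inspecting, e.g. keep only rules with lift above 3 and rank those by confidence.

```{r filter rules by lift}
# subset() on a rules object accepts expressions over quality measures
strong_rules = subset(rules, subset = lift > 3)
inspect(sort(strong_rules, by = 'confidence')[1:5])
```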


## Eclat
Eclat is similar to Apriori but uses only the support measure, computed on itemsets; there is no confidence or lift. The result is simply the sets of items most frequently purchased together, ranked by support.
```{r Eclat}

# Data Preprocessing
# install.packages('arules')
library(arules)
dataset = read.csv('./Part 5 - Association Rule Learning/Section 29 - Eclat/R/Market_Basket_Optimisation.csv', header = FALSE) # quick look at the raw data; no header row in this file

# sparse matrix
dataset = read.transactions('./Part 5 - Association Rule Learning/Section 29 - Eclat/R/Market_Basket_Optimisation.csv', sep = ',', rm.duplicates = TRUE)

summary(dataset)

itemFrequencyPlot(dataset, topN = 10)

# Training Eclat on the dataset (the output is frequent itemsets, not rules)
itemsets = eclat(data = dataset, parameter = list(support = 0.003,
                                                  minlen = 2)) # minlen = minimum number of items per set

# Visualising the results
inspect(sort(itemsets, by = 'support')[1:10])


```
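An optional follow-up beyond the course: `arules::ruleInduction` can turn the frequent itemsets Eclat found back into rules, restoring a confidence measure.

```{r rules from itemsets}
# induce rules from the eclat itemsets against the original transactions
induced = ruleInduction(itemsets, dataset, confidence = 0.2)
inspect(sort(induced, by = 'lift')[1:5])
```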
End of section
<hr>
      

# **Principal Component Analysis (PCA)**
Unsupervised algorithm used for noise filtering, visualization, feature extraction, time-series prediction, and gene data analysis.

Goal: identify patterns in the data and capture the correlations between variables while reducing the number of dimensions. From the m independent variables, PCA extracts p <= m new, uncorrelated variables (the principal components) that explain the most variance; it ignores the dependent variable, which is what makes it unsupervised.
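For reference, the standard PCA algebra (my notation, not the course's): center the feature matrix X, form the covariance matrix \(C = \frac{1}{n-1} X^{\top} X\), and take its top p eigenvectors as the columns of \(W_{p}\); the reduced data are \(Z = X W_{p}\), and each eigenvalue divided by the sum of all eigenvalues is the fraction of variance that component explains.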

```{r PCA}
# Importing the dataset
dataset = read.csv('./Part 9 - Dimensionality Reduction/Section 43 - Principal Component Analysis (PCA)/R/Wine.csv')

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(dataset$Customer_Segment, SplitRatio = 0.8)
training_set = subset(dataset, split == TRUE)
test_set = subset(dataset, split == FALSE)

# Feature Scaling
training_set[-14] = scale(training_set[-14])
test_set[-14] = scale(test_set[-14])

# Applying PCA
# install.packages('caret')
library(caret)
# install.packages('e1071')
library(e1071)
pca = preProcess(x = training_set[-14], method = 'pca', pcaComp = 2) # keep 2 components
training_set = predict(pca, training_set)
training_set = training_set[c(2, 3, 1)] # reorder columns to PC1, PC2, Customer_Segment
test_set = predict(pca, test_set)
test_set = test_set[c(2, 3, 1)]

# Fitting SVM to the Training set
# install.packages('e1071')
library(e1071)
classifier = svm(formula = Customer_Segment ~ .,
                 data = training_set,
                 type = 'C-classification',
                 kernel = 'linear')

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-3])

# Making the Confusion Matrix
cm = table(test_set[, 3], y_pred)

# Visualising the Training set results
# note: ElemStatLearn has been archived on CRAN; install from the CRAN archive if library() fails
library(ElemStatLearn)
set = training_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3],
     main = 'SVM (Training set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))

# Visualising the Test set results
library(ElemStatLearn)
set = test_set
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('PC1', 'PC2')
y_grid = predict(classifier, newdata = grid_set)
plot(set[, -3], main = 'SVM (Test set)',
     xlab = 'PC1', ylab = 'PC2',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 2, 'deepskyblue', ifelse(y_grid == 1, 'springgreen3', 'tomato')))
points(set, pch = 21, bg = ifelse(set[, 3] == 2, 'blue3', ifelse(set[, 3] == 1, 'green4', 'red3')))
```
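A base-R cross-check, a sketch rather than course code: `prcomp` performs the same decomposition. Note that `preProcess` above was fit on the training set only, so the scores won't match exactly (and component signs are arbitrary).

```{r prcomp cross-check}
# prcomp on the full Wine feature set; center/scale. reproduce the scaling step
pca_base = prcomp(dataset[-14], center = TRUE, scale. = TRUE)
summary(pca_base)       # proportion of variance explained per component
head(pca_base$x[, 1:2]) # scores on the first two principal components
```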