First things first. Let’s import the dataset and look at what data we have here.
# NOTE(review): hard-coded, user-specific working directory; the script only
# runs unchanged on that machine. It is kept because the CSV files in this
# script are read relative to the working directory -- prefer launching R from
# the data directory (or an RStudio project) instead.
setwd("/Users/jennycheng/Downloads")
# Importing the dataset.
# stringsAsFactors = TRUE keeps Gender a factor on R >= 4.0 (where read.csv no
# longer converts strings by default), so str() and summary() keep matching
# the recorded output below.
dataset <- read.csv('Social_Network_Ads.csv', stringsAsFactors = TRUE)
# What data types do we have?
str(dataset)
## 'data.frame': 400 obs. of 5 variables:
## $ User.ID : int 15624510 15810944 15668575 15603246 15804002 15728773 15598044 15694829 15600575 15727311 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 2 2 1 1 2 2 1 1 2 1 ...
## $ Age : int 19 35 26 27 19 27 27 32 25 35 ...
## $ EstimatedSalary: int 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
## $ Purchased : int 0 0 0 0 0 0 0 1 0 0 ...
head(dataset)
## User.ID Gender Age EstimatedSalary Purchased
## 1 15624510 Male 19 19000 0
## 2 15810944 Male 35 20000 0
## 3 15668575 Female 26 43000 0
## 4 15603246 Female 27 57000 0
## 5 15804002 Male 19 76000 0
## 6 15728773 Male 27 58000 0
summary(dataset)
## User.ID Gender Age EstimatedSalary
## Min. :15566689 Female:204 Min. :18.00 Min. : 15000
## 1st Qu.:15626764 Male :196 1st Qu.:29.75 1st Qu.: 43000
## Median :15694342 Median :37.00 Median : 70000
## Mean :15691540 Mean :37.66 Mean : 69742
## 3rd Qu.:15750363 3rd Qu.:46.00 3rd Qu.: 88000
## Max. :15815236 Max. :60.00 Max. :150000
## Purchased
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3575
## 3rd Qu.:1.0000
## Max. :1.0000
# We can conclude there are 5 variables in this dataset, namely:
# User ID
# Gender
# Age
# Estimated Salary
# Purchased
# For the purposes of our modelling, we are interested in how age and salary
# affect a customer's purchase decision when a specific ad is shown. Hence the
# dataset is overwritten to keep only these two independent variables plus the
# target (Purchased).
# The columns are picked by name (they are columns 3 to 5 in the str() output
# above), so the selection does not silently change if the CSV column order
# does.
dataset <- dataset[c('Age', 'EstimatedSalary', 'Purchased')]
Now let’s do some data pre-processing.
# Encode the target feature (purchase decision) as a two-level factor.
dataset$Purchased <- factor(dataset$Purchased, levels = c(0, 1))

# Split the dataset into a training set (75%) and a test set (25%).
# The fixed seed makes the random split reproducible.
library(caTools)
set.seed(123)
split <- sample.split(dataset$Purchased, SplitRatio = 0.75)
training_set <- subset(dataset, split == TRUE)
test_set <- subset(dataset, split == FALSE)

# Feature scaling - standardise the independent variables (every column except
# column 3, the target).
# The centre and spread are estimated on the training set only and then reused
# for the test set. Scaling the test set with its own mean/sd would put the two
# sets on different scales and leak test-set statistics into preprocessing.
scaled_training <- scale(training_set[-3])
training_set[-3] <- scaled_training
test_set[-3] <- scale(test_set[-3],
                      center = attr(scaled_training, "scaled:center"),
                      scale = attr(scaled_training, "scaled:scale"))
# NOTE(review): the accuracy figures recorded further down were produced with
# the test set scaled by its own statistics; re-run to refresh them.
Now that we have completed our data pre-processing, it’s time to build models! Let’s start with a simple logistic regression model.
# Fit a logistic regression of Purchased on all remaining columns of the
# training set (the two predictors).
classifier <- glm(formula = Purchased ~ .,
                  family = binomial,
                  data = training_set)

# Predicted purchase probabilities for the test set, turned into 0/1 labels
# with a 0.5 cut-off.
prob_pred <- predict(classifier, type = 'response', newdata = test_set[-3])
y_pred <- ifelse(prob_pred > 0.5, 1, 0)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
# y_pred already holds 0/1 labels, so it is tabulated directly; thresholding it
# a second time only relabelled the columns FALSE/TRUE.
cm <- table(test_set[, 3], y_pred)

# Visualising the Training set and then the Test set results.
# For each set: build a fine grid (step 0.01) spanning the observed predictor
# range padded by 1 on each side, classify every grid point, shade the two
# predicted regions, draw the decision boundary and overlay the observations
# coloured by their true class.
# NOTE(review): nothing in this block appears to call an ElemStatLearn
# function; the library() call is kept as in the original script.
library(ElemStatLearn)
plot_sets <- list(Training = training_set, Test = test_set)
for (set_name in names(plot_sets)) {
  set <- plot_sets[[set_name]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  # The grid columns must carry the predictor names the model was fitted on.
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  prob_set <- predict(classifier, type = 'response', newdata = grid_set)
  y_grid <- ifelse(prob_set > 0.5, 1, 0)
  plot(set[, -3],
       main = paste0('Logistic Regression (', set_name, ' set)'),
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our logistic regression model perform against the test set?
# Accuracy = 1 - share of test observations whose predicted label differs from
# the actual one.
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.83"
The logistic regression model had an accuracy of 83%.
Now we move on to another classification model: Naive Bayes.
# Naive Bayes
# Fit a Naive Bayes classifier (e1071::naiveBayes) on the training predictors
# (every column except column 3) with Purchased as the class label.
library(e1071)
classifier <- naiveBayes(x = training_set[-3],
                         y = training_set$Purchased)

# Predicted classes for the test set.
y_pred <- predict(classifier, newdata = test_set[-3])

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set[, 3], y_pred)

# Visualising the Training set results, then the Test set results.
# For each set: classify a fine grid (step 0.01) covering the predictor range
# padded by 1, shade the predicted regions, draw the boundary and overlay the
# observations coloured by their true class.
library(ElemStatLearn)
plot_sets <- list(training_set, test_set)
plot_titles <- c('Naive bayes (Training set)', 'Naive Bayes (Test set)')
for (panel in seq_along(plot_sets)) {
  set <- plot_sets[[panel]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  y_grid <- predict(classifier, newdata = grid_set)
  plot(set[, -3],
       main = plot_titles[panel],
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2,
          matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
          add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our Naive Bayes model perform against the test set?
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.86"
# The Naive Bayes model had an accuracy of 86%, which is better than the
# simple logistic regression model.
# Next, let's look at the problem from a decision tree perspective.
# Decision Tree Classification
# Fit a classification tree (rpart) of Purchased on all remaining columns of
# the training set.
# install.packages('rpart')
library(rpart)
classifier <- rpart(formula = Purchased ~ .,
                    data = training_set)

# Predicted classes for the test set (type = 'class' returns labels rather
# than class probabilities).
y_pred <- predict(classifier, newdata = test_set[-3], type = 'class')

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set[, 3], y_pred)
cm
## y_pred
## 0 1
## 0 53 11
## 1 6 30

# Visualising the Training set results, then the Test set results.
# For each set: classify a fine grid (step 0.01) covering the predictor range
# padded by 1, shade the predicted regions, draw the boundary and overlay the
# observations coloured by their true class.
library(ElemStatLearn)
plot_sets <- list(training_set, test_set)
plot_titles <- c('Decision Tree Classification (Training set)',
                 'Decision Tree Classification (Test set)')
for (panel in seq_along(plot_sets)) {
  set <- plot_sets[[panel]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  y_grid <- predict(classifier, newdata = grid_set, type = 'class')
  plot(set[, -3],
       main = plot_titles[panel],
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2,
          matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
          add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our decision tree classification model perform against the
# test set?
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.83"
# Plotting a decision tree
# The data is re-imported and re-split here WITHOUT feature scaling --
# presumably so that the split thresholds shown in the plot are in the
# original units (years, salary) rather than standardised values.
dataset_dt <- read.csv('Social_Network_Ads.csv')
dataset_dt <- dataset_dt[3:5]

# Encode the target feature as a two-level factor.
dataset_dt$Purchased <- factor(dataset_dt$Purchased, levels = c(0, 1))

# Split into training and test sets with the same seed and ratio as before.
# install.packages('caTools')
library(caTools)
set.seed(123)
split <- sample.split(dataset_dt$Purchased, SplitRatio = 0.75)
training_set_dt <- dataset_dt[split, ]
test_set_dt <- dataset_dt[!split, ]

# Fit the classification tree on the unscaled training set.
library(rpart)
classifier_dt <- rpart(formula = Purchased ~ .,
                       data = training_set_dt)

# Draw the tree skeleton, then add the split/leaf labels on top of it.
plot(classifier_dt, margin = 0.1)
text(classifier_dt)
The decision tree model has an accuracy of 83%, so it was not as accurate as Naive Bayes (86%). Nevertheless, you can still see how valuable it is for its interpretability.
Some important things we can notice from our decision tree.
At the first split, a user younger than 44.5 years goes to the left branch, where a second condition applies. If the user is younger than 44.5 years AND their estimated salary is less than $90,000, then according to our decision tree classifier, they will NOT buy the product. If this younger user earns more than $90,000, however, then the classifier predicts that they will buy the product.
A user who is between ages 44.5 and 49.5 years old will only buy the product if his salary is more than $81,000.
If the user is older than 49.5 years old, our decision tree classifier predicts the user is very likely to buy the product.
What happens if we model after many decision trees? Will it improve the performance of our model?
# Random Forest Classification
# Fit a random forest of 500 trees on the training predictors (every column
# except column 3) with Purchased as the class label. The seed is fixed
# before fitting.
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
set.seed(123)
classifier <- randomForest(x = training_set[-3],
                           y = training_set$Purchased,
                           ntree = 500)

# Predicted classes for the test set.
y_pred <- predict(classifier, newdata = test_set[-3])

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set[, 3], y_pred)

# Visualising the Training set results, then the Test set results.
# For each set: classify a fine grid (step 0.01) covering the predictor range
# padded by 1, shade the predicted regions, draw the boundary and overlay the
# observations coloured by their true class.
library(ElemStatLearn)
plot_sets <- list(training_set, test_set)
plot_titles <- c('Random Forest Classification (Training set)',
                 'Random Forest Classification (Test set)')
for (panel in seq_along(plot_sets)) {
  set <- plot_sets[[panel]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  y_grid <- predict(classifier, grid_set)
  plot(set[, -3],
       main = plot_titles[panel],
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2,
          matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
          add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our random forest classification model perform against the
# test set?
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.85"
The random forest classification model improved on our single decision tree model, achieving an accuracy of 85%.
Next, we will look at the K Nearest-Neighbour classification model.
# K-Nearest Neighbours (k = 5).
# class::knn has no separate fit step: it classifies the test rows directly
# from the labelled training rows. prob = TRUE additionally requests the
# winning-class vote share for each prediction.
library(class)
y_pred <- knn(train = training_set[, -3],
              test = test_set[, -3],
              cl = training_set[, 3],
              k = 5,
              prob = TRUE)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set[, 3], y_pred)

# Visualising the Training set results, then the Test set results.
# For each set: classify a fine grid (step 0.01) covering the predictor range
# padded by 1 (always against the training rows), shade the predicted regions,
# draw the boundary and overlay the observations coloured by their true class.
library(ElemStatLearn)
plot_sets <- list(training_set, test_set)
plot_titles <- c('K-NN (Training set)', 'K-NN (Test set)')
for (panel in seq_along(plot_sets)) {
  set <- plot_sets[[panel]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  y_grid <- knn(train = training_set[, -3],
                test = grid_set,
                cl = training_set[, 3],
                k = 5)
  plot(set[, -3],
       main = plot_titles[panel],
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2,
          matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
          add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our K-Nearest Neighbour model perform against the test set?
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.89"
The KNN model has performed really well, giving an accuracy of 89% against the test set.
And finally, let’s try the Kernel SVM model.
# Kernel SVM
# Fit a C-classification support vector machine with a radial kernel
# (e1071::svm) of Purchased on all remaining columns of the training set.
# install.packages('e1071')
library(e1071)
classifier <- svm(formula = Purchased ~ .,
                  data = training_set,
                  type = 'C-classification',
                  kernel = 'radial')

# Predicted classes for the test set.
y_pred <- predict(classifier, newdata = test_set[-3])

# Confusion matrix: rows are the actual classes, columns the predicted ones.
cm <- table(test_set[, 3], y_pred)

# Visualising the Training set results, then the Test set results.
# For each set: classify a fine grid (step 0.01) covering the predictor range
# padded by 1, shade the predicted regions, draw the boundary and overlay the
# observations coloured by their true class.
library(ElemStatLearn)
plot_sets <- list(training_set, test_set)
plot_titles <- c('Kernel SVM (Training set)', 'Kernel SVM (Test set)')
for (panel in seq_along(plot_sets)) {
  set <- plot_sets[[panel]]
  X1 <- seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
  X2 <- seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
  grid_set <- expand.grid(X1, X2)
  colnames(grid_set) <- c('Age', 'EstimatedSalary')
  y_grid <- predict(classifier, newdata = grid_set)
  plot(set[, -3],
       main = plot_titles[panel],
       xlab = 'Age', ylab = 'Estimated Salary',
       xlim = range(X1), ylim = range(X2))
  contour(X1, X2,
          matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2)),
          add = TRUE)
  points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
  points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
}

# How well did our Kernel SVM model perform against the test set?
print(paste('Accuracy of Model:', 1 - mean(y_pred != test_set$Purchased)))
## [1] "Accuracy of Model: 0.9"
It appears that our Kernel SVM performed the best out of the 6 models. To further validate its performance, let’s conduct k-fold cross-validation.
# 10-fold cross-validation of the kernel SVM on the training set.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
# createFolds returns, for each of the 10 folds, the row indices to hold out.
folds <- createFolds(training_set$Purchased, k = 10)

# For every fold: train on the other rows, predict the held-out rows and
# return the share of correct predictions (confusion-matrix diagonal over the
# total count).
cv <- lapply(folds, function(held_out_rows) {
  fold_train <- training_set[-held_out_rows, ]
  fold_test <- training_set[held_out_rows, ]
  fold_model <- svm(formula = Purchased ~ .,
                    data = fold_train,
                    type = 'C-classification',
                    kernel = 'radial')
  fold_pred <- predict(fold_model, newdata = fold_test[-3])
  fold_cm <- table(fold_test[, 3], fold_pred)
  sum(diag(fold_cm)) / sum(fold_cm)
})

# Cross-validated accuracy = mean of the per-fold accuracies.
accuracy <- mean(unlist(cv))
accuracy
## [1] 0.9130886
Accuracy of the model is 91% following the k-fold cross validation method.
Finally, to improve our model’s performance, let’s fine tune our chosen model’s parameters!
# To improve our model's performance, let's fine tune our model's parameters!
# Grid search with the caret package: train() fits the radial-kernel SVM
# ('svmRadial') for several candidate parameter values and keeps the best one.
# The recorded output below shows what was tried (three values of C, sigma
# held constant) and how it was scored (bootstrap resampling, accuracy).
library(caret)
classifier <- train(
  form = Purchased ~ .,
  data = training_set,
  method = 'svmRadial'
)
classifier
## Support Vector Machines with Radial Basis Function Kernel
##
## 300 samples
## 2 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 300, 300, 300, 300, 300, 300, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.9166434 0.8173727
## 0.50 0.9188799 0.8221778
## 1.00 0.9199985 0.8243464
##
## Tuning parameter 'sigma' was held constant at a value of 1.672135
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 1.672135 and C = 1.
By fine-tuning our chosen model’s parameters to sigma = 1.672135 and C = 1, our model reaches an accuracy of about 92% (estimated by caret’s bootstrap resampling on the training set)!