Social Network Advertising

Sameer Mathur

kNN Algorithm using R

---

Case Study

Social Network Advertising

Marketing Problem

Can we predict the impact of advertising on a social network on sales, i.e. whether an individual user who sees the advertisement will purchase the product?

Dataset

Consider a simple dataset of “Social Network Advertising”

Reading Social Network Advertising Dataset

# reading data
advdata <- read.csv('Social_Network_Ads.csv')

Dimensions of the Dataset

# printing Dimensions
dim(advdata)
[1] 400   5

Some Top Rows of the Dataset

# few rows of the dataset
head(advdata)
   User.ID Gender Age EstimatedSalary Purchased
1 15624510   Male  19           19000         0
2 15810944   Male  35           20000         0
3 15668575 Female  26           43000         0
4 15603246 Female  27           57000         0
5 15804002   Male  19           76000         0
6 15728773   Male  27           58000         0

Subsetting Data

# keeping only Age, EstimatedSalary and Purchased (columns 3 to 5)
subAdv <- advdata[3:5]
# dimensions of the subset dataset
dim(subAdv)
[1] 400   3

Some Top Rows of the Subset Dataset

# few rows of the subset dataset
head(subAdv)
  Age EstimatedSalary Purchased
1  19           19000         0
2  35           20000         0
3  26           43000         0
4  27           57000         0
5  19           76000         0
6  27           58000         0

Structure of the Dataset

# structure of the dataset
str(subAdv)
'data.frame':   400 obs. of  3 variables:
 $ Age            : int  19 35 26 27 19 27 27 32 25 35 ...
 $ EstimatedSalary: int  19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
 $ Purchased      : int  0 0 0 0 0 0 0 1 0 0 ...

Encoding the Target Feature as a Factor

# encoding the Purchased variable as a factor with levels 0 and 1
subAdv$Purchased = factor(subAdv$Purchased, levels = c(0, 1))
# structure of the Purchased variable
str(subAdv$Purchased)
 Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
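
Before splitting the data, it helps to see how the two classes are balanced; a minimal sketch:

# how many users purchased (1) vs. did not purchase (0)
table(subAdv$Purchased)
# the same counts as proportions
prop.table(table(subAdv$Purchased))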

Splitting the Dataset into the Training set and Test set

# Loading the required package
library(caTools)
# setting the seed so the random split is reproducible
set.seed(123)
# splitting the data into training and test sets in a 75:25 ratio
split <- sample.split(subAdv$Purchased, SplitRatio = 0.75)
# creating training dataset
trainingSet <- subset(subAdv, split == TRUE)
# creating test data set
testSet <- subset(subAdv, split == FALSE)
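
With a split ratio of 0.75 on 400 observations, the training set should hold 300 rows and the test set 100; a quick check:

# verifying the sizes of the two sets (expected: 300 training rows, 100 test rows)
nrow(trainingSet)
nrow(testSet)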

Some Top Rows of the Training Dataset

# some rows of training set
head(trainingSet)
   Age EstimatedSalary Purchased
1   19           19000         0
3   26           43000         0
6   27           58000         0
7   27           84000         0
8   32          150000         1
10  35           65000         0

Some Top Rows of the Test Dataset

# some rows of test set
head(testSet)
   Age EstimatedSalary Purchased
2   35           20000         0
4   27           57000         0
5   19           76000         0
9   25           33000         0
12  26           52000         0
18  45           26000         1

Feature Scaling

Feature scaling is a method used to standardize the range of the independent variables (features) of the data. In data processing it is also known as data normalization, and it is generally performed during the preprocessing step. It matters here because kNN classifies by distance: without scaling, EstimatedSalary (tens of thousands) would dominate Age (tens) in every distance calculation.

Formula

\[ x' = \frac{x - \bar{x}} {\sigma} \]

where \( x \) is the original feature vector, \( \bar{x} \) is the mean of that feature vector, and \( \sigma \) is its standard deviation.
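
As a quick illustration (a sketch on the Age column only), the same standardization can be done by hand and compared with R's built-in scale():

# standardizing Age by hand: subtract the mean, divide by the standard deviation
ageScaled <- (advdata$Age - mean(advdata$Age)) / sd(advdata$Age)
# scale() applies the same formula column-wise, so the two should agree
all.equal(as.numeric(scale(advdata$Age)), ageScaled)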

Applying Feature Scaling

# feature scaling of the training set
trainingSet[-3] = scale(trainingSet[-3])
# verifying feature scaling in the training set
head(trainingSet)
          Age EstimatedSalary Purchased
1  -1.7655475      -1.4733414         0
3  -1.0962966      -0.7883761         0
6  -1.0006894      -0.3602727         0
7  -1.0006894       0.3817730         0
8  -0.5226531       2.2654277         1
10 -0.2358313      -0.1604912         0
# feature scaling of the test set
testSet[-3] = scale(testSet[-3])
# verifying feature scaling in the test set
head(testSet)
          Age EstimatedSalary Purchased
2  -0.3041906      -1.5135434         0
4  -1.0599437      -0.3245603         0
5  -1.8156969       0.2859986         0
9  -1.2488820      -1.0957926         0
12 -1.1544129      -0.4852337         0
18  0.6405008      -1.3207353         1
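
Note that scale() above standardizes each set with its own mean and standard deviation. A common alternative, sketched below to be run instead of the two scale() calls above (i.e. on the unscaled split), reuses the training-set statistics for both sets so that they share exactly the same scale:

# sketch: standardize both sets with the training-set mean and standard deviation
trainMeans <- colMeans(trainingSet[-3])
trainSds   <- sapply(trainingSet[-3], sd)
trainingSet[-3] <- scale(trainingSet[-3], center = trainMeans, scale = trainSds)
testSet[-3]     <- scale(testSet[-3],     center = trainMeans, scale = trainSds)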

Fitting K-NN to the Training set and Predicting the Test set results

# loading the class package, which provides the knn() function
library(class)
# fitting kNN: each test observation is classified by its 5 nearest
# neighbours in the (scaled) training set
y_pred <- knn(train = trainingSet[, -3],  # training features
              test  = testSet[, -3],      # test features
              cl    = trainingSet[, 3],   # training class labels
              k     = 5,                  # number of neighbours
              prob  = TRUE)               # also return the winning vote share
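
Because prob = TRUE is passed, knn() also attaches, as an attribute, the proportion of the 5 nearest neighbours that voted for the winning class; a small sketch to inspect it:

# share of the 5 nearest neighbours that voted for each predicted class
voteShare <- attr(y_pred, "prob")
head(voteShare)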

Creating a Data Frame with Predicted Values

# combining the test-set features (original scale) with the predicted classes
pred_DF <- data.frame(advdata[split == FALSE, c("Age", "EstimatedSalary")],
                      y_pred)
# few rows of the predictions data frame
head(pred_DF)
   Age EstimatedSalary y_pred
2   35           20000      0
4   27           57000      0
5   19           76000      0
9   25           33000      0
12  26           52000      0
18  45           26000      1

Visualising Training Set Results Using a Contour Plot

(Figure: K-NN decision regions on the training set, Age vs. Estimated Salary)

R Code for the Contour Plot

library(ElemStatLearn)   # the plot itself uses only base graphics functions
set = trainingSet
# building a fine grid over the (scaled) Age / EstimatedSalary plane
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
# classifying every grid point with the same kNN model (k = 5)
y_grid = knn(train = trainingSet[, -3], test = grid_set, cl = trainingSet[, 3], k = 5)
# plotting the training observations
plot(set[, -3],
     main = 'K-NN (Training set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
# drawing the decision boundary as a contour of the predicted class
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
# shading the grid by predicted class and colouring points by actual class
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))

Confusion Matrix for the Test Set

# confusion matrix: rows are actual classes, columns are predicted classes
cm <- table(testSet[, 3], y_pred)
cm
   y_pred
     0  1
  0 59  5
  1  6 30
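
Out of the 100 test observations, 59 + 30 = 89 are classified correctly, i.e. the test-set accuracy is 0.89. This can be computed directly from the confusion matrix:

# overall accuracy: correct predictions divided by all test observations
accuracy <- sum(diag(cm)) / sum(cm)
accuracy
[1] 0.89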

Visualising Test Set Results Using a Contour Plot

(Figure: K-NN decision regions with the test-set observations, Age vs. Estimated Salary)

R Code for the Contour Plot

library(ElemStatLearn)
# same construction as above, but the points plotted are the test observations;
# the decision regions are still those learnt from the training set
set = testSet
X1 = seq(min(set[, 1]) - 1, max(set[, 1]) + 1, by = 0.01)
X2 = seq(min(set[, 2]) - 1, max(set[, 2]) + 1, by = 0.01)
grid_set = expand.grid(X1, X2)
colnames(grid_set) = c('Age', 'EstimatedSalary')
y_grid = knn(train = trainingSet[, -3], test = grid_set, cl = trainingSet[, 3], k = 5)
plot(set[, -3],
     main = 'K-NN (Test set)',
     xlab = 'Age', ylab = 'Estimated Salary',
     xlim = range(X1), ylim = range(X2))
contour(X1, X2, matrix(as.numeric(y_grid), length(X1), length(X2)), add = TRUE)
points(grid_set, pch = '.', col = ifelse(y_grid == 1, 'springgreen3', 'tomato'))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, 'green4', 'red3'))
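
The value k = 5 is taken as given above. As a rough sketch (assuming the scaled trainingSet and testSet created earlier), the test-set accuracy could be compared across a few candidate values of k:

# sketch: comparing test-set accuracy for several values of k
library(class)
for (k in c(3, 5, 7, 9, 11)) {
  pred_k <- knn(train = trainingSet[, -3], test = testSet[, -3],
                cl = trainingSet[, 3], k = k)
  cat("k =", k, " accuracy =", mean(pred_k == testSet[, 3]), "\n")
}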