Sameer Mathur
kNN Algorithm using R
---
Can we predict the impact of Advertising in Social Media on Sales?
Consider a simple dataset of “Social Network Advertising”
# Load the Social Network Ads data from the CSV file in the working directory
advdata <- read.csv(file = "Social_Network_Ads.csv")
# Print the dimensions (number of rows, number of columns) of the full dataset
dim(advdata)
[1] 400 5
# Show the first few rows of the full dataset
head(advdata)
User.ID Gender Age EstimatedSalary Purchased
1 15624510 Male 19 19000 0
2 15810944 Male 35 20000 0
3 15668575 Female 26 43000 0
4 15603246 Female 27 57000 0
5 15804002 Male 19 76000 0
6 15728773 Male 27 58000 0
# Keep only the two predictors and the response (columns 3 to 5 of advdata)
subAdv <- advdata[, c("Age", "EstimatedSalary", "Purchased")]
# Print the dimensions (rows, columns) of the subset dataset
dim(subAdv)
[1] 400 3
# Show the first few rows of the subset dataset
head(subAdv)
Age EstimatedSalary Purchased
1 19 19000 0
2 35 20000 0
3 26 43000 0
4 27 57000 0
5 19 76000 0
6 27 58000 0
# Compact summary of the subset's structure: each column's type and first values
str(subAdv)
'data.frame': 400 obs. of 3 variables:
$ Age : int 19 35 26 27 19 27 27 32 25 35 ...
$ EstimatedSalary: int 19000 20000 43000 57000 76000 58000 84000 150000 33000 65000 ...
$ Purchased : int 0 0 0 0 0 0 0 1 0 0 ...
# Encode the Purchased variable as a factor with levels 0 and 1
subAdv[["Purchased"]] <- factor(subAdv[["Purchased"]], levels = c(0, 1))
# Structure of the Purchased variable (now a factor)
str(subAdv$Purchased)
Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
# caTools supplies sample.split()
library(caTools)
# Fix the random seed so the same observations land in each set on every run
set.seed(123)
# Logical flag per observation: TRUE -> training set, FALSE -> test set
# (split on the Purchased labels with a 0.75:0.25 ratio)
split <- sample.split(subAdv$Purchased, SplitRatio = 0.75)
# Training set: observations flagged TRUE
trainingSet <- subAdv[split, ]
# Test set: observations flagged FALSE
testSet <- subAdv[!split, ]
# Show the first few rows of the training set
head(trainingSet)
Age EstimatedSalary Purchased
1 19 19000 0
3 26 43000 0
6 27 58000 0
7 27 84000 0
8 32 150000 1
10 35 65000 0
# Show the first few rows of the test set
head(testSet)
Age EstimatedSalary Purchased
2 35 20000 0
4 27 57000 0
5 19 76000 0
9 25 33000 0
12 26 52000 0
18 45 26000 1
Feature scaling. Feature scaling is a method used to standardize the range of independent variables or features of data. In data processing, it is also known as data normalization and is generally performed during the data preprocessing step.
Here each feature is standardized as \( x' = \frac{x - \bar{x}}{\sigma} \), where \( x \) is the original feature vector, \( \bar{x} \) is the mean of that feature vector, and \( \sigma \) is its standard deviation.
# Standardise the two predictor columns of the training set (mean 0, sd 1);
# the Purchased column is left untouched
trainingSet[c("Age", "EstimatedSalary")] <-
  scale(trainingSet[c("Age", "EstimatedSalary")])
# Verify the feature scaling in the training set
head(trainingSet)
Age EstimatedSalary Purchased
1 -1.7655475 -1.4733414 0
3 -1.0962966 -0.7883761 0
6 -1.0006894 -0.3602727 0
7 -1.0006894 0.3817730 0
8 -0.5226531 2.2654277 1
10 -0.2358313 -0.1604912 0
# Feature scaling of the test set.
# The test predictors are standardised with the TRAINING set's mean and
# standard deviation, not the test set's own. k-NN compares test points with
# training points by distance, so both must live in the same coordinate system;
# scaling each set independently shifts and stretches them differently.
# The training statistics are recomputed from the unscaled rows of subAdv that
# were assigned to the training set (split is TRUE), because trainingSet itself
# has already been overwritten with scaled values.
# NOTE(review): printed results later in this document (scaled test rows,
# confusion matrix) were generated with the previous per-set scaling and should
# be regenerated.
train_raw <- subAdv[split, -3]
testSet[-3] <- scale(
  testSet[-3],
  center = colMeans(train_raw),
  scale = vapply(train_raw, sd, numeric(1))
)
# Verify the feature scaling in the test set
head(testSet)
Age EstimatedSalary Purchased
2 -0.3041906 -1.5135434 0
4 -1.0599437 -0.3245603 0
5 -1.8156969 0.2859986 0
9 -1.2488820 -1.0957926 0
12 -1.1544129 -0.4852337 0
18 0.6405008 -1.3207353 1
# The class package supplies knn()
library(class)
# Fit the k-NN classifier (k = 5) on the scaled training predictors and
# classify every test row; prob = TRUE asks knn() to also return the
# winning-class vote proportion as the "prob" attribute of the result.
y_pred <- knn(
  train = trainingSet[, -3],
  test = testSet[, -3],
  cl = trainingSet$Purchased,
  k = 5,
  prob = TRUE
)
# Build a data frame pairing each TEST observation's original (unscaled) Age
# and EstimatedSalary with its predicted class.
# y_pred holds one prediction per test row only. Combining it with the full
# subAdv columns would make data.frame() silently recycle the predictions
# across all rows (the lengths divide evenly), attaching predictions to the
# wrong observations. Selecting the rows where split is FALSE keeps them
# aligned; the original row names are retained so each prediction can be traced
# back to its observation.
# NOTE(review): the listing printed after this chunk was produced before this
# fix and should be regenerated.
test_rows <- !split
stopifnot(sum(test_rows) == length(y_pred))
pred_DF <- data.frame(
  subAdv[test_rows, c("Age", "EstimatedSalary")],
  y_pred = y_pred
)
Age EstimatedSalary y_pred
1 19 19000 0
2 35 20000 0
3 26 43000 0
4 27 57000 0
5 19 76000 0
6 27 58000 1
7 27 84000 1
8 32 150000 1
9 25 33000 0
10 35 65000 0
11 26 80000 1
12 26 52000 0
# Decision-region plot for the training set.
library(ElemStatLearn)
set <- trainingSet
# Grid axes: 0.01 steps covering the observed range of each scaled feature,
# padded by one unit on both sides
age_span <- range(set[, 1]) + c(-1, 1)
salary_span <- range(set[, 2]) + c(-1, 1)
X1 <- seq(from = age_span[1], to = age_span[2], by = 0.01)
X2 <- seq(from = salary_span[1], to = salary_span[2], by = 0.01)
# Every (Age, EstimatedSalary) combination on the grid, named like the predictors
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
# Classify each grid point with the same k-NN model (k = 5, training data)
y_grid <- knn(
  train = trainingSet[, -3],
  test = grid_set,
  cl = trainingSet[, 3],
  k = 5
)
# Empty-background scatter of the observations, sized to the padded grid
plot(
  set[, -3],
  main = "K-NN (Training set)",
  xlab = "Age",
  ylab = "Estimated Salary",
  xlim = range(X1),
  ylim = range(X2)
)
# Boundary between predicted classes
grid_classes <- matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2))
contour(X1, X2, grid_classes, add = TRUE)
# Shade the grid by predicted class, then overlay observations by actual class
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "tomato"))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))
# Confusion matrix: actual test labels (rows) against predicted labels (columns)
cm <- table(testSet$Purchased, y_pred)
print(cm)
y_pred
0 1
0 59 5
1 6 30
# Decision-region plot for the test set.
library(ElemStatLearn)
set <- testSet
# Grid axes: 0.01 steps covering the observed range of each scaled feature,
# padded by one unit on both sides
age_span <- range(set[, 1]) + c(-1, 1)
salary_span <- range(set[, 2]) + c(-1, 1)
X1 <- seq(from = age_span[1], to = age_span[2], by = 0.01)
X2 <- seq(from = salary_span[1], to = salary_span[2], by = 0.01)
# Every (Age, EstimatedSalary) combination on the grid, named like the predictors
grid_set <- expand.grid(X1, X2)
colnames(grid_set) <- c("Age", "EstimatedSalary")
# Classify each grid point with the k-NN model built from the training data (k = 5)
y_grid <- knn(
  train = trainingSet[, -3],
  test = grid_set,
  cl = trainingSet[, 3],
  k = 5
)
# Scatter of the test observations, sized to the padded grid
plot(
  set[, -3],
  main = "K-NN (Test set)",
  xlab = "Age",
  ylab = "Estimated Salary",
  xlim = range(X1),
  ylim = range(X2)
)
# Boundary between predicted classes
grid_classes <- matrix(as.numeric(y_grid), nrow = length(X1), ncol = length(X2))
contour(X1, X2, grid_classes, add = TRUE)
# Shade the grid by predicted class, then overlay observations by actual class
points(grid_set, pch = ".", col = ifelse(y_grid == 1, "springgreen3", "tomato"))
points(set, pch = 21, bg = ifelse(set[, 3] == 1, "green4", "red3"))