##Project Decription:

The goal of this project is to figure out who is probably going to click on an internet security ad. It looks at the data closely to understand the traits of people who have clicked on the ad and uses this information to make predictions about who might click in the future. The project uses four specific ways to learn from the data: K-Nearest Neighbours, Naive Bayes, Support Vector Machines, and Random Forests.

# Importing the required packages
library("data.table")
library("tidyr")
## Warning: package 'tidyr' was built under R version 4.3.1
library("lubridate")
## Warning: package 'lubridate' was built under R version 4.3.1
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library("plyr")
## Warning: package 'plyr' was built under R version 4.3.1
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.3.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

The following objects are masked from ‘package:data.table’:

hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year

The following objects are masked from ‘package:base’:

date, intersect, setdiff, union

library("ggcorrplot")
## Warning: package 'ggcorrplot' was built under R version 4.3.1
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.3.1
library("ggplot2")
library("corrplot")
## Warning: package 'corrplot' was built under R version 4.3.1
## corrplot 0.92 loaded
## corrplot 0.84 loaded
library("moments")
library("psych")
## Warning: package 'psych' was built under R version 4.3.1
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

Attaching package: ‘psych’

The following objects are masked from ‘package:ggplot2’:

library("countrycode")
## Warning: package 'countrycode' was built under R version 4.3.1
library("class")
## Warning: package 'class' was built under R version 4.3.1
library("rpart")
## Warning: package 'rpart' was built under R version 4.3.1
library("rpart.plot")
## Warning: package 'rpart.plot' was built under R version 4.3.1
library("mlbench")
## Warning: package 'mlbench' was built under R version 4.3.1
library("e1071")
## Warning: package 'e1071' was built under R version 4.3.1
## 
## Attaching package: 'e1071'
## The following objects are masked from 'package:moments':
## 
##     kurtosis, moment, skewness

Attaching package: ‘e1071’

The following objects are masked from ‘package:moments’:

kurtosis, moment, skewness

library("rpart")
library("caret")
## Warning: package 'caret' was built under R version 4.3.1
## Loading required package: lattice

Loading required package: lattice

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

lift

library("ranger")
## Warning: package 'ranger' was built under R version 4.3.1
library("kernlab")
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:psych':
## 
##     alpha
## The following object is masked from 'package:ggplot2':
## 
##     alpha

Attaching package: ‘kernlab’

The following object is masked from ‘package:psych’:

alpha ## The following object is masked from ‘package:purrr’: cross ## The following object is masked from ‘package:ggplot2’: alpha

library("pdp")

Attaching package: ‘pdp’

The following object is masked from ‘package:purrr’:

partial

library("vip")
## Warning: package 'vip' was built under R version 4.3.1
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi

Attaching package: ‘vip’

The following object is masked from ‘package:utils’:

vi # Loading the Dataset

ad_df <- read.csv(url("http://bit.ly/IPAdvertisingData"))

Previewing The First Seven records in the Dataset

head(ad_df, n=7)
##   Daily.Time.Spent.on.Site Age Area.Income Daily.Internet.Usage
## 1                    68.95  35    61833.90               256.09
## 2                    80.23  31    68441.85               193.77
## 3                    69.47  26    59785.94               236.50
## 4                    74.15  29    54806.18               245.89
## 5                    68.37  35    73889.99               225.58
## 6                    59.99  23    59761.56               226.74
## 7                    88.91  33    53852.85               208.36
##                           Ad.Topic.Line           City Male    Country
## 1    Cloned 5thgeneration orchestration    Wrightburgh    0    Tunisia
## 2    Monitored national standardization      West Jodi    1      Nauru
## 3      Organic bottom-line service-desk       Davidton    0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt    1      Italy
## 5         Robust logistical utilization   South Manuel    0    Iceland
## 6       Sharable client-driven software      Jamieberg    1     Norway
## 7            Enhanced dedicated support    Brandonstad    0    Myanmar
##             Timestamp Clicked.on.Ad
## 1 2016-03-27 00:53:11             0
## 2 2016-04-04 01:39:02             0
## 3 2016-03-13 20:35:42             0
## 4 2016-01-10 02:31:19             0
## 5 2016-06-03 03:36:18             0
## 6 2016-05-19 14:30:17             0
## 7 2016-01-28 20:59:32             0
# Previewing The Last Seven records in the Dataset
tail(ad_df, n=7)
##      Daily.Time.Spent.on.Site Age Area.Income Daily.Internet.Usage
## 994                     64.20  27    66200.96               227.63
## 995                     43.70  28    63126.96               173.01
## 996                     72.97  30    71384.57               208.58
## 997                     51.30  45    67782.17               134.42
## 998                     51.63  51    42415.72               120.37
## 999                     55.55  19    41920.79               187.95
## 1000                    45.01  26    29875.80               178.35
##                             Ad.Topic.Line          City Male
## 994        Phased zero tolerance extranet  Edwardsmouth    1
## 995         Front-line bifurcated ability  Nicholasland    0
## 996         Fundamental modular algorithm     Duffystad    1
## 997       Grass-roots cohesive monitoring   New Darlene    1
## 998          Expanded intangible solution South Jessica    1
## 999  Proactive bandwidth-monitored policy   West Steven    0
## 1000      Virtual 5thgeneration emulation   Ronniemouth    0
##                     Country           Timestamp Clicked.on.Ad
## 994             Isle of Man 2016-02-11 23:45:01             0
## 995                 Mayotte 2016-04-04 03:57:48             1
## 996                 Lebanon 2016-02-11 21:49:00             1
## 997  Bosnia and Herzegovina 2016-04-22 02:07:01             1
## 998                Mongolia 2016-02-01 17:24:57             1
## 999               Guatemala 2016-03-24 02:35:54             0
## 1000                 Brazil 2016-06-03 21:43:21             1
# Checking the Data Dimensions
dim(ad_df)
## [1] 1000   10
# Checking the Data Dimensions
dim(ad_df)
## [1] 1000   10
# Checking the Data Dimensions
dim(ad_df)
## [1] 1000   10

Data Preparation

# Check column names
colnames(ad_df)
##  [1] "Daily.Time.Spent.on.Site" "Age"                     
##  [3] "Area.Income"              "Daily.Internet.Usage"    
##  [5] "Ad.Topic.Line"            "City"                    
##  [7] "Male"                     "Country"                 
##  [9] "Timestamp"                "Clicked.on.Ad"
# Renaming column names
names(ad_df)[1] <- "daily_time_spent_on_site"
names(ad_df)[2] <- "age"
names(ad_df)[3] <- "area_income"
names(ad_df)[4] <- "daily_internet_usage"
names(ad_df)[5] <- "ad_topic_line"
names(ad_df)[6] <- "city"
names(ad_df)[7] <- "male"
names(ad_df)[8] <- "country"
names(ad_df)[9] <- "timestamp"
names(ad_df)[10] <- "clicked_on_ad"
# Checking whether the column names have been changed
colnames(ad_df)
##  [1] "daily_time_spent_on_site" "age"                     
##  [3] "area_income"              "daily_internet_usage"    
##  [5] "ad_topic_line"            "city"                    
##  [7] "male"                     "country"                 
##  [9] "timestamp"                "clicked_on_ad"
# Checking for the length of unique values in each column
lapply(ad_df, function (x) {length(unique(x))})
## $daily_time_spent_on_site
## [1] 900
## 
## $age
## [1] 43
## 
## $area_income
## [1] 1000
## 
## $daily_internet_usage
## [1] 966
## 
## $ad_topic_line
## [1] 1000
## 
## $city
## [1] 969
## 
## $male
## [1] 2
## 
## $country
## [1] 237
## 
## $timestamp
## [1] 1000
## 
## $clicked_on_ad
## [1] 2

##We can observe that the ’Male’and ’Clicked_on_ad’columns are categorical since they only have 2 factor variables

# Converting timestamp column to datetime datatype
ad_df[["timestamp"]] <- as.POSIXct(ad_df$timestamp, tz=Sys.timezone())
str(ad_df)
## 'data.frame':    1000 obs. of  10 variables:
##  $ daily_time_spent_on_site: num  69 80.2 69.5 74.2 68.4 ...
##  $ age                     : int  35 31 26 29 35 23 33 48 30 20 ...
##  $ area_income             : num  61834 68442 59786 54806 73890 ...
##  $ daily_internet_usage    : num  256 194 236 246 226 ...
##  $ ad_topic_line           : chr  "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
##  $ city                    : chr  "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
##  $ male                    : int  0 1 0 1 0 1 0 1 1 1 ...
##  $ country                 : chr  "Tunisia" "Nauru" "San Marino" "Italy" ...
##  $ timestamp               : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
##  $ clicked_on_ad           : int  0 0 0 0 0 0 0 1 0 0 ...
glimpse(ad_df)
## Rows: 1,000
## Columns: 10
## $ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, 88.…
## $ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49, 3…
## $ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73889…
## $ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 226.7…
## $ ad_topic_line            <chr> "Cloned 5thgeneration orchestration", "Monito…
## $ city                     <chr> "Wrightburgh", "West Jodi", "Davidton", "West…
## $ male                     <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, …
## $ country                  <chr> "Tunisia", "Nauru", "San Marino", "Italy", "I…
## $ timestamp                <dttm> 2016-03-27 00:53:11, 2016-04-04 01:39:02, 20…
## $ clicked_on_ad            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, …

##We can observe that the change has taken shape successfully. We now want to split the column to date and time

# Splitting datetime into date and time
Time <- format(as.POSIXct(strptime(ad_df$timestamp,"%Y-%m-%d %H:%M:%S",tz="")) ,format = "%H:%M:%S")
head(Time)
## [1] "00:53:11" "01:39:02" "20:35:42" "02:31:19" "03:36:18" "14:30:17"
Dates <- format(as.POSIXct(strptime(ad_df$timestamp,"%Y-%m-%d %H:%M:%S",tz="")) ,format = "%Y-%m-%d")
head(Dates)
## [1] "2016-03-27" "2016-04-04" "2016-03-13" "2016-01-10" "2016-06-03"
## [6] "2016-05-19"
ad_df$Dates <- Dates
ad_df$Time <- Time
str(ad_df)
## 'data.frame':    1000 obs. of  12 variables:
##  $ daily_time_spent_on_site: num  69 80.2 69.5 74.2 68.4 ...
##  $ age                     : int  35 31 26 29 35 23 33 48 30 20 ...
##  $ area_income             : num  61834 68442 59786 54806 73890 ...
##  $ daily_internet_usage    : num  256 194 236 246 226 ...
##  $ ad_topic_line           : chr  "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
##  $ city                    : chr  "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
##  $ male                    : int  0 1 0 1 0 1 0 1 1 1 ...
##  $ country                 : chr  "Tunisia" "Nauru" "San Marino" "Italy" ...
##  $ timestamp               : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
##  $ clicked_on_ad           : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ Dates                   : chr  "2016-03-27" "2016-04-04" "2016-03-13" "2016-01-10" ...
##  $ Time                    : chr  "00:53:11" "01:39:02" "20:35:42" "02:31:19" ...
# Separating dates to hours minutes and days and dropping the timestamp column
ad_df <- separate(ad_df, "Dates", c("year", "month", "day"), sep = "-")
9
## [1] 9
ad_df <- separate(ad_df, "Time", c("hour", "minutes", "seconds"), sep = ":")
colnames(ad_df)
##  [1] "daily_time_spent_on_site" "age"                     
##  [3] "area_income"              "daily_internet_usage"    
##  [5] "ad_topic_line"            "city"                    
##  [7] "male"                     "country"                 
##  [9] "timestamp"                "clicked_on_ad"           
## [11] "year"                     "month"                   
## [13] "day"                      "hour"                    
## [15] "minutes"                  "seconds"
# Changing the new derived columns to factors for ease of analysis
ad_df$Male = factor(ad_df$male)
ad_df$Year = factor(ad_df$year)
ad_df$Month = factor(ad_df$month)
ad_df$Day = factor(ad_df$day)
ad_df$Hour = factor(ad_df$hour)
ad_df$Minutes = factor(ad_df$minutes)
ad_df$Seconds = factor(ad_df$seconds)

##We can see that the date and time have their respective columns #——- ## Completeness

# Checking for missing values
colSums(is.na(ad_df))
## daily_time_spent_on_site                      age              area_income 
##                        0                        0                        0 
##     daily_internet_usage            ad_topic_line                     city 
##                        0                        0                        0 
##                     male                  country                timestamp 
##                        0                        0                        0 
##            clicked_on_ad                     year                    month 
##                        0                        0                        0 
##                      day                     hour                  minutes 
##                        0                        0                        0 
##                  seconds                     Male                     Year 
##                        0                        0                        0 
##                    Month                      Day                     Hour 
##                        0                        0                        0 
##                  Minutes                  Seconds 
##                        0                        0

##Our data is complete hence no missing values #——- ## Consistency

# Checking for duplicate values
duplicates <- ad_df[duplicated(ad_df),]
duplicates
##  [1] daily_time_spent_on_site age                      area_income             
##  [4] daily_internet_usage     ad_topic_line            city                    
##  [7] male                     country                  timestamp               
## [10] clicked_on_ad            year                     month                   
## [13] day                      hour                     minutes                 
## [16] seconds                  Male                     Year                    
## [19] Month                    Day                      Hour                    
## [22] Minutes                  Seconds                 
## <0 rows> (or 0-length row.names)

##Our data is consistent due to no duplicate values present #——- ### Anomaly Detection #### # Checking for anomalies in our numerical variables i.e daily_time_spent_on_site, area income, age, and daily_internet usage

# Assuming 'ad_df' is your dataframe

# Plotting boxplots for all the numerical variables
par(mfrow=c(2,2))

##Observation:

  1. Area income variable has values ranging from below 0 to 80,000. We have a few values below 20,000 which are outliers but we’ll keep them because they represent crucial data for analysis
  2. Daily time spent on site has values from around 20 to 90 with the mode between 50 to 80
  3. Age variable has observations from the age of 20 to 60 with the mode between 30 to 40
  4. Daily internet usage has values from 100 to slightly above 250 with the mode between 150 to 200
# Checking the statistical summary of the data
summary(ad_df)
##  daily_time_spent_on_site      age         area_income    daily_internet_usage
##  Min.   :32.60            Min.   :19.00   Min.   :13996   Min.   :104.8       
##  1st Qu.:51.36            1st Qu.:29.00   1st Qu.:47032   1st Qu.:138.8       
##  Median :68.22            Median :35.00   Median :57012   Median :183.1       
##  Mean   :65.00            Mean   :36.01   Mean   :55000   Mean   :180.0       
##  3rd Qu.:78.55            3rd Qu.:42.00   3rd Qu.:65471   3rd Qu.:218.8       
##  Max.   :91.43            Max.   :61.00   Max.   :79485   Max.   :270.0       
##                                                                               
##  ad_topic_line          city                male         country         
##  Length:1000        Length:1000        Min.   :0.000   Length:1000       
##  Class :character   Class :character   1st Qu.:0.000   Class :character  
##  Mode  :character   Mode  :character   Median :0.000   Mode  :character  
##                                        Mean   :0.481                     
##                                        3rd Qu.:1.000                     
##                                        Max.   :1.000                     
##                                                                          
##    timestamp                      clicked_on_ad     year          
##  Min.   :2016-01-01 02:52:10.00   Min.   :0.0   Length:1000       
##  1st Qu.:2016-02-18 02:55:42.00   1st Qu.:0.0   Class :character  
##  Median :2016-04-07 17:27:29.50   Median :0.5   Mode  :character  
##  Mean   :2016-04-10 10:56:04.24   Mean   :0.5                     
##  3rd Qu.:2016-05-31 03:18:14.00   3rd Qu.:1.0                     
##  Max.   :2016-07-24 00:22:16.00   Max.   :1.0                     
##                                                                   
##     month               day                hour             minutes         
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    seconds          Male      Year      Month         Day           Hour    
##  Length:1000        0:519   2016:1000   01:147   03     : 46   07     : 54  
##  Class :character   1:481               02:160   17     : 42   20     : 50  
##  Mode  :character                       03:156   15     : 41   09     : 49  
##                                         04:147   10     : 37   21     : 48  
##                                         05:147   04     : 36   00     : 45  
##                                         06:142   26     : 36   05     : 44  
##                                         07:101   (Other):762   (Other):710  
##     Minutes       Seconds   
##  02     : 26   22     : 28  
##  07     : 24   10     : 27  
##  13     : 24   35     : 27  
##  10     : 22   37     : 27  
##  21     : 21   38     : 24  
##  33     : 21   15     : 23  
##  (Other):862   (Other):844

##Central Tendancy - Mode, Mean and Median

# First, a function for mode will be created since R does not have a built in function.
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
# City
# This column represents the city where the most users are from
mode.city <- getmode(ad_df$city)
mode.city
## [1] "Lisamouth"
## [1] "Lisamouth"
# Country
# This column represents the country where the most users are from
mode.country <- getmode(ad_df$country)
mode.country
## [1] "Czech Republic"
# Age
# This column represents the Age That most users are, its mean and median
mode.age <- getmode(ad_df$age)
mode.age
## [1] 31
mean(ad_df$age)
## [1] 36.009
median(ad_df$age)
## [1] 35
# Daily Internet Usage
# This column represents the daily internet usage for most users, its mean and median
mode.usage <- getmode(ad_df$daily_internet_usage)
mode.usage
## [1] 167.22
mean(ad_df$daily_internet_usage)
## [1] 180.0001
median(ad_df$daily_internet_usage)
## [1] 183.13
# Area Income
# This column represents most of the Area Income
mode.income <- getmode(ad_df$area_income)
mode.income
## [1] 61833.9
mean(ad_df$area_income)
## [1] 55000
median(ad_df$area_income)
## [1] 57012.3
# Male
# This column represents gender with the most users
mode.male <- getmode(ad_df$male)
mode.male
## [1] 0
# Ad_Topic_line
# This column represents most advertisement topic line
mode.adline<-getmode(ad_df$ad_topic_line)
mode.adline
## [1] "Cloned 5thgeneration orchestration"
# Daily_Time_Spent
# This column represents most frequent daily time spent on site
mode.time <- getmode(ad_df$daily_time_spent_on_site)
mode.time
## [1] 62.26
mean(ad_df$daily_time_spent_on_site)
## [1] 65.0002
median(ad_df$daily_time_spent_on_site)
## [1] 68.215
# Month
# This column represents most frequent months during usage
mode.month <- getmode(ad_df$month)
mode.month
## [1] "02"
# Day
# This column represents most frequent day during usage
mode.day <- getmode(ad_df$day)
mode.day
## [1] "03"
# Hour
# This column represents most frequent hour during usage
mode.hour <- getmode(ad_df$hour)
mode.hour
## [1] "07"
# Minute
# This column represents most frequent Minutes during usage
mode.minutes <- getmode(ad_df$minutes)
mode.minutes
## [1] "02"
# Seconds
# This column represents most frequent months during usage
mode.seconds <- getmode(ad_df$seconds)
mode.seconds
## [1] "22"
# Age
sd.age <- sd(ad_df$age)
sd.age
## [1] 8.785562
var.age <- var(ad_df$age)
var.age
## [1] 77.18611
range.age <- range(ad_df$age)
range.age
## [1] 19 61
skew.age <- skewness(ad_df$age)
skew.age
## [1] 0.4777052
kurt.age <- kurtosis(ad_df$age)
kurt.age
## [1] -0.4097066
# Daily Internet Usage
sd.daily_internet_usage <- sd(ad_df$daily_internet_usage)
sd.daily_internet_usage
## [1] 43.90234
var.daily_internet_usage <- var(ad_df$daily_internet_usage)
var.daily_internet_usage
## [1] 1927.415
range.daily_internet_usage <- range(ad_df$daily_internet_usage)
range.daily_internet_usage
## [1] 104.78 269.96
skew.daily_internet_usage <- skewness(ad_df$daily_internet_usage)
skew.daily_internet_usage
## [1] -0.03343681
kurt.daily_internet_usage <- kurtosis(ad_df$daily_internet_usage)
kurt.daily_internet_usage
## [1] -1.275752
# Daily time spent on site
sd.daily_time_spent_on_site <- sd(ad_df$daily_time_spent_on_site)
sd.daily_time_spent_on_site
## [1] 15.85361
var.daily_time_spent_on_site <- var(ad_df$daily_time_spent_on_site)
var.daily_time_spent_on_site
## [1] 251.3371
range.daily_time_spent_on_site <- range(ad_df$daily_time_spent_on_site)
range.daily_time_spent_on_site
## [1] 32.60 91.43
skew.daily_time_spent_on_site <- skewness(ad_df$daily_time_spent_on_site)
skew.daily_time_spent_on_site
## [1] -0.370646
kurt.daily_time_spent_on_site <- kurtosis(ad_df$daily_time_spent_on_site)
kurt.daily_time_spent_on_site
## [1] -1.099864
# Area Income
sd.area_income <- sd(ad_df$area_income)
sd.area_income
## [1] 13414.63
var.area_income <- var(ad_df$area_income)
var.area_income
## [1] 179952406
range.area_income <- range(ad_df$area_income)
range.area_income
## [1] 13996.5 79484.8
skew.area_income <- skewness(ad_df$area_income)
skew.area_income
## [1] -0.6484229
kurt.area_income <- kurtosis(ad_df$area_income)
kurt.area_income
## [1] -0.1110924
# Correlation Matrix
# Calling all the numerical data present
age<- ad_df$age
income<-ad_df$area_income
time<-ad_df$daily_time_spent_on_site
usage<-ad_df$daily_internet_usage

# Creating a new dataframe num with numerical data variables
num_data <- data.frame(age, income, time, usage)
head(num_data)
##   age   income  time  usage
## 1  35 61833.90 68.95 256.09
## 2  31 68441.85 80.23 193.77
## 3  26 59785.94 69.47 236.50
## 4  29 54806.18 74.15 245.89
## 5  35 73889.99 68.37 225.58
## 6  23 59761.56 59.99 226.74

Correlation:

# Correlation is a statistical technique that can show whether and how strongly pairs of variables are related.
# Calculating the correlation matrix
corr <- cor(num_data)
head(corr)
##               age     income       time      usage
## age     1.0000000 -0.1826050 -0.3315133 -0.3672086
## income -0.1826050  1.0000000  0.3109544  0.3374955
## time   -0.3315133  0.3109544  1.0000000  0.5186585
## usage  -0.3672086  0.3374955  0.5186585  1.0000000
# Plotting the correlation matrix
ggcorrplot(corr,hc.order = TRUE)

Observations:

  1. Daily_internet_usage and Daily_time_spent_on_site seem to have a moderate positive correlation
  2. Daily_internet_usage and Age seem to have a negative correlation
  3. Area Income and Age are weakly correlated.
# Finding out and previewing the Number of clicked and no clicked ads
ggplot(ad_df, aes(clicked_on_ad)) + geom_bar(fill = "red")

# Finding out and previewing the month with the most clicked ads
ggplot(ad_df, aes(x = clicked_on_ad, y = month)) + geom_col(aes(fill = clicked_on_ad))

# Finding out and previewing the day with the most clicked ads
ggplot(data = ad_df) +
geom_bar(mapping = aes(y = day, fill = clicked_on_ad), position = "dodge")
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

##Observation:

February and May had the most clicked ads while July had the least. March and April had an equal number of clicked ads.

The most activity recorded is in the first 3 months, from both who clicked the ads and those who didn’t. January (1), March (3) and July (7) had more activity from those who did not click on the ads as compared to those who clicked on the ads. Months February (2), April (4) and May (5) had more people who clicked on the ads as compared to those who did not click on the ads June (6) had an equal number of people who clicked on the ads and those who did Not

We observe that at around mid month we had more people who were not clicking on the ads as compared to the beginning and the end of the month

# Finding out and previewing the hours with the most clicked ads
ggplot(data = ad_df) +
geom_bar(mapping = aes(y = hour, fill = clicked_on_ad), position = "dodge")
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

##Observation:

From around 8 pm to 11 pm, we have more people not clicking on ads as compared to those who clicked on the ads before 8 pm and a little after Midnight. 3, 6, 9 and 11 am are the morning hours with the most clicked ads while 3,5 and 6 pm are the hours with the most clicks on the ads in the evening.

# Finding out and previewing the gender with the most clicked ads
ggplot(data = ad_df) +
geom_bar(mapping = aes(y = male, fill = clicked_on_ad), position = "dodge")
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Observations -

We have more number of females who clicked on the ads as compared to those who did not. Most males did not click on the ads.

# Area Income vs Number of ad clicks
# Finding out and previewing boxplots to show how the area income relates with the number of clicks
ggplot(data = ad_df, mapping = aes( x = area_income, y = clicked_on_ad, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Most people who clicked on the ads have a lower income as compared to those who did Not click on the ads

# Age vs Number of ad clicks
# Finding out and previewing boxplots to show how the age relates with the number of clicks
ggplot(data = ad_df, mapping = aes( x = age, y = clicked_on_ad, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Most people who clicked on the ads were older than those who did NOt click on the ads

# Daily Time spent on site vs Number of ad clicks
# Finding out and previewing boxplots to show how the daily time spent on site relates with the number of clicks
ggplot(data = ad_df, mapping = aes( x = daily_time_spent_on_site, y = clicked_on_ad, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Most people who clicked on the ads spent way less time on the site as compared to thos who did not click on the ads

# Daily internet usage vs Number of ad clicks
# Finding out and previewing boxplots to show how the daily internet usage relates with the number of clicks
ggplot(data = ad_df, mapping = aes( x = daily_internet_usage, y = clicked_on_ad, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

The daily internet usage of most people who clicked on the ads is way less than those who did NOt click on the ads

# Age vs Gender
# Finding out and previewing boxplots to show how the Age relates with the gender
ggplot(data = ad_df, mapping = aes( x = male, y = age, fill = clicked_on_ad)) +
geom_boxplot()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

Conclusion:

The entepreneur should target people with lower area income levels, older and those who spend less time on the site.

Generally, those who clicked on the ads were older, but the males were slightly older than the females

# Daily time spent on site vs Gender
# Finding out and previewing boxplots to show how the Age relates with the gender
ggplot(data = ad_df, mapping = aes( x = male , y = daily_time_spent_on_site, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

More of those who click on the ads spend less time on the site. Of those who click on the ads, the females generally spend more time on the site as compared to the males

# Daily internet usage vs Gender
# Finding out and previewing boxplots to show how the Age relates with the gender
ggplot(data = ad_df, mapping = aes( x = male , y = daily_internet_usage, fill = clicked_on_ad)) +
geom_boxplot() +
coord_flip()
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?
## Warning: The following aesthetics were dropped during statistical transformation: fill
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

In general, those who click on the ads have a lower daily internet usage, with a few observations as outlier values with the males were slightly more than the females

# Age vs Area Income
# Finding out and previewing scatterplots showing how the Age relates with the Area Income
ggplot(data = ad_df) +
geom_point(mapping = aes(x = age , y = area_income, color = clicked_on_ad))

We observe that the number of people who clicked on the ads are more evenly distributed while most of the people who did not click on the ads have a higher area income and a bit younger

# Daily Internet usage vs Area Income
# Finding out and previewing scatterplots showing how the Daily internet usage relates with the Area Income
ggplot(data = ad_df) +
geom_point(mapping = aes(x = daily_internet_usage , y = area_income, color = clicked_on_ad))

A great number of clicks comes from people who’s daily internet usage is quite low and area income is also lower as compared to those who do Not click on the ads whose daily internet usage is significantly higher

# Age vs Daily time spent on site
# Finding out and previewing scatterplots showing how the Daily time spent on sites relates with the Age
ggplot(data = ad_df) +
geom_point(mapping = aes(x = daily_time_spent_on_site , y = age, color = clicked_on_ad))

A huge chuck of clicks come from people who spend significantly little time on the site as compared to those who spend more time on the site regardless of age

# Daily Internet Usage vs Daily time spent on site
# Finding out and previewing scatterplots showing how the Daily time spent on sites relates with the Daily internet usage
ggplot(data = ad_df) + 
  geom_point(mapping = aes(x = daily_time_spent_on_site, y = daily_internet_usage, color = clicked_on_ad))

Clearly, more clicks come from people who spend less time on the site and people whose daily internet usage is significantly lower as compared to those who spend more time on the site and have a high daily internet usage The ads are getting more clicks from people who spend less time on the site and those whose daily internet usage is low.

Implimenting the Solution:

# Getting a glimpse of our columns and datatypes
glimpse(ad_df)
## Rows: 1,000
## Columns: 23
## $ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, 88.…
## $ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49, 3…
## $ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73889…
## $ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 226.7…
## $ ad_topic_line            <chr> "Cloned 5thgeneration orchestration", "Monito…
## $ city                     <chr> "Wrightburgh", "West Jodi", "Davidton", "West…
## $ male                     <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, …
## $ country                  <chr> "Tunisia", "Nauru", "San Marino", "Italy", "I…
## $ timestamp                <dttm> 2016-03-27 00:53:11, 2016-04-04 01:39:02, 20…
## $ clicked_on_ad            <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ year                     <chr> "2016", "2016", "2016", "2016", "2016", "2016…
## $ month                    <chr> "03", "04", "03", "01", "06", "05", "01", "03…
## $ day                      <chr> "27", "04", "13", "10", "03", "19", "28", "07…
## $ hour                     <chr> "00", "01", "20", "02", "03", "14", "20", "01…
## $ minutes                  <chr> "53", "39", "35", "31", "36", "30", "59", "40…
## $ seconds                  <chr> "11", "02", "42", "19", "18", "17", "32", "15…
## $ Male                     <fct> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, …
## $ Year                     <fct> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 201…
## $ Month                    <fct> 03, 04, 03, 01, 06, 05, 01, 03, 04, 07, 03, 0…
## $ Day                      <fct> 27, 04, 13, 10, 03, 19, 28, 07, 18, 11, 16, 0…
## $ Hour                     <fct> 00, 01, 20, 02, 03, 14, 20, 01, 09, 01, 20, 0…
## $ Minutes                  <fct> 53, 39, 35, 31, 36, 30, 59, 40, 33, 42, 19, 1…
## $ Seconds                  <fct> 11, 02, 42, 19, 18, 17, 32, 15, 42, 51, 01, 1…
# Conversion of City and Country to Continent
ad_df$continent <- countrycode(sourcevar = ad_df[, "country"],
origin = "country.name",
destination = "continent")
## Warning: Some values were not matched unambiguously: Antarctica (the territory South of 60 deg S), Bouvet Island (Bouvetoya), British Indian Ocean Territory (Chagos Archipelago), French Southern Territories, Heard Island and McDonald Islands, Micronesia, Saint Martin, South Georgia and the South Sandwich Islands, United States Minor Outlying Islands

The cities and countries have high cardinality hence we will convert the countries to continents using the countrycode package and use the continents to perform the modelling

## Warning in countrycode(sourcevar = ad_df[, "country"], origin = "country.name", : Some values were not matched unambiguously: Antarctica (the territory South of 60 deg S), Bouvet Island (Bouvetoya), British Indian Ocean Territory (Chagos Archipelago), French Southern Territories, Heard Island and McDonald Islands, Micronesia, Saint Martin, South Georgia and the South Sandwich Islands, United States Minor Outlying Islands

# Finding out and Previewing if our columns have changed

head(ad_df)
##   daily_time_spent_on_site age area_income daily_internet_usage
## 1                    68.95  35    61833.90               256.09
## 2                    80.23  31    68441.85               193.77
## 3                    69.47  26    59785.94               236.50
## 4                    74.15  29    54806.18               245.89
## 5                    68.37  35    73889.99               225.58
## 6                    59.99  23    59761.56               226.74
##                           ad_topic_line           city male    country
## 1    Cloned 5thgeneration orchestration    Wrightburgh    0    Tunisia
## 2    Monitored national standardization      West Jodi    1      Nauru
## 3      Organic bottom-line service-desk       Davidton    0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt    1      Italy
## 5         Robust logistical utilization   South Manuel    0    Iceland
## 6       Sharable client-driven software      Jamieberg    1     Norway
##             timestamp clicked_on_ad year month day hour minutes seconds Male
## 1 2016-03-27 00:53:11             0 2016    03  27   00      53      11    0
## 2 2016-04-04 01:39:02             0 2016    04  04   01      39      02    1
## 3 2016-03-13 20:35:42             0 2016    03  13   20      35      42    0
## 4 2016-01-10 02:31:19             0 2016    01  10   02      31      19    1
## 5 2016-06-03 03:36:18             0 2016    06  03   03      36      18    0
## 6 2016-05-19 14:30:17             0 2016    05  19   14      30      17    1
##   Year Month Day Hour Minutes Seconds continent
## 1 2016    03  27   00      53      11    Africa
## 2 2016    04  04   01      39      02   Oceania
## 3 2016    03  13   20      35      42    Europe
## 4 2016    01  10   02      31      19    Europe
## 5 2016    06  03   03      36      18    Europe
## 6 2016    05  19   14      30      17    Europe
# Encompasing the continents into factors
ad_df$continent <- factor(ad_df$continent, order = TRUE, levels = c('Africa', 'Americas', 'Asia', 'Europe', 'Oceania'))

                                                                    
# Exploring the Continent Column
table(ad_df$continent)                                                            
## 
##   Africa Americas     Asia   Europe  Oceania 
##      214      219      218      214      100
# Pre-processing:
# Converting variables to appropriate data types for modeling
# Converting factor variables to integers
ad_df$male = as.numeric(ad_df$male)
ad_df$month = as.numeric(ad_df$month)
ad_df$day = as.numeric(ad_df$day)
ad_df$hour = as.numeric(ad_df$hour)
ad_df$minutes = as.numeric(ad_df$minutes)
ad_df$seconds = as.numeric(ad_df$seconds)

# Converting the clicked on ad variable as a factor
ad_df$clicked_on_ad = as.factor(ad_df$clicked_on_ad)
# Previewing the dataset to see if changes were effected
glimpse(ad_df)
## Rows: 1,000
## Columns: 24
## $ daily_time_spent_on_site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, 88.…
## $ age                      <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49, 3…
## $ area_income              <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73889…
## $ daily_internet_usage     <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 226.7…
## $ ad_topic_line            <chr> "Cloned 5thgeneration orchestration", "Monito…
## $ city                     <chr> "Wrightburgh", "West Jodi", "Davidton", "West…
## $ male                     <dbl> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, …
## $ country                  <chr> "Tunisia", "Nauru", "San Marino", "Italy", "I…
## $ timestamp                <dttm> 2016-03-27 00:53:11, 2016-04-04 01:39:02, 20…
## $ clicked_on_ad            <fct> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, …
## $ year                     <chr> "2016", "2016", "2016", "2016", "2016", "2016…
## $ month                    <dbl> 3, 4, 3, 1, 6, 5, 1, 3, 4, 7, 3, 5, 6, 4, 3, …
## $ day                      <dbl> 27, 4, 13, 10, 3, 19, 28, 7, 18, 11, 16, 8, 3…
## $ hour                     <dbl> 0, 1, 20, 2, 3, 14, 20, 1, 9, 1, 20, 8, 1, 21…
## $ minutes                  <dbl> 53, 39, 35, 31, 36, 30, 59, 40, 33, 42, 19, 1…
## $ seconds                  <dbl> 11, 2, 42, 19, 18, 17, 32, 15, 42, 51, 1, 10,…
## $ Male                     <fct> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, …
## $ Year                     <fct> 2016, 2016, 2016, 2016, 2016, 2016, 2016, 201…
## $ Month                    <fct> 03, 04, 03, 01, 06, 05, 01, 03, 04, 07, 03, 0…
## $ Day                      <fct> 27, 04, 13, 10, 03, 19, 28, 07, 18, 11, 16, 0…
## $ Hour                     <fct> 00, 01, 20, 02, 03, 14, 20, 01, 09, 01, 20, 0…
## $ Minutes                  <fct> 53, 39, 35, 31, 36, 30, 59, 40, 33, 42, 19, 1…
## $ Seconds                  <fct> 11, 02, 42, 19, 18, 17, 32, 15, 42, 51, 01, 1…
## $ continent                <ord> Africa, Oceania, Europe, Europe, Europe, Euro…
# Encoding the continent character variable
ad_df$continent <- factor(ad_df$continent, order = TRUE, levels = c('Africa', 'Americas', 'Asia', 'Europe', 'Oceania'))

Some countries could not be recognized by the function countrycode hence were not assigned a continent, this gave rise to some null values, let’s look into this

# Checking for null values brought about by unassigned countries and drop them
colSums(is.na(ad_df))
## daily_time_spent_on_site                      age              area_income 
##                        0                        0                        0 
##     daily_internet_usage            ad_topic_line                     city 
##                        0                        0                        0 
##                     male                  country                timestamp 
##                        0                        0                        0 
##            clicked_on_ad                     year                    month 
##                        0                        0                        0 
##                      day                     hour                  minutes 
##                        0                        0                        0 
##                  seconds                     Male                     Year 
##                        0                        0                        0 
##                    Month                      Day                     Hour 
##                        0                        0                        0 
##                  Minutes                  Seconds                continent 
##                        0                        0                       35
ad_df <- na.omit(ad_df)
colSums(is.na(ad_df))
## daily_time_spent_on_site                      age              area_income 
##                        0                        0                        0 
##     daily_internet_usage            ad_topic_line                     city 
##                        0                        0                        0 
##                     male                  country                timestamp 
##                        0                        0                        0 
##            clicked_on_ad                     year                    month 
##                        0                        0                        0 
##                      day                     hour                  minutes 
##                        0                        0                        0 
##                  seconds                     Male                     Year 
##                        0                        0                        0 
##                    Month                      Day                     Hour 
##                        0                        0                        0 
##                  Minutes                  Seconds                continent 
##                        0                        0                        0
# We will delete the timestamp, year and continent columns as they are irrelevant for modeling as well as omitting the ad_line_topic, country, and city columns as they have high cardinality
# We will preview the results to see if the changes have been effected
ad_df$timestamp <- NULL
ad_df$year <- NULL
ad_df$ad_topic_line <- NULL
ad_df$city <- NULL
ad_df$country <- NULL
ad_df$continent <- NULL
ad_df$Male <- NULL
ad_df$Day <- NULL
ad_df$Minutes <- NULL
ad_df$Seconds <- NULL
ad_df$Hour <- NULL
ad_df$Month <- NULL
ad_df$Year <- NULL
colnames(ad_df)
##  [1] "daily_time_spent_on_site" "age"                     
##  [3] "area_income"              "daily_internet_usage"    
##  [5] "male"                     "clicked_on_ad"           
##  [7] "month"                    "day"                     
##  [9] "hour"                     "minutes"                 
## [11] "seconds"
describe(ad_df)
##                          vars   n     mean       sd   median  trimmed      mad
## daily_time_spent_on_site    1 965    65.15    15.76    68.25    65.91    17.76
## age                         2 965    36.04     8.83    35.00    35.54     8.90
## area_income                 3 965 54972.55 13433.69 56986.73 55990.94 13370.69
## daily_internet_usage        4 965   179.86    43.96   182.65   179.85    58.73
## male                        5 965     0.48     0.50     0.00     0.47     0.00
## clicked_on_ad*              6 965     1.50     0.50     2.00     1.50     0.00
## month                       7 965     3.81     1.92     4.00     3.76     2.97
## day                         8 965    15.54     8.76    15.00    15.48    11.86
## hour                        9 965    11.68     6.97    12.00    11.71     8.90
## minutes                    10 965    29.13    17.22    30.00    29.09    22.24
## seconds                    11 965    29.71    16.88    30.00    29.80    20.76
##                               min      max    range  skew kurtosis     se
## daily_time_spent_on_site    32.60    91.43    58.83 -0.38    -1.07   0.51
## age                         19.00    61.00    42.00  0.47    -0.43   0.28
## area_income              13996.50 79484.80 65488.30 -0.64    -0.13 432.45
## daily_internet_usage       104.78   269.96   165.18 -0.03    -1.27   1.41
## male                         0.00     1.00     1.00  0.09    -1.99   0.02
## clicked_on_ad*               1.00     2.00     1.00  0.00    -2.00   0.02
## month                        1.00     7.00     6.00  0.09    -1.18   0.06
## day                          1.00    31.00    30.00  0.04    -1.18   0.28
## hour                         0.00    23.00    23.00 -0.01    -1.23   0.22
## minutes                      0.00    59.00    59.00  0.02    -1.18   0.55
## seconds                      0.00    59.00    59.00 -0.03    -1.15   0.54
# Normalizing the continous variables
normalize <- function(x) (
return( ((x - min(x)) /(max(x)-min(x))) )
)
ad_df$daily_time_spent_on_site <- normalize(ad_df$daily_time_spent_on_site)
ad_df$daily_internet_usage <- normalize(ad_df$daily_internet_usage)
ad_df$area_income <- normalize(ad_df$area_income)
ad_df$age<- normalize(ad_df$age)
head(ad_df)
##   daily_time_spent_on_site       age area_income daily_internet_usage male
## 1                0.6178820 0.3809524   0.7304725            0.9160310    0
## 2                0.8096209 0.2857143   0.8313752            0.5387456    1
## 3                0.6267211 0.1666667   0.6992003            0.7974331    0
## 4                0.7062723 0.2380952   0.6231599            0.8542802    1
## 5                0.6080231 0.3809524   0.9145678            0.7313234    0
## 6                0.4655788 0.0952381   0.6988280            0.7383460    1
##   clicked_on_ad month day hour minutes seconds
## 1             0     3  27    0      53      11
## 2             0     4   4    1      39       2
## 3             0     3  13   20      35      42
## 4             0     1  10    2      31      19
## 5             0     6   3    3      36      18
## 6             0     5  19   14      30      17

We observe that all the variables have varying range values both continous and factor variables. Hence we will normalize the continous variables

K Nearest Neighbours

set.seed(123)
# Creating a random number equal 70% of total number of rows
ran <- sample(1:nrow(ad_df),0.7 * nrow(ad_df))
# The training dataset extracted
ad_train <- ad_df[ran,]
#head(ad_train)
# The test dataset extracted
ad_test <- ad_df[-ran,]
#ad_test
# Extracting the target variable from the target variable
ad_target <- (ad_df[ran,6])
#ad_target
# Extracting the target variable from the test dataset
test_target <- (ad_df[-ran,6])
#test_target
# Calculating the square root of the length of the target variable to get an optimal k
print(sqrt(length(ad_test)))
## [1] 3.316625
# Running the knn function, with k = 3 as from the above calculation
library(class)
k <- knn(ad_train,ad_test,cl=ad_target,k=3)

# Creating the confucion matrix
matrix <- table(k,test_target)
print(matrix)
##    test_target
## k    0  1
##   0 77 56
##   1 74 83
# Checking the accuracy
#Calculating the accuracy score of our model
accuracy <- function(x){sum(diag(x)/(sum(rowSums(x)))) * 100}
accuracy(matrix)
## [1] 55.17241

Our baseline model has an accuracy score of 53%, which is poor hence we will use random forests, svm and naive bayes to try and achieve a better accuracy

# Building Naive Bayes model on our data
# setting the metod as cross validation with 10 iterations
model <- train(ad_train, ad_target, method = 'nb', trControl = trainControl(method = 'cv', number = 10))
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 57
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 39
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 48
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 56
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 57
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 69
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 53
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 4
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 29
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 40
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 44
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 50
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 51
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 53
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 57
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 66
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 36
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 59
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 36
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 43
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 47
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 59
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 64
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 65
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 29
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 17
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 29
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 31
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 47
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 48
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 50
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 65
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 66
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 2
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 4
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 12
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 29
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 31
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 36
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 44
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 47
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 61
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 64
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 51
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 60
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 61
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 23
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 28
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 51
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 55
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 57
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 60
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 61
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 62
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 63
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 43
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 61
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 6
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 12
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 21
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 24
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 43
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 44
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 47
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 49
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 50
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 52
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 55
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 59
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 60
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 61
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 65
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 68
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 28
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 38
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 41
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 46
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 48
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 57
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 63
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 64
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 68
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 23
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 56
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 60
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 64
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 1
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 3
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 4
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 5
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 9
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 12
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 15
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 17
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 18
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 20
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 22
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 23
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 25
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 27
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 30
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 32
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 34
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 37
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 42
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 44
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 45
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 49
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 54
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 56
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 60
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 64
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 65
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 66
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 67
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 28
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 56
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 7
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 8
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 10
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 11
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 13
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 14
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 17
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 19
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 26
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 28
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 29
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 31
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 35
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 36
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 39
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 46
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 49
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 50
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 55
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 56
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 58

SVM Model

#setting the method to repeated cv and 10 number of iterations
trctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
#fitting svm linear
svm_Linear <- train(clicked_on_ad~., data = ad_train, method = "svmLinear",
trControl= trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
#result of our train model
svm_Linear
## Support Vector Machines with Linear Kernel 
## 
## 675 samples
##  10 predictor
##   2 classes: '0', '1' 
## 
## Pre-processing: centered (10), scaled (10) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 607, 608, 607, 608, 607, 608, ... 
## Resampling results:
## 
##   Accuracy  Kappa    
##   0.968385  0.9367738
## 
## Tuning parameter 'C' was held constant at a value of 1

The svm model has an accuracy of approximately 98% on the data, with a total of 5 incorrect classifications. This is so far the best model for the data as it has a reasonable accuracy score.

# Hyperparameter Tuning for SVM
# Control params for SVM
ctrl <- trainControl(
method = "cv",
number = 10,
)
# Tune an SVM
set.seed(7000)
svm <- train(
clicked_on_ad ~ .,
data = ad_train,
method = "svmRadial",
preProcess = c("center", "scale"),
trControl = ctrl,
tuneLength = 10
)
# Looking at the SVM predictions
svm_pred <- predict(svm, newdata = ad_test)

# Plotting confusion matrix
confusionMatrix(table(svm_pred, ad_test$clicked_on_ad))
## Confusion Matrix and Statistics
## 
##         
## svm_pred   0   1
##        0 150   8
##        1   1 131
##                                           
##                Accuracy : 0.969           
##                  95% CI : (0.9419, 0.9857)
##     No Information Rate : 0.5207          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9377          
##                                           
##  Mcnemar's Test P-Value : 0.0455          
##                                           
##             Sensitivity : 0.9934          
##             Specificity : 0.9424          
##          Pos Pred Value : 0.9494          
##          Neg Pred Value : 0.9924          
##              Prevalence : 0.5207          
##          Detection Rate : 0.5172          
##    Detection Prevalence : 0.5448          
##       Balanced Accuracy : 0.9679          
##                                           
##        'Positive' Class : 0               
## 

The svmRadial has actually performed worse than the svmLinear hence the svmLinear is the better svm model for this dataset

Random Forest:

#fitting a single tree to the data
set.seed(12)
rforests <- train(clicked_on_ad ~ .,
data = ad_df,
method = "ranger")
rforests
## Random Forest 
## 
## 965 samples
##  10 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 965, 965, 965, 965, 965, 965, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    2    gini        0.9618081  0.9235101
##    2    extratrees  0.9632760  0.9264573
##    6    gini        0.9634395  0.9267670
##    6    extratrees  0.9643680  0.9286341
##   10    gini        0.9565851  0.9130513
##   10    extratrees  0.9632889  0.9264809
## 
## Tuning parameter 'min.node.size' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 6, splitrule = extratrees
##  and min.node.size = 1.
# plotting the model
plot(rforests)

The Gini index reduces steadily from 2 to 8 and then gradually from 8 to 14, with an aim to reduce gini impurity. This means that the splitting rule is selecting the best gin but towards the end, it is overfitting

# Improving model performance
# Training the model
rforests_model <- train(clicked_on_ad ~ .,
data = ad_train,
method = "ranger",
tuneLength = 5)
# Setting grid search
set.seed(42)
myGrid <- expand.grid(mtry = c(3,5,10),
splitrule = c("gini", "extratrees"),
min.node.size = 10)
rforests_model <- train(clicked_on_ad ~ .,
data = ad_train,
method = "ranger",
tuneGrid = myGrid,
trControl = trainControl(method = "cv",
number = 5,
verboseIter = FALSE))
# Printing the model
rforests_model
## Random Forest 
## 
## 675 samples
##  10 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 540, 541, 540, 540, 539 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   Accuracy   Kappa    
##    3    gini        0.9585504  0.9170817
##    3    extratrees  0.9615355  0.9230916
##    5    gini        0.9570580  0.9141088
##    5    extratrees  0.9615355  0.9230921
##   10    gini        0.9511431  0.9022737
##   10    extratrees  0.9570800  0.9141872
## 
## Tuning parameter 'min.node.size' was held constant at a value of 10
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were mtry = 3, splitrule = extratrees
##  and min.node.size = 10.
plot(rforests_model)

The Gini is a straight line with a descending slope while the splitting rule increases steadily at first

then stagnates after sometime after selecting the split that minimizes the gini impurity

# Evaluating model performance
# Predictions
rforests_pred <- predict(rforests_model, newdata = ad_test)
#confusion matrix
confusionMatrix(table(rforests_pred, ad_test$clicked_on_ad))
## Confusion Matrix and Statistics
## 
##              
## rforests_pred   0   1
##             0 150   5
##             1   1 134
##                                           
##                Accuracy : 0.9793          
##                  95% CI : (0.9555, 0.9924)
##     No Information Rate : 0.5207          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9585          
##                                           
##  Mcnemar's Test P-Value : 0.2207          
##                                           
##             Sensitivity : 0.9934          
##             Specificity : 0.9640          
##          Pos Pred Value : 0.9677          
##          Neg Pred Value : 0.9926          
##              Prevalence : 0.5207          
##          Detection Rate : 0.5172          
##    Detection Prevalence : 0.5345          
##       Balanced Accuracy : 0.9787          
##                                           
##        'Positive' Class : 0               
## 
# Feature Importance
# Re-run model with permutation-based variable importance
rforests_permutation <- ranger(
formula = clicked_on_ad ~ .,
data = ad_train,
#num.trees = 2000,
mtry = 3,
min.node.size = 10,
#sample.fraction = .80,
replace = FALSE,
importance = "permutation",
respect.unordered.factors = "order",
verbose = FALSE,
seed = 123
)
#plotting a graph of the feature importance
vip::vip(rforests_permutation, num_features = 10)

The Random Forests has obtained an accuracy of approximately 98% with 6 incorrect observations. This is the second best model we have so far.

Daily internet usage Daily time spent on site Age Area income Challenging the solution

From the solutions above, the svm model has performed the best followed by the Decision Trees with a total of 5 and 6 incorrect observations respectively. They were both highly accurate with approximately 98% accuracy hence we can say that this study has been successful.

Thank you!!!!