By Victoria Maina
## 1.Defining the Question
### a) Specifying the Question
To Find and deal with outliers, anomalies, and missing data within the dataset. Perform univariate and bivariate analysis using R
To identify which individuals are most likely to click on her ads.
### b) Defining the Metric for Success
This project will be successful when:
### c) Understanding the context
### d) Recording the Experimental Design The following steps were taken:
Business Understanding
Reading the data
Checking our data
Data cleaning
Performing EDA(univariate,bivariate and multivariate analysis)
Conclusion
### e) Data Relevance
# Importing libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ggplot2)
library(ggcorrplot)
library(dplyr)
library(moments)
library(tinytex)
library(earth)
## Loading required package: Formula
## Loading required package: plotmo
## Loading required package: plotrix
## Loading required package: TeachingDemos
library(Formula)
library(plotmo)
library(rpart)
library(plotrix)
library(purrr)
library(TeachingDemos)
library(prodlim)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
library(iterators)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
library(gower)
library(numDeriv)
library(SQUAREM)
library(lava)
##
## Attaching package: 'lava'
## The following object is masked from 'package:dplyr':
##
## vars
## The following object is masked from 'package:ggplot2':
##
## vars
library(ipred)
##
## Attaching package: 'ipred'
## The following object is masked from 'package:lava':
##
## cv
library(timeDate)
##
## Attaching package: 'timeDate'
## The following objects are masked from 'package:moments':
##
## kurtosis, skewness
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(ModelMetrics)
##
## Attaching package: 'ModelMetrics'
## The following objects are masked from 'package:caret':
##
## confusionMatrix, precision, recall, sensitivity, specificity
## The following object is masked from 'package:base':
##
## kappa
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt
## The following object is masked from 'package:tidyr':
##
## smiths
library(recipes)
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
library(plyr)
theme_set(theme_classic())
options(warn = -1)
advertising<-read.csv('http://bit.ly/IPAdvertisingData')
df<-advertising
head(df)
tail(df)
str(df)
## 'data.frame': 1000 obs. of 10 variables:
## $ Daily.Time.Spent.on.Site: num 69 80.2 69.5 74.2 68.4 ...
## $ Age : int 35 31 26 29 35 23 33 48 30 20 ...
## $ Area.Income : num 61834 68442 59786 54806 73890 ...
## $ Daily.Internet.Usage : num 256 194 236 246 226 ...
## $ Ad.Topic.Line : chr "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
## $ City : chr "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
## $ Male : int 0 1 0 1 0 1 0 1 1 1 ...
## $ Country : chr "Tunisia" "Nauru" "San Marino" "Italy" ...
## $ Timestamp : chr "2016-03-27 00:53:11" "2016-04-04 01:39:02" "2016-03-13 20:35:42" "2016-01-10 02:31:19" ...
## $ Clicked.on.Ad : int 0 0 0 0 0 0 0 1 0 0 ...
library(dplyr)
glimpse(df)
## Rows: 1,000
## Columns: 10
## $ Daily.Time.Spent.on.Site <dbl> 68.95, 80.23, 69.47, 74.15, 68.37, 59.99, 88.~
## $ Age <int> 35, 31, 26, 29, 35, 23, 33, 48, 30, 20, 49, 3~
## $ Area.Income <dbl> 61833.90, 68441.85, 59785.94, 54806.18, 73889~
## $ Daily.Internet.Usage <dbl> 256.09, 193.77, 236.50, 245.89, 225.58, 226.7~
## $ Ad.Topic.Line <chr> "Cloned 5thgeneration orchestration", "Monito~
## $ City <chr> "Wrightburgh", "West Jodi", "Davidton", "West~
## $ Male <int> 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, ~
## $ Country <chr> "Tunisia", "Nauru", "San Marino", "Italy", "I~
## $ Timestamp <chr> "2016-03-27 00:53:11", "2016-04-04 01:39:02",~
## $ Clicked.on.Ad <int> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, ~
summary(df)
## Daily.Time.Spent.on.Site Age Area.Income Daily.Internet.Usage
## Min. :32.60 Min. :19.00 Min. :13996 Min. :104.8
## 1st Qu.:51.36 1st Qu.:29.00 1st Qu.:47032 1st Qu.:138.8
## Median :68.22 Median :35.00 Median :57012 Median :183.1
## Mean :65.00 Mean :36.01 Mean :55000 Mean :180.0
## 3rd Qu.:78.55 3rd Qu.:42.00 3rd Qu.:65471 3rd Qu.:218.8
## Max. :91.43 Max. :61.00 Max. :79485 Max. :270.0
## Ad.Topic.Line City Male Country
## Length:1000 Length:1000 Min. :0.000 Length:1000
## Class :character Class :character 1st Qu.:0.000 Class :character
## Mode :character Mode :character Median :0.000 Mode :character
## Mean :0.481
## 3rd Qu.:1.000
## Max. :1.000
## Timestamp Clicked.on.Ad
## Length:1000 Min. :0.0
## Class :character 1st Qu.:0.0
## Mode :character Median :0.5
## Mean :0.5
## 3rd Qu.:1.0
## Max. :1.0
class(df)
## [1] "data.frame"
# checking for the sum of missing values in each column
colSums(is.na(df))
## Daily.Time.Spent.on.Site Age Area.Income
## 0 0 0
## Daily.Internet.Usage Ad.Topic.Line City
## 0 0 0
## Male Country Timestamp
## 0 0 0
## Clicked.on.Ad
## 0
There are no missing values within our dataset.
# checking for duplicates
duplicated_rows <- colSums(df[duplicated(df),])
duplicated_rows
## Daily.Time.Spent.on.Site Age Area.Income
## 0 0 0
## Daily.Internet.Usage Ad.Topic.Line City
## 0 0 0
## Male Country Timestamp
## 0 0 0
## Clicked.on.Ad
## 0
There no duplicates in the dataset
# Changing the column namesto lower case
names(df) <- tolower(names(df))
names(df)
## [1] "daily.time.spent.on.site" "age"
## [3] "area.income" "daily.internet.usage"
## [5] "ad.topic.line" "city"
## [7] "male" "country"
## [9] "timestamp" "clicked.on.ad"
library(stringr)
colnames(df) = str_replace_all(colnames(df), c(' ' = '_'))
colnames(df)
## [1] "daily.time.spent.on.site" "age"
## [3] "area.income" "daily.internet.usage"
## [5] "ad.topic.line" "city"
## [7] "male" "country"
## [9] "timestamp" "clicked.on.ad"
Checking for duplicates
anyDuplicated(df)
## [1] 0
There are no duplicates in the dataset.
# Using a boxplot to check for observations far away from other data points.
# We will Use all three double type columns: specifying each
daily_time_spent_on_site <- df$ daily_time_spent_on_site
age <- df$age
daily_internet_usage <- df$daily_internet_usage
area_income <- df$area_income
boxplot(daily_time_spent_on_site,age, daily_internet_usage,
main = "Multiple boxplots to check for outliers",
at = c(1,2,3),
names = c("Time", "Age","Iternet_Usage"),
las = 2,
col = c("orange","red","blue"),
border = "brown",
horizontal = TRUE,
notch = TRUE
)
The Daily_Time_Spent_on_Site,Age, Daily_Internet_Usage variables do not seem to have any outliers.
numeric_columns = c("daily_time_spent_on_site", "age", "area_income", "daily_internet_usage", "male", "timestamp")
mean(df$daily.time.spent.on.site)
## [1] 65.0002
mean(df$area.income)
## [1] 55000
mean(df$age)
## [1] 36.009
mean(df$male)
## [1] 0.481
mean(df$daily.internet.usage)
## [1] 180.0001
The mean of daily time spent on site is 65.0002
the mean of age is 36.009
the mean of area income is 55000
the mean of male column is 0.481
the mean of internet usage column is 180.001 #### 5.1.2 Mode of Numeric Columns
# We create the mode function that will perform our mode operation for us
# ---
#
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
getmode(df$daily.time.spent.on.site)
## [1] 62.26
getmode(df$age)
## [1] 31
getmode(df$area.income)
## [1] 61833.9
getmode(df$daily.internet.usage)
## [1] 167.22
getmode(df$male)
## [1] 0
getmode(df$timestamp)
## [1] "2016-03-27 00:53:11"
mode of daily time spent on site is 62.26
mode of age is 31
mode of area income is 61833.9
mode of daily internet usage is 167.22
mode of male is 0
mode of time stamp column is “2016-03-27 00:53:11 UTC”
median(df$daily.time.spent.on.site)
## [1] 68.215
median(df$age)
## [1] 35
median(df$area.income)
## [1] 57012.3
median(df$daily.internet.usage)
## [1] 183.13
median(df$male)
## [1] 0
median of daily time spent on site is 68.215
median of age is 35
median of area income is 57012.3
median of daily internet usage is 183.13
median of male is 0
range(df$daily.time.spent.on.site)
## [1] 32.60 91.43
range(df$age)
## [1] 19 61
range(df$area.income)
## [1] 13996.5 79484.8
range(df$daily.internet.usage)
## [1] 104.78 269.96
range(df$male)
## [1] 0 1
sd(df$daily.time.spent.on.site)
## [1] 15.85361
sd(df$age)
## [1] 8.785562
sd(df$area.income)
## [1] 13414.63
sd(df$daily.internet.usage)
## [1] 43.90234
sd(df$male)
## [1] 0.4998889
var(df$daily.time.spent.on.site)
## [1] 251.3371
var(df$age)
## [1] 77.18611
var(df$area.income)
## [1] 179952406
var(df$daily.internet.usage)
## [1] 1927.415
var(df$male)
## [1] 0.2498889
quantile(df$daily.time.spent.on.site)
## 0% 25% 50% 75% 100%
## 32.6000 51.3600 68.2150 78.5475 91.4300
quantile(df$age)
## 0% 25% 50% 75% 100%
## 19 29 35 42 61
quantile(df$area.income)
## 0% 25% 50% 75% 100%
## 13996.50 47031.80 57012.30 65470.64 79484.80
quantile(df$daily.internet.usage)
## 0% 25% 50% 75% 100%
## 104.7800 138.8300 183.1300 218.7925 269.9600
quantile(df$male)
## 0% 25% 50% 75% 100%
## 0 0 0 1 1
skewness(df$daily.time.spent.on.site)
## [1] -0.370646
## attr(,"method")
## [1] "moment"
skewness(df$age)
## [1] 0.4777052
## attr(,"method")
## [1] "moment"
skewness(df$area.income)
## [1] -0.6484229
## attr(,"method")
## [1] "moment"
skewness(df$daily.internet.usage)
## [1] -0.03343681
## attr(,"method")
## [1] "moment"
skewness(df$male)
## [1] 0.07594088
## attr(,"method")
## [1] "moment"
male,time stamp and age column are positively skewed while as time spent on a site ,area income and daily internet usage are negatively skewed.
kurtosis(df$daily.time.spent.on.site)
## [1] -1.099864
## attr(,"method")
## [1] "excess"
kurtosis(df$age)
## [1] -0.4097066
## attr(,"method")
## [1] "excess"
kurtosis(df$area.income)
## [1] -0.1110924
## attr(,"method")
## [1] "excess"
kurtosis(df$daily.internet.usage)
## [1] -1.275752
## attr(,"method")
## [1] "excess"
kurtosis(df$male)
## [1] -1.996226
## attr(,"method")
## [1] "excess"
the data has a platykurtic distribution
# Histogram with density plot
ggplot(df, aes(x=area.income)) +
geom_histogram(colour="black", fill="blue",bins=15)#+
shows that most people receive incomes ranges between 60,000 and 70,000
# Histogram with density plot
ggplot(df, aes(x=daily.time.spent.on.site)) +
geom_histogram(colour="grey", fill="green",bins=10)#+
# Histogram with density plot
ggplot(df, aes(x=daily.internet.usage)) +
geom_histogram(colour="black", fill="purple",bins=10)#+
The Average Hours spent by users on the Internet is 180 minutes
ggplot(df, aes(x=factor(`clicked.on.ad`))) + geom_bar( fill=rgb(0.4,0.1,0.5))
The number of users on the site who clicked on the ad is equal to those that did not
# Creating a histogram for age
hist(df$age,)
Majority of the users are between the age 25 to 35.
ggplot(df, aes(x=area.income, y = daily.time.spent.on.site )) + geom_point(aes(colour= as.factor(`clicked.on.ad`)))+
labs(title="Area income vs daily time spent on site based on clicked ad")
The scatter plot for the area _income against time spent on the site shows that high income earners were least likely to click on the ad despite the fact that they seemed to spend a over an hour a day on the site.
ggplot(data=df, aes(x=age, y=daily.time.spent.on.site))+
geom_point(aes(color=clicked.on.ad))+
labs(title="Age vs daily time spent on site based on clicked ad")
the Age against Time spent on the site show that the younger demographic are less tolerant to ads since are more likely to detect ads and avoid them while using the internet compared to their older counterparts
ggplot(data=df, aes(x=area.income, y=age))+
geom_point(aes(color=clicked.on.ad))+
labs(title="Area income vs age based on clicked ad")
The scatter plot for the area_income against Age showed that ,majority of the users who did not click on the ad were the high income earners and many were aged between 20 and 40 years.
corr = round(cor(select_if(df, is.numeric)), 2)
ggcorrplot(corr, hc.order = T, ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"), lab = T)
The factors that seem to contribute the most to the click add activity are “daily_internet_usage”,“daily_time_spent_on_site”,“age” and “area_income”.
area income showed a moderate negative relationship with click ad activity, where most click activity happened with those that earned above 40,000. However, earners from 66,000 less clicked on the ad.
The people who clicked most on Ads were between age 28 to 43.
Older people , those over 35 were more likely to click on the course ad.
target users who were aged over 35 , as they were more likely to click on the ad.
More focus should be on those earning a lower income i.e less than 60,000 because their indicate to be more beneficial as these consumers clicking on the ad .
Finally the users who spend less time on the site and on the internet are more likely to click on the ads
df$clicked.on.ad <- as.factor(df$clicked.on.ad)
df$clicked.on.ad <- as.numeric(df$clicked.on.ad)
head(df)
df1 <- select(df, c(1,2,3,4,7,10))
#df1 <- select(df1, -c(7,8))
head(df1)
#Create an index for data partitioning
set.seed(7)
library(caret)
index<- createDataPartition(df1$clicked.on.ad,p = 0.8 ,list = FALSE)
Splitting the data
#Using the indexes to split data into test and train set
df.train <- df1[index, ]
df.test <- df1[-index, ]
#Fitting in the decision tree
TreeFit <- rpart(clicked.on.ad ~ ., data = df.train ,method = "class")
#Factor the Clicked.on.Ad vector in the test dataset
df.test$clicked.on.ad <- factor(df.test$clicked.on.ad)
#Using model to predict
TreePredict <- predict(TreeFit, newdata = df.test, type = "class")
confusionMatrix(TreePredict, df.test$clicked.on.ad)
## [,1] [,2]
## [1,] 0 0
## [2,] 0 103
#Fitting model to training dataset
#Also we scale and center our data
knnModel <- train(clicked.on.ad ~ ., data = df.train, method = "knn", preProcess = c("center", "scale"))
#Using the model to predict
knnPredict <- predict(knnModel, newdata = df.test)
#Printing out the confusion matrix and statistics
confusionMatrix(knnPredict, df.test$clicked.on.ad)
## [,1] [,2]
## [1,] 0 0
## [2,] 0 79
We can see both decision tree and knn have been correctly classified and have