Which individuals are most likely to click on the course advertisement?
# Loading the data.
library(data.table)
ad <- fread('advertising.csv')
# Countries and continents data set.
cont <- fread('https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv')
Number of Records
# Number of rows and columns.
cat('Number of rows = ', nrow(ad), 'and the number of columns = ', ncol(ad),'.')
## Number of rows = 1000 and the number of columns = 10 .
Top Dataset Preview
# First 5 records.
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
Bottom Dataset Preview
# Last 5 records.
tail(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 72.97 30 71384.57 208.58
## 2: 51.30 45 67782.17 134.42
## 3: 51.63 51 42415.72 120.37
## 4: 55.55 19 41920.79 187.95
## 5: 45.01 26 29875.80 178.35
## Ad Topic Line City Male
## 1: Fundamental modular algorithm Duffystad 1
## 2: Grass-roots cohesive monitoring New Darlene 1
## 3: Expanded intangible solution South Jessica 1
## 4: Proactive bandwidth-monitored policy West Steven 0
## 5: Virtual 5thgeneration emulation Ronniemouth 0
## Country Timestamp Clicked on Ad
## 1: Lebanon 2016-02-11 21:49:00 1
## 2: Bosnia and Herzegovina 2016-04-22 02:07:01 1
## 3: Mongolia 2016-02-01 17:24:57 1
## 4: Guatemala 2016-03-24 02:35:54 0
## 5: Brazil 2016-06-03 21:43:21 1
At first glance, no anomalies are apparent in the data set.
# Data set structure.
str(ad)
## Classes 'data.table' and 'data.frame': 1000 obs. of 10 variables:
## $ Daily Time Spent on Site: num 69 80.2 69.5 74.2 68.4 ...
## $ Age : int 35 31 26 29 35 23 33 48 30 20 ...
## $ Area Income : num 61834 68442 59786 54806 73890 ...
## $ Daily Internet Usage : num 256 194 236 246 226 ...
## $ Ad Topic Line : chr "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
## $ City : chr "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
## $ Male : int 0 1 0 1 0 1 0 1 1 1 ...
## $ Country : chr "Tunisia" "Nauru" "San Marino" "Italy" ...
## $ Timestamp : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
## $ Clicked on Ad : int 0 0 0 0 0 0 0 1 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Identifying numeric columns
num <- sapply(ad, is.numeric)
# Categorical (non-numeric) columns
cat_cols <- !num
# Excluding the Timestamp column
cat_cols['Timestamp'] <- FALSE
# Names of the character columns to be coerced
char_cols <- names(ad)[cat_cols]
# Coercing the character columns to factors (by reference)
ad[, (char_cols) := lapply(.SD, factor), .SDcols = char_cols]
# Checking changes
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
# Converting the encoded Male and Clicked on Ad columns to factors.
ad[ ,c('Male', 'Clicked on Ad')] <- lapply(ad[, c('Male', 'Clicked on Ad')], factor)
# Checking changes
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
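head() does not display column classes, so a quick class check (a minimal sketch; the full str() output appears further below) confirms the coercion:
# Column classes after the coercion (Timestamp reports both POSIXct and POSIXt)
sapply(ad, class)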
Column Validity
Checking for invalid/unnecessary columns that do not contribute relevant information to the study.
# Column names
colnames(ad)
## [1] "Daily Time Spent on Site" "Age"
## [3] "Area Income" "Daily Internet Usage"
## [5] "Ad Topic Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked on Ad"
All columns are valid.
Checking for invalid values
# Checking for anomalies
# Data set summary
summary(ad)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## Min. :32.60 Min. :19.00 Min. :13996 Min. :104.8
## 1st Qu.:51.36 1st Qu.:29.00 1st Qu.:47032 1st Qu.:138.8
## Median :68.22 Median :35.00 Median :57012 Median :183.1
## Mean :65.00 Mean :36.01 Mean :55000 Mean :180.0
## 3rd Qu.:78.55 3rd Qu.:42.00 3rd Qu.:65471 3rd Qu.:218.8
## Max. :91.43 Max. :61.00 Max. :79485 Max. :270.0
##
## Ad Topic Line City Male
## Adaptive 24hour Graphic Interface : 1 Lisamouth : 3 0:519
## Adaptive asynchronous attitude : 1 Williamsport : 3 1:481
## Adaptive context-sensitive application : 1 Benjaminchester: 2
## Adaptive contextually-based methodology: 1 East John : 2
## Adaptive demand-driven knowledgebase : 1 East Timothy : 2
## Adaptive uniform capability : 1 Johnstad : 2
## (Other) :994 (Other) :986
## Country Timestamp Clicked on Ad
## Czech Republic: 9 Min. :2016-01-01 02:52:10.00 0:500
## France : 9 1st Qu.:2016-02-18 02:55:42.00 1:500
## Afghanistan : 8 Median :2016-04-07 17:27:29.50
## Australia : 8 Mean :2016-04-10 10:34:06.64
## Cyprus : 8 3rd Qu.:2016-05-31 03:18:14.00
## Greece : 8 Max. :2016-07-24 00:22:16.00
## (Other) :950
All numeric columns contain only non-negative values.
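The same check can be made programmatically; a minimal sketch of the numeric column minimums using base R (Filter() keeps only the numeric columns):
# Minimum of every numeric column -- all values should be non-negative
sapply(Filter(is.numeric, ad), min)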
# Checking unique categorical column values.
str(ad)
## Classes 'data.table' and 'data.frame': 1000 obs. of 10 variables:
## $ Daily Time Spent on Site: num 69 80.2 69.5 74.2 68.4 ...
## $ Age : int 35 31 26 29 35 23 33 48 30 20 ...
## $ Area Income : num 61834 68442 59786 54806 73890 ...
## $ Daily Internet Usage : num 256 194 236 246 226 ...
## $ Ad Topic Line : Factor w/ 1000 levels "Adaptive 24hour Graphic Interface",..: 92 465 567 904 767 806 223 724 108 455 ...
## $ City : Factor w/ 969 levels "Adamsbury","Adamside",..: 962 904 112 940 806 283 47 672 885 713 ...
## $ Male : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 2 2 ...
## $ Country : Factor w/ 237 levels "Afghanistan",..: 216 148 185 104 97 159 146 13 83 79 ...
## $ Timestamp : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
## $ Clicked on Ad : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
No anomalies are apparent in the categorical columns.
# Checking for missing values
colSums(is.na(ad))
## Daily Time Spent on Site Age Area Income
## 0 0 0
## Daily Internet Usage Ad Topic Line City
## 0 0 0
## Male Country Timestamp
## 0 0 0
## Clicked on Ad
## 0
There are no missing values present in the data set.
# Checking for duplicates.
sum(duplicated(ad))
## [1] 0
There are no duplicated records.
# Checking uniformity of column names.
colnames(ad)
## [1] "Daily Time Spent on Site" "Age"
## [3] "Area Income" "Daily Internet Usage"
## [5] "Ad Topic Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked on Ad"
Column names use a consistent case and are therefore uniform. However, spaces will be replaced with underscores ('_') for easier data set manipulation.
# Replacing spaces with underscores.
colnames(ad) <- gsub(' ', '_', colnames(ad))
# Checking changes
colnames(ad)
## [1] "Daily_Time_Spent_on_Site" "Age"
## [3] "Area_Income" "Daily_Internet_Usage"
## [5] "Ad_Topic_Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked_on_Ad"
# Numerical columns
num_df <- ad[ , ..num]
# Removing the encoded categorical columns from the numerical columns set.
num_df <- num_df[ , !c('Male', 'Clicked_on_Ad') ]
# Checking for outliers
# Plotting boxplots
# Number of plots
length(num_df)
## [1] 4
# Boxplots
par(mfrow = c(2, 2))
for (i in seq_along(num_df)){
boxplot(num_df[ , ..i], main = paste('Boxplot of', names(num_df)[i]),
ylab = 'Value')
}
The box plots of the numerical columns show that only the 'Area Income' column has outliers. These outliers will be retained for further analysis.
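To back the visual impression with numbers, the boxplot outliers can be counted with the 1.5 × IQR rule; a minimal sketch (the resulting counts are not reproduced here):
# Number of boxplot outliers per numeric column (1.5 * IQR rule)
sapply(num_df, function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x) | x > q[2] + 1.5 * IQR(x))
})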
Adding a continent column for easier generalization.
# Continents
continents <- unlist(unique(cont$Continent))
country <- unlist(unique(ad$Country))
# Countries in each continent
africa <- unlist(unique(cont[cont$Continent == 'Africa'][ ,2]))
asia <- unlist(unique(cont[cont$Continent == 'Asia'][ ,2]))
europe <- unlist(unique(cont[cont$Continent == 'Europe'][ ,2]))
north.america <- unlist(unique(cont[cont$Continent == 'North America'][ ,2]))
oceania <- unlist(unique(cont[cont$Continent == 'Oceania'][ ,2]))
south.america <- unlist(unique(cont[cont$Continent == 'South America'][ ,2]))
# Copy of original dataset
ad2 <- data.frame(ad)
# Introducing a continent column (countries with no match in the lookup remain '')
ad2$Continent <- ''
for (i in country){
if (i %in% africa){
ad2[(ad2$Country == i),]$Continent <- 'Africa'
}
else if(i %in% asia){
ad2[(ad2$Country == i),]$Continent <- 'Asia'
}
else if(i %in% europe){
ad2[(ad2$Country == i),]$Continent <- 'Europe'
}
else if(i %in% north.america){
ad2[(ad2$Country == i),]$Continent <- 'North America'
}
else if(i %in% oceania){
ad2[(ad2$Country == i),]$Continent <- 'Oceania'
}
else if(i %in% south.america){
ad2[(ad2$Country == i),]$Continent <- 'South America'
}
}
head(ad2)
## Daily_Time_Spent_on_Site Age Area_Income Daily_Internet_Usage
## 1 68.95 35 61833.90 256.09
## 2 80.23 31 68441.85 193.77
## 3 69.47 26 59785.94 236.50
## 4 74.15 29 54806.18 245.89
## 5 68.37 35 73889.99 225.58
## 6 59.99 23 59761.56 226.74
## Ad_Topic_Line City Male Country
## 1 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2 Monitored national standardization West Jodi 1 Nauru
## 3 Organic bottom-line service-desk Davidton 0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5 Robust logistical utilization South Manuel 0 Iceland
## 6 Sharable client-driven software Jamieberg 1 Norway
## Timestamp Clicked_on_Ad Continent
## 1 2016-03-27 00:53:11 0 Africa
## 2 2016-04-04 01:39:02 0 Oceania
## 3 2016-03-13 20:35:42 0 Europe
## 4 2016-01-10 02:31:19 0 Europe
## 5 2016-06-03 03:36:18 0 Europe
## 6 2016-05-19 14:30:17 0 Europe
# Checking the continent column for missing values.
sum(is.na(ad2$Continent))
## [1] 0
No continent entries are NA. Note, however, that countries without a match in the lookup table were left as an empty string '' rather than NA, so this check alone does not guarantee that every country was mapped to a continent.
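For reference, the same mapping can be built without a loop by joining the continent lookup table on the country name; a minimal sketch using base merge(), assuming the lookup table's columns are named Continent and Country as used above. Unlike the loop, it leaves unmatched countries as NA, which makes them easy to list:
# Vectorised alternative: left join of the continent lookup table
ad3 <- merge(data.frame(ad), cont, by = 'Country', all.x = TRUE)
# Countries that could not be matched to a continent
unique(ad3$Country[is.na(ad3$Continent)])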
# Data set of categorical columns
# Selecting categorical columns
cat_cols <- unlist(lapply(ad, is.factor))
cat_df <- ad[, ..cat_cols]
names(cat_df)
## [1] "Ad_Topic_Line" "City" "Male" "Country"
## [5] "Clicked_on_Ad"
# Count plot function
bar.plt <- function(data, col1, title, col2 = NULL){
library(ggplot2)
if (is.null(col2)){
ggplot(data, aes(x = {{col1}}, fill = {{col1}})) + geom_bar() +
ggtitle(paste(title, 'Frequency Plot')) +
theme(plot.title = element_text(hjust = 0.5))}
else if (!is.null(col2)){
ggplot(data, aes(x = {{col1}}, y = {{col2}}), fill = {{col1}}) + geom_dar()
}
}
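The {{ }} ('curly-curly') syntax used above is rlang's tidy-evaluation forwarding operator, which aes() supports; it lets a column be passed to the function unquoted. A minimal usage sketch, matching the calls made further below:
# Example call: frequency plot of the Male column (unquoted column name)
# bar.plt(cat_df, Male, title = 'Male Column')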
Ad Topic Line
# Ad Topic Line
# Checking unique values
unique(cat_df[, 1])
## Ad_Topic_Line
## 1: Cloned 5thgeneration orchestration
## 2: Monitored national standardization
## 3: Organic bottom-line service-desk
## 4: Triple-buffered reciprocal time-frame
## 5: Robust logistical utilization
## ---
## 996: Fundamental modular algorithm
## 997: Grass-roots cohesive monitoring
## 998: Expanded intangible solution
## 999: Proactive bandwidth-monitored policy
## 1000: Virtual 5thgeneration emulation
The topic lines are all unique; therefore, they will not be plotted.
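A quick count with data.table's uniqueN() (a minimal sketch) confirms that every record carries a distinct topic line, matching the 1000 factor levels reported by str() above:
# Number of distinct ad topic lines (expected to equal the 1000 records)
uniqueN(cat_df$Ad_Topic_Line)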
City
# Checking unique values.
unique(cat_df[,2])
## City
## 1: Wrightburgh
## 2: West Jodi
## 3: Davidton
## 4: West Terrifurt
## 5: South Manuel
## ---
## 965: Nicholasland
## 966: Duffystad
## 967: New Darlene
## 968: South Jessica
## 969: Ronniemouth
There are 969 unique cities. Therefore, the ads are displayed to users in numerous cities.
library ("plyr")
# Frequency table function
frequencies <- function(col_no, col){
freq <- lapply(cat_df[, ..col_no], count)
# Converting list to data frame
a <- data.frame(freq)
# Only selecting cities with a frequency > 1
a <- a[a[, col] > 1,]
# Ordering cit frequency
high_f <- a[order(a[, col], decreasing = TRUE),]
high_f
}
# City Frequencies
head(frequencies(2, 'City.freq'))
## City.x City.freq
## 427 Lisamouth 3
## 956 Williamsport 3
## 31 Benjaminchester 2
## 158 East John 2
## 183 East Timothy 2
## 307 Johnstad 2
Lisamouth and Williamsport are the most frequent cities in the data set.
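The same frequency table can also be produced natively in data.table, without plyr; a minimal sketch:
# data.table alternative: count cities, keep those appearing more than once
cat_df[, .N, by = City][N > 1][order(-N)]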
Country
# Country frequencies
head(frequencies(4, 'Country.freq'))
## Country.x Country.freq
## 55 Czech Republic 9
## 71 France 9
## 1 Afghanistan 8
## 13 Australia 8
## 54 Cyprus 8
## 81 Greece 8
The Czech Republic and France have the highest frequencies.
# Gender column
bar.plt(cat_df, Male, title = 'Male Column')
Records with Male = 0 are slightly more frequent than records with Male = 1 (519 vs 481, as the summary above shows).
# Plot of classes in the Clicked on Ad column
bar.plt(cat_df, Clicked_on_Ad, title = 'Clicked on Ad Column')
The label/target column has balanced classes.
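The exact counts behind the two plots can be read off with table(); per the summary above, the Male split is 519/481 and the target split is 500/500 (a minimal sketch):
# Class counts for the two binary columns
table(cat_df$Male)
table(cat_df$Clicked_on_Ad)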
# Mode function (note: this masks base R's mode(), which returns a storage type)
mode <- function(col, data) {
unique.value <- unique(data[, col])
unique.value[which.max(tabulate(match(data[, col], unique.value)))]
}
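As a quick sanity check, the helper can be applied to the Age column of the data-frame copy; the result should agree with the Age mode reported further below (a minimal sketch):
# Most frequent age in the data set (expected to match the Age section below)
mode('Age', ad2)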
central.tendency <- function(col, data){
cat('Measures of Central Tendency \n')
# Mean
cat('Mean = ', mean(data[, col]), '\n')
# Median
cat('Median = ', median(data[,col]), '\n')
# Mode
cat('Mode = ', mode(col, data), '\n')
}
library(moments)
dispersion <- function(col, data){
# Range
cat('Range = ', min(data[ ,col]), '-', max(data[ ,col]), '\n')
# IQR
cat('IQR = ', IQR(data[ ,col]), '\n')
# Variance
cat('Variance = ', var(data[ ,col]), '\n')
# Standard Deviation
cat('Standard Deviation = ', sd(data[ ,col]), '\n')
# Skewness
cat('Skewness = ', skewness(data[ ,col]), '\n')
# Kurtosis
cat('Kurtosis = ', kurtosis(data[ ,col]), '\n')
}
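For reference, the skewness reported by the moments package is the population third standardised moment; a minimal hand-rolled sketch that should agree with skewness() up to floating-point error:
# Manual skewness: mean cubed deviation divided by the population sd cubed
manual.skew <- function(x) {
  m <- mean(x)
  mean((x - m)^3) / mean((x - m)^2)^(3/2)
}
manual.skew(ad2$Age)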
Daily Time Spent on Site
# Measures of central tendency
central.tendency(names(num_df)[1], ad2)
## Measures of Central Tendency
## Mean = 65.0002
## Median = 68.215
## Mode = 62.26
The average time spent on the site is 65 minutes. The median is greater than the mean, therefore, the distribution is negatively skewed.
# Measures of dispersion
dispersion(names(num_df)[1], ad2)
## Range = 32.6 - 91.43
## IQR = 27.1875
## Variance = 251.3371
## Standard Deviation = 15.85361
## Skewness = -0.3712026
## Kurtosis = 1.903942
# Daily Time Spent on Site Histogram
hist(num_df$Daily_Time_Spent_on_Site, main = 'Histogram of Daily Time Spent on Site',
xlab = 'Daily Time Spent on Site')
The most common daily time spent on the site is around 80 minutes.
Age
# Measures of Central Tendency
central.tendency('Age', ad2)
## Measures of Central Tendency
## Mean = 36.009
## Median = 35
## Mode = 31
The average age of users is 36. The distribution has a positive skew as median < mean.
# Measures of Dispersion
dispersion('Age', ad2)
## Range = 19 - 61
## IQR = 13
## Variance = 77.18611
## Standard Deviation = 8.785562
## Skewness = 0.4784227
## Kurtosis = 2.595482
# Age Histogram
hist(num_df$Age, main = 'Histogram of Age', xlab = 'Age')
The most frequent ages are within the range of 25 to 35.
Area Income
# Measures of Central Tendency
central.tendency('Area_Income', ad2)
## Measures of Central Tendency
## Mean = 55000
## Median = 57012.3
## Mode = 61833.9
The average area income is 55,000. The distribution has a negative skew as median > mean.
# Measures of Dispersion
dispersion('Area_Income', ad2)
## Range = 13996.5 - 79484.8
## IQR = 18438.83
## Variance = 179952406
## Standard Deviation = 13414.63
## Skewness = -0.6493967
## Kurtosis = 2.894694
# Area Income Histogram
hist(num_df$Area_Income, main = 'Histogram of Area Income', xlab = 'Area Income')
The distribution leans towards higher area incomes, with values around 65,000 occurring most frequently.
Daily Internet Usage
# Measures of Central Tendency
central.tendency('Daily_Internet_Usage', ad2)
## Measures of Central Tendency
## Mean = 180.0001
## Median = 183.13
## Mode = 167.22
The average daily internet usage is 180 minutes. The distribution has a negative skew as mean < median.
# Measures of Dispersion
dispersion('Daily_Internet_Usage', ad2)
## Range = 104.78 - 269.96
## IQR = 79.9625
## Variance = 1927.415
## Standard Deviation = 43.90234
## Skewness = -0.03348703
## Kurtosis = 1.727701
# Daily Internet Usage Histogram
hist(num_df$Daily_Internet_Usage, main = 'Histogram of Daily Internet Usage',
xlab = 'Daily Internet Usage')
The most frequent daily internet usage is around 125 minutes.
Clicked on Ad Vs Continent
library(ggplot2)
Continent <- ad2$Continent
Clicked_on_Ad <- ad2$Clicked_on_Ad
contingency.table <- table(Clicked_on_Ad, Continent)
contingency.table
## Continent
## Clicked_on_Ad Africa Asia Europe North America Oceania South America
## 0 134 90 79 94 45 28 30
## 1 130 92 72 97 51 34 24
Note that the contingency table above contains an extra, unlabelled first column: it is the empty-string continent level, i.e. the countries that the mapping loop above did not match to a continent.
# Mosaic plot of contingency table
mosaicplot(contingency.table, xlab = 'Continent', ylab = 'Clicked_on_Ad',
main = 'Clicked on Ad Vs Continent', color = 'orange')
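To compare click-through rates across continents directly, the counts can be normalised within each continent; a minimal sketch using base R's prop.table():
# Share of non-clicks/clicks within each continent (column-wise proportions)
round(prop.table(contingency.table, margin = 2), 2)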