Which individuals are most likely to click on the course advertisement?
# Loading the data.
library(data.table)
ad <- fread('advertising.csv')
# Countries and continents data set.
cont <- fread('https://raw.githubusercontent.com/dbouquin/IS_608/master/NanosatDB_munging/Countries-Continents.csv')
Number of Records
# Number of rows and columns.
cat('Number of rows = ', nrow(ad), 'and the number of columns = ', ncol(ad),'.')
## Number of rows = 1000 and the number of columns = 10 .
Top Dataset Preview
# First 5 records.
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
Bottom Dataset Preview
# Last 5 records.
tail(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 72.97 30 71384.57 208.58
## 2: 51.30 45 67782.17 134.42
## 3: 51.63 51 42415.72 120.37
## 4: 55.55 19 41920.79 187.95
## 5: 45.01 26 29875.80 178.35
## Ad Topic Line City Male
## 1: Fundamental modular algorithm Duffystad 1
## 2: Grass-roots cohesive monitoring New Darlene 1
## 3: Expanded intangible solution South Jessica 1
## 4: Proactive bandwidth-monitored policy West Steven 0
## 5: Virtual 5thgeneration emulation Ronniemouth 0
## Country Timestamp Clicked on Ad
## 1: Lebanon 2016-02-11 21:49:00 1
## 2: Bosnia and Herzegovina 2016-04-22 02:07:01 1
## 3: Mongolia 2016-02-01 17:24:57 1
## 4: Guatemala 2016-03-24 02:35:54 0
## 5: Brazil 2016-06-03 21:43:21 1
At first glance, no anomalies are apparent in the data set.
# Data set structure.
str(ad)
## Classes 'data.table' and 'data.frame': 1000 obs. of 10 variables:
## $ Daily Time Spent on Site: num 69 80.2 69.5 74.2 68.4 ...
## $ Age : int 35 31 26 29 35 23 33 48 30 20 ...
## $ Area Income : num 61834 68442 59786 54806 73890 ...
## $ Daily Internet Usage : num 256 194 236 246 226 ...
## $ Ad Topic Line : chr "Cloned 5thgeneration orchestration" "Monitored national standardization" "Organic bottom-line service-desk" "Triple-buffered reciprocal time-frame" ...
## $ City : chr "Wrightburgh" "West Jodi" "Davidton" "West Terrifurt" ...
## $ Male : int 0 1 0 1 0 1 0 1 1 1 ...
## $ Country : chr "Tunisia" "Nauru" "San Marino" "Italy" ...
## $ Timestamp : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
## $ Clicked on Ad : int 0 0 0 0 0 0 0 1 0 0 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Identifying numeric columns
num <- sapply(ad, is.numeric)
# Categorical (non-numeric) columns
cat_cols <- !num
# Excluding the Timestamp column
cat_cols['Timestamp'] <- FALSE
# Names of the character columns to be coerced
char_cols <- names(ad)[cat_cols]
# Coercing the character columns to factors (by reference)
ad[, (char_cols) := lapply(.SD, factor), .SDcols = char_cols]
# Checking changes
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
# Converting the encoded Male and Clicked on Ad columns to factors.
ad[ ,c('Male', 'Clicked on Ad')] <- lapply(ad[, c('Male', 'Clicked on Ad')], factor)
# Checking changes
head(ad, 5)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## 1: 68.95 35 61833.90 256.09
## 2: 80.23 31 68441.85 193.77
## 3: 69.47 26 59785.94 236.50
## 4: 74.15 29 54806.18 245.89
## 5: 68.37 35 73889.99 225.58
## Ad Topic Line City Male Country
## 1: Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2: Monitored national standardization West Jodi 1 Nauru
## 3: Organic bottom-line service-desk Davidton 0 San Marino
## 4: Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5: Robust logistical utilization South Manuel 0 Iceland
## Timestamp Clicked on Ad
## 1: 2016-03-27 00:53:11 0
## 2: 2016-04-04 01:39:02 0
## 3: 2016-03-13 20:35:42 0
## 4: 2016-01-10 02:31:19 0
## 5: 2016-06-03 03:36:18 0
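head() does not display column classes, so a quick class check (a minimal sketch; the full str() output appears further below) confirms the coercion:
# Column classes after the coercion (Timestamp reports both POSIXct and POSIXt)
sapply(ad, class)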
Column Validity
Checking for invalid/unnecessary columns that do not contribute relevant information to the study.
# Column names
colnames(ad)
## [1] "Daily Time Spent on Site" "Age"
## [3] "Area Income" "Daily Internet Usage"
## [5] "Ad Topic Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked on Ad"
All columns are valid.
Checking for invalid values
# Checking for anomalies
# Data set summary
summary(ad)
## Daily Time Spent on Site Age Area Income Daily Internet Usage
## Min. :32.60 Min. :19.00 Min. :13996 Min. :104.8
## 1st Qu.:51.36 1st Qu.:29.00 1st Qu.:47032 1st Qu.:138.8
## Median :68.22 Median :35.00 Median :57012 Median :183.1
## Mean :65.00 Mean :36.01 Mean :55000 Mean :180.0
## 3rd Qu.:78.55 3rd Qu.:42.00 3rd Qu.:65471 3rd Qu.:218.8
## Max. :91.43 Max. :61.00 Max. :79485 Max. :270.0
##
## Ad Topic Line City Male
## Adaptive 24hour Graphic Interface : 1 Lisamouth : 3 0:519
## Adaptive asynchronous attitude : 1 Williamsport : 3 1:481
## Adaptive context-sensitive application : 1 Benjaminchester: 2
## Adaptive contextually-based methodology: 1 East John : 2
## Adaptive demand-driven knowledgebase : 1 East Timothy : 2
## Adaptive uniform capability : 1 Johnstad : 2
## (Other) :994 (Other) :986
## Country Timestamp Clicked on Ad
## Czech Republic: 9 Min. :2016-01-01 02:52:10.00 0:500
## France : 9 1st Qu.:2016-02-18 02:55:42.00 1:500
## Afghanistan : 8 Median :2016-04-07 17:27:29.50
## Australia : 8 Mean :2016-04-10 10:34:06.64
## Cyprus : 8 3rd Qu.:2016-05-31 03:18:14.00
## Greece : 8 Max. :2016-07-24 00:22:16.00
## (Other) :950
All numeric columns contain only non-negative values.
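The same check can be made programmatically; a minimal sketch of the numeric column minimums using base R (Filter() keeps only the numeric columns):
# Minimum of every numeric column -- all values should be non-negative
sapply(Filter(is.numeric, ad), min)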
# Checking unique categorical column values.
str(ad)
## Classes 'data.table' and 'data.frame': 1000 obs. of 10 variables:
## $ Daily Time Spent on Site: num 69 80.2 69.5 74.2 68.4 ...
## $ Age : int 35 31 26 29 35 23 33 48 30 20 ...
## $ Area Income : num 61834 68442 59786 54806 73890 ...
## $ Daily Internet Usage : num 256 194 236 246 226 ...
## $ Ad Topic Line : Factor w/ 1000 levels "Adaptive 24hour Graphic Interface",..: 92 465 567 904 767 806 223 724 108 455 ...
## $ City : Factor w/ 969 levels "Adamsbury","Adamside",..: 962 904 112 940 806 283 47 672 885 713 ...
## $ Male : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 2 2 2 ...
## $ Country : Factor w/ 237 levels "Afghanistan",..: 216 148 185 104 97 159 146 13 83 79 ...
## $ Timestamp : POSIXct, format: "2016-03-27 00:53:11" "2016-04-04 01:39:02" ...
## $ Clicked on Ad : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 2 1 1 ...
## - attr(*, ".internal.selfref")=<externalptr>
No anomalies are apparent in the categorical columns.
# Checking for missing values
colSums(is.na(ad))
## Daily Time Spent on Site Age Area Income
## 0 0 0
## Daily Internet Usage Ad Topic Line City
## 0 0 0
## Male Country Timestamp
## 0 0 0
## Clicked on Ad
## 0
There are no missing values present in the data set.
# Checking for duplicates.
sum(duplicated(ad))
## [1] 0
There are no duplicated records.
# Checking uniformity of column names.
colnames(ad)
## [1] "Daily Time Spent on Site" "Age"
## [3] "Area Income" "Daily Internet Usage"
## [5] "Ad Topic Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked on Ad"
Column names use a consistent case and are therefore uniform. However, spaces will be replaced with underscores ('_') for easier data set manipulation.
# Replacing spaces with underscores.
colnames(ad) <- gsub(' ', '_', colnames(ad))
# Checking changes
colnames(ad)
## [1] "Daily_Time_Spent_on_Site" "Age"
## [3] "Area_Income" "Daily_Internet_Usage"
## [5] "Ad_Topic_Line" "City"
## [7] "Male" "Country"
## [9] "Timestamp" "Clicked_on_Ad"
# Numerical columns
num_df <- ad[ , ..num]
# Removing the encoded categorical columns from the numerical columns set.
num_df <- num_df[ , !c('Male', 'Clicked_on_Ad') ]
# Checking for outliers
# Plotting boxplots
# Number of plots
length(num_df)
## [1] 4
# Boxplots
par(mfrow = c(2, 2))
for (i in seq_along(num_df)){
boxplot(num_df[ , ..i], main = paste('Boxplot of', names(num_df)[i]),
ylab = 'Value')
}
The box plots of the numerical columns show that only the 'Area Income' column has outliers. These outliers will be retained for further analysis.
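To back the visual impression with numbers, the boxplot outliers can be counted with the 1.5 × IQR rule; a minimal sketch (the resulting counts are not reproduced here):
# Number of boxplot outliers per numeric column (1.5 * IQR rule)
sapply(num_df, function(x) {
  q <- quantile(x, c(0.25, 0.75))
  sum(x < q[1] - 1.5 * IQR(x) | x > q[2] + 1.5 * IQR(x))
})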
Adding a continent column for easier generalization.
# Continents
continents <- unlist(unique(cont$Continent))
country <- unlist(unique(ad$Country))
# Countries in each continent
africa <- unlist(unique(cont[cont$Continent == 'Africa'][ ,2]))
asia <- unlist(unique(cont[cont$Continent == 'Asia'][ ,2]))
europe <- unlist(unique(cont[cont$Continent == 'Europe'][ ,2]))
north.america <- unlist(unique(cont[cont$Continent == 'North America'][ ,2]))
oceania <- unlist(unique(cont[cont$Continent == 'Oceania'][ ,2]))
south.america <- unlist(unique(cont[cont$Continent == 'South America'][ ,2]))
# Copy of original dataset
ad2 <- data.frame(ad)
# Introducing a continent column (countries with no match in the lookup remain '')
ad2$Continent <- ''
for (i in country){
if (i %in% africa){
ad2[(ad2$Country == i),]$Continent <- 'Africa'
}
else if(i %in% asia){
ad2[(ad2$Country == i),]$Continent <- 'Asia'
}
else if(i %in% europe){
ad2[(ad2$Country == i),]$Continent <- 'Europe'
}
else if(i %in% north.america){
ad2[(ad2$Country == i),]$Continent <- 'North America'
}
else if(i %in% oceania){
ad2[(ad2$Country == i),]$Continent <- 'Oceania'
}
else if(i %in% south.america){
ad2[(ad2$Country == i),]$Continent <- 'South America'
}
}
head(ad2)
## Daily_Time_Spent_on_Site Age Area_Income Daily_Internet_Usage
## 1 68.95 35 61833.90 256.09
## 2 80.23 31 68441.85 193.77
## 3 69.47 26 59785.94 236.50
## 4 74.15 29 54806.18 245.89
## 5 68.37 35 73889.99 225.58
## 6 59.99 23 59761.56 226.74
## Ad_Topic_Line City Male Country
## 1 Cloned 5thgeneration orchestration Wrightburgh 0 Tunisia
## 2 Monitored national standardization West Jodi 1 Nauru
## 3 Organic bottom-line service-desk Davidton 0 San Marino
## 4 Triple-buffered reciprocal time-frame West Terrifurt 1 Italy
## 5 Robust logistical utilization South Manuel 0 Iceland
## 6 Sharable client-driven software Jamieberg 1 Norway
## Timestamp Clicked_on_Ad Continent
## 1 2016-03-27 00:53:11 0 Africa
## 2 2016-04-04 01:39:02 0 Oceania
## 3 2016-03-13 20:35:42 0 Europe
## 4 2016-01-10 02:31:19 0 Europe
## 5 2016-06-03 03:36:18 0 Europe
## 6 2016-05-19 14:30:17 0 Europe
# Checking the continent column for missing values.
sum(is.na(ad2$Continent))
## [1] 0
No continent entries are NA. Note, however, that countries without a match in the lookup table were left as an empty string '' rather than NA, so this check alone does not guarantee that every country was mapped to a continent.
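For reference, the same mapping can be built without a loop by joining the continent lookup table on the country name; a minimal sketch using base merge(), assuming the lookup table's columns are named Continent and Country as used above. Unlike the loop, it leaves unmatched countries as NA, which makes them easy to list:
# Vectorised alternative: left join of the continent lookup table
ad3 <- merge(data.frame(ad), cont, by = 'Country', all.x = TRUE)
# Countries that could not be matched to a continent
unique(ad3$Country[is.na(ad3$Continent)])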
# Data set of categorical columns
# Selecting categorical columns
cat_cols <- unlist(lapply(ad, is.factor))
cat_df <- ad[, ..cat_cols]
names(cat_df)
## [1] "Ad_Topic_Line" "City" "Male" "Country"
## [5] "Clicked_on_Ad"
# Count plot function
bar.plt <- function(data, col1, title, col2 = NULL){
library(ggplot2)
if (is.null(col2)){
ggplot(data, aes(x = {{col1}}, fill = {{col1}})) + geom_bar() +
ggtitle(paste(title, 'Frequency Plot')) +
theme(plot.title = element_text(hjust = 0.5))}
else if (!is.null(col2)){
ggplot(data, aes(x = {{col1}}, y = {{col2}}), fill = {{col1}}) + geom_dar()
}
}
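The {{ }} ('curly-curly') syntax used above is rlang's tidy-evaluation forwarding operator, which aes() supports; it lets a column be passed to the function unquoted. A minimal usage sketch, matching the calls made further below:
# Example call: frequency plot of the Male column (unquoted column name)
# bar.plt(cat_df, Male, title = 'Male Column')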
Ad Topic Line
# Ad Topic Line
# Checking unique values
unique(cat_df[, 1])
## Ad_Topic_Line
## 1: Cloned 5thgeneration orchestration
## 2: Monitored national standardization
## 3: Organic bottom-line service-desk
## 4: Triple-buffered reciprocal time-frame
## 5: Robust logistical utilization
## ---
## 996: Fundamental modular algorithm
## 997: Grass-roots cohesive monitoring
## 998: Expanded intangible solution
## 999: Proactive bandwidth-monitored policy
## 1000: Virtual 5thgeneration emulation
The topic lines are all unique; therefore, they will not be plotted.
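A quick count with data.table's uniqueN() (a minimal sketch) confirms that every record carries a distinct topic line, matching the 1000 factor levels reported by str() above:
# Number of distinct ad topic lines (expected to equal the 1000 records)
uniqueN(cat_df$Ad_Topic_Line)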
City
# Checking unique values.
unique(cat_df[,2])
## City
## 1: Wrightburgh
## 2: West Jodi
## 3: Davidton
## 4: West Terrifurt
## 5: South Manuel
## ---
## 965: Nicholasland
## 966: Duffystad
## 967: New Darlene
## 968: South Jessica
## 969: Ronniemouth
There are 969 unique cities. Therefore, the ads are displayed to users in numerous cities.
library ("plyr")
# Frequency table function
frequencies <- function(col_no, col){
freq <- lapply(cat_df[, ..col_no], count)
# Converting list to data frame
a <- data.frame(freq)
# Only selecting cities with a frequency > 1
a <- a[a[, col] > 1,]
# Ordering cit frequency
high_f <- a[order(a[, col], decreasing = TRUE),]
high_f
}
# City Frequencies
head(frequencies(2, 'City.freq'))
## City.x City.freq
## 427 Lisamouth 3
## 956 Williamsport 3
## 31 Benjaminchester 2
## 158 East John 2
## 183 East Timothy 2
## 307 Johnstad 2
Lisamouth and Williamsport are the most frequent cities in the data set.
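The same frequency table can also be produced natively in data.table, without plyr; a minimal sketch:
# data.table alternative: count cities, keep those appearing more than once
cat_df[, .N, by = City][N > 1][order(-N)]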
Country
# Country frequencies
head(frequencies(4, 'Country.freq'))
## Country.x Country.freq
## 55 Czech Republic 9
## 71 France 9
## 1 Afghanistan 8
## 13 Australia 8
## 54 Cyprus 8
## 81 Greece 8
The Czech Republic and France have the highest frequencies.
# Gender column
bar.plt(cat_df, Male, title = 'Male Column')
Records with Male = 0 are slightly more frequent than records with Male = 1 (519 vs 481, as the summary above shows).
# Plot of classes in the Clicked on Ad column
bar.plt(cat_df, Clicked_on_Ad, title = 'Clicked on Ad Column')
The label/target column has balanced classes.
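The exact counts behind the two plots can be read off with table(); per the summary above, the Male split is 519/481 and the target split is 500/500 (a minimal sketch):
# Class counts for the two binary columns
table(cat_df$Male)
table(cat_df$Clicked_on_Ad)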
# Mode function (note: this masks base R's mode(), which returns a storage type)
mode <- function(col, data) {
unique.value <- unique(data[, col])
unique.value[which.max(tabulate(match(data[, col], unique.value)))]
}
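As a quick sanity check, the helper can be applied to the Age column of the data-frame copy; the result should agree with the Age mode reported further below (a minimal sketch):
# Most frequent age in the data set (expected to match the Age section below)
mode('Age', ad2)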
central.tendency <- function(col, data){
cat('Measures of Central Tendency \n')
# Mean
cat('Mean = ', mean(data[, col]), '\n')
# Median
cat('Median = ', median(data[,col]), '\n')
# Mode
cat('Mode = ', mode(col, data), '\n')
}
library(moments)
dispersion <- function(col, data){
# Range
cat('Range = ', min(data[ ,col]), '-', max(data[ ,col]), '\n')
# IQR
cat('IQR = ', IQR(data[ ,col]), '\n')
# Variance
cat('Variance = ', var(data[ ,col]), '\n')
# Standard Deviation
cat('Standard Deviation = ', sd(data[ ,col]), '\n')
# Skewness
cat('Skewness = ', skewness(data[ ,col]), '\n')
# Kurtosis
cat('Kurtosis = ', kurtosis(data[ ,col]), '\n')
}
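For reference, the skewness reported by the moments package is the population third standardised moment; a minimal hand-rolled sketch that should agree with skewness() up to floating-point error:
# Manual skewness: mean cubed deviation divided by the population sd cubed
manual.skew <- function(x) {
  m <- mean(x)
  mean((x - m)^3) / mean((x - m)^2)^(3/2)
}
manual.skew(ad2$Age)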
Daily Time Spent on Site
# Measures of central tendency
central.tendency(names(num_df)[1], ad2)
## Measures of Central Tendency
## Mean = 65.0002
## Median = 68.215
## Mode = 62.26
The average time spent on the site is 65 minutes. The median is greater than the mean, therefore, the distribution is negatively skewed.
# Measures of dispersion
dispersion(names(num_df)[1], ad2)
## Range = 32.6 - 91.43
## IQR = 27.1875
## Variance = 251.3371
## Standard Deviation = 15.85361
## Skewness = -0.3712026
## Kurtosis = 1.903942
# Daily Time Spent on Site Histogram
hist(num_df$Daily_Time_Spent_on_Site, main = 'Histogram of Daily Time Spent on Site',
xlab = 'Daily Time Spent on Site')
The most common daily time spent on the site is around 80 minutes.
Age
# Measures of Central Tendency
central.tendency('Age', ad2)
## Measures of Central Tendency
## Mean = 36.009
## Median = 35
## Mode = 31
The average age of users is 36. The distribution has a positive skew as median < mean.
# Measures of Dispersion
dispersion('Age', ad2)
## Range = 19 - 61
## IQR = 13
## Variance = 77.18611
## Standard Deviation = 8.785562
## Skewness = 0.4784227
## Kurtosis = 2.595482
# Age Histogram
hist(num_df$Age, main = 'Histogram of Age', xlab = 'Age')
The most frequent ages are within the range of 25 to 35.
Area Income
# Measures of Central Tendency
central.tendency('Area_Income', ad2)
## Measures of Central Tendency
## Mean = 55000
## Median = 57012.3
## Mode = 61833.9
The average area income is 55,000. The distribution has a negative skew as median > mean.
# Measures of Dispersion
dispersion('Area_Income', ad2)
## Range = 13996.5 - 79484.8
## IQR = 18438.83
## Variance = 179952406
## Standard Deviation = 13414.63
## Skewness = -0.6493967
## Kurtosis = 2.894694
# Area Income Histogram
hist(num_df$Area_Income, main = 'Histogram of Area Income', xlab = 'Area Income')
The distribution leans towards higher area incomes, with values around 65,000 occurring most frequently.
Daily Internet Usage
# Measures of Central Tendency
central.tendency('Daily_Internet_Usage', ad2)
## Measures of Central Tendency
## Mean = 180.0001
## Median = 183.13
## Mode = 167.22
The average daily internet usage is 180 minutes. The distribution has a negative skew as mean < median.
# Measures of Dispersion
dispersion('Daily_Internet_Usage', ad2)
## Range = 104.78 - 269.96
## IQR = 79.9625
## Variance = 1927.415
## Standard Deviation = 43.90234
## Skewness = -0.03348703
## Kurtosis = 1.727701
# Daily Internet Usage Histogram
hist(num_df$Daily_Internet_Usage, main = 'Histogram of Daily Internet Usage',
xlab = 'Daily Internet Usage')
The most frequent daily internet usage is around 125 minutes.
Clicked on Ad Vs Continent
library(ggplot2)
Continent <- ad2$Continent
Clicked_on_Ad <- ad2$Clicked_on_Ad
contingency.table <- table(Clicked_on_Ad, Continent)
contingency.table
## Continent
## Clicked_on_Ad Africa Asia Europe North America Oceania South America
## 0 134 90 79 94 45 28 30
## 1 130 92 72 97 51 34 24
Note that the contingency table above contains an extra, unlabelled first column: it is the empty-string continent level, i.e. the countries that the mapping loop above did not match to a continent.
# Mosaic plot of contingency table
mosaicplot(contingency.table, xlab = 'Continent', ylab = 'Clicked_on_Ad',
main = 'Clicked on Ad Vs Continent', color = 'orange')
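To compare click-through rates across continents directly, the counts can be normalised within each continent; a minimal sketch using base R's prop.table():
# Share of non-clicks/clicks within each continent (column-wise proportions)
round(prop.table(contingency.table, margin = 2), 2)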