library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v stringr 1.4.0
## v tidyr 1.0.2 v forcats 0.4.0
## v readr 1.3.1
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.3
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Show the working directory; the relative path "KAG.csv" is resolved against it.
getwd()
## [1] "C:/Users/alexl/Desktop/MGT 6203/HW3"
# Read KAG.csv into `data`, keeping text columns as character (not factor).
data <- read.csv(
  file = "KAG.csv",
  stringsAsFactors = FALSE,
  header = TRUE
)
#print(data)
#Q.1 Which ad (provide ad_id as the answer) among the ads that have the least CPC led to the most impressions?
# Two-level sort: lowest CPC first, then most Impressions within each CPC
# value, so the answer to Q.1 is the ad_id in the first row shown.
# The sorted result is piped straight into head(): arrange() returns a new
# data frame and leaves `data` untouched, so calling head(data) afterwards
# would display the original, unsorted rows.
data %>%
  arrange(CPC, desc(Impressions)) %>%
  head()
#ad_id 708746
#Q.2 What campaign (provide campaign_id as the answer) had spent least efficiently on brand awareness on an average(i.e. most Cost per mille or CPM: use total cost for the campaign / total impressions in thousands)?
# Build data2 = data plus derived per-ad metrics:
#   CTR                  - clicks per impression, as a percentage (4 decimals)
#   CPC                  - spend per click; falls back to Spent when Clicks is 0
#   CostPerConv_Total    - spend per total conversion; same fallback on 0
#   CostPerConv_Approved - spend per approved conversion; same fallback on 0
#   CPM                  - spend per thousand impressions (2 decimals)
data2 <- mutate(
  data,
  CTR = round((Clicks / Impressions) * 100, 4),
  CPC = ifelse(Clicks != 0, round(Spent / Clicks, 4), Spent),
  CostPerConv_Total = ifelse(
    Total_Conversion != 0,
    round(Spent / Total_Conversion, 4),
    Spent
  ),
  CostPerConv_Approved = ifelse(
    Approved_Conversion != 0,
    round(Spent / Approved_Conversion, 4),
    Spent
  ),
  CPM = round((Spent / Impressions) * 1000, 2)
)
head(data2)
# Cost per mille (CPM) by campaign_id, most expensive campaign first.
# CPM is computed from campaign totals -- total spend divided by total
# impressions in thousands -- which is the definition Q.2 asks for. Taking the
# mean of the per-ad CPM values instead weights a tiny ad the same as a huge
# one, and becomes NaN for a whole campaign if any ad has zero impressions.
data2 %>%
  group_by(campaign_id) %>%
  summarise(
    n_ads = n(),
    campaign_CPM = sum(Spent) / (sum(Impressions) / 1000)
  ) %>%
  arrange(desc(campaign_CPM))
#Ad campaign 936 spent the least efficiently, with a campaign_CPM of 0.936
#Q.3 Assume each conversion (‘Total_Conversion’) is worth $5, each approved conversion (‘Approved_Conversion’) is worth $50. ROAS (return on advertising spent) is revenue as a percentage of the advertising spent . Calculate ROAS and round it to two decimals.
#ROAS = (5 * Total_Conversion + 50 * Approved_Conversion) / Spent
# Add ROAS to a new data set (data4).
# round(x, 2) is used because Q.3 asks for two decimals; signif(x, digits = 3)
# keeps three significant digits instead (e.g. 123.456 -> 123).
# Rows in which any column is infinite or NaN are then dropped; ROAS itself is
# Inf or NaN when Spent is 0.
data4 <- data %>%
  mutate(
    ROAS = round(
      (5 * Total_Conversion + 50 * Approved_Conversion) / Spent,
      2
    )
  ) %>%
  filter_all(all_vars(!is.infinite(.))) %>%
  filter_all(all_vars(!is.nan(.)))
#print(data4)
# data5: rows of data4 whose interest is 15, 21 or 101, reduced to the three
# columns used by the plot (interest, gender, ROAS).
data5 <- select(
  filter(data4, interest %in% c('15', '21', '101')),
  interest, gender, ROAS
)
#print(data5)
# Box plot of ROAS on a log10 y-axis: one group per interest value, with a
# separate filled box for each gender.
ggplot(data5, aes(x = as.factor(interest), y = ROAS)) +
  geom_boxplot(aes(fill = as.factor(gender))) +
  scale_y_log10() +
  labs(x = "Interest", y = "ROAS") +
  guides(fill = guide_legend(title = "Gender"))

#Q.4 Summarize the median and mean of ROAS by genders when campaign_id == 1178.
# Mean and median ROAS per gender, restricted to campaign_id 1178.
roas1178 <- data4 %>%
  filter(campaign_id == 1178) %>%
  group_by(gender) %>%
  summarise(
    MEAN = mean(ROAS),
    Median = median(ROAS)
  )
roas1178
if (!require(readr)) install.packages("readr")
library(readr)
if (!require(correlationfunnel)) install.packages("correlationfunnel")
## Loading required package: correlationfunnel
## Warning: package 'correlationfunnel' was built under R version 3.5.3
## == Using correlationfunnel? ====================================================
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(correlationfunnel)
if (!require(DataExplorer)) install.packages("DataExplorer")
## Loading required package: DataExplorer
## Warning: package 'DataExplorer' was built under R version 3.5.3
library(DataExplorer)
if (!require(WVPlots)) install.packages("WVPlots")
## Loading required package: WVPlots
## Warning: package 'WVPlots' was built under R version 3.5.3
library(WVPlots)
if (!require(ggthemes)) install.packages("ggthemes")
## Loading required package: ggthemes
## Warning: package 'ggthemes' was built under R version 3.5.3
library(ggthemes)
if (!require(ROCR)) install.packages("ROCR")
## Loading required package: ROCR
## Warning: package 'ROCR' was built under R version 3.5.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.5.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(ROCR)
if (!require(caret)) install.packages("caret")
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(caret)
if (!require(e1071)) install.packages("e1071")
## Loading required package: e1071
## Warning: package 'e1071' was built under R version 3.5.3
library(e1071)
if (!require(corrplot)) install.packages("corrplot")
## Loading required package: corrplot
## Warning: package 'corrplot' was built under R version 3.5.3
## corrplot 0.84 loaded
library(corrplot)
# Show the working directory; "advertising1.csv" is resolved relative to it.
getwd()
## [1] "C:/Users/alexl/Desktop/MGT 6203/HW3"
# Read the advertising data; the UTF-8-BOM encoding keeps a leading byte-order
# mark out of the first column name.
ad3 <- read.csv(
  file = "advertising1.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)
head(ad3)
#a) We aim to explore the dataset so that we can better choose a model to implement. Plot histograms for at least 2 of the continuous variables in the dataset. Note it is acceptable to plot more than 2. [1 point]
# Histogram of the Daily.Internet.Usage column of ad3.
hist(ad3$Daily.Internet.Usage)

# Histogram of the Area.Income column of ad3.
hist(ad3$Area.Income)

#b) Again on the track of exploring the dataset, plot at least 2 bar charts reflecting the counts of different values for different variables. Note it is acceptable to plot more than 2.
# Count of rows for each distinct Age value.
ggplot(data = ad3, mapping = aes(x = Age)) +
  geom_bar() +
  labs(title = "Bar Plot of Age")

# Count of rows for each distinct Country value.
ggplot(data = ad3, mapping = aes(x = Country)) +
  geom_bar() +
  labs(title = "Bar Plot of Country")

#c) Plot boxplots for Age, Area.Income, Daily.Internet.Usage and Daily.Time.Spent.on.Site separated by the variable Clicked.on.Ad. To clarify, we want to create 4 plots, each of which has 2 boxplots: 1 for people who clicked on the ad, one for those who didn’t. [2 points]
# Four box plots, each splitting one numeric column by Clicked.on.Ad
# (converted to a factor so every distinct value gets its own box).

# Age by click outcome.
ggplot(ad3, aes(x = as.factor(Clicked.on.Ad), y = Age)) +
  geom_boxplot() +
  labs(title = "Age vs Clicked ", x = "Clicked")

# Area.Income by click outcome.
ggplot(ad3, aes(x = as.factor(Clicked.on.Ad), y = Area.Income)) +
  geom_boxplot() +
  labs(title = "Area Income vs Clicked", x = "Clicked ", y = "Area Income")

# Daily.Internet.Usage by click outcome.
ggplot(ad3, aes(x = as.factor(Clicked.on.Ad), y = Daily.Internet.Usage)) +
  geom_boxplot() +
  labs(
    title = "Daily Internet Usage vs Clicked",
    x = "Clicked",
    y = "Daily Internet Usage"
  )

# Daily.Time.Spent.on.Site by click outcome.
ggplot(ad3, aes(x = as.factor(Clicked.on.Ad), y = Daily.Time.Spent.on.Site)) +
  geom_boxplot() +
  labs(title = "Daily Time vs Clicked", x = "Clicked ", y = "Daily Time ")

#d) Based on our preliminary boxplots, would you expect an older person to be more likely to click on the ad than someone younger? [2 points]
#Answer: Yes, based on the boxplot it is more likely that someone older clicks on the ad than someone younger.
#Q.6
#Part (a) [3 points]
#1. Make a scatter plot for Area.Income against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.
# Scatter plot of Area.Income against Age, with points distinguished by
# Clicked.on.Ad. Shape and colour are both mapped to the same factor: mapping
# colour to the raw numeric column would produce a continuous colour gradient
# and a second, separate legend for what is a two-valued variable.
ggplot(data = ad3, mapping = aes(x = Age, y = Area.Income)) +
  geom_point(
    aes(
      shape = as.factor(Clicked.on.Ad),
      color = as.factor(Clicked.on.Ad)
    )
  )

#2. Based on this plot, would you expect a 31-year-old person with an Area income of $62,000 to click on the ad or not?
#No, I would not expect them to have clicked on the ad
#Part (b) [3 points]
#1. Similar to part a), create a scatter plot for Daily.Time.Spent.on.Site against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.
# Scatter plot of Daily.Time.Spent.on.Site against Age, with points
# distinguished by Clicked.on.Ad. Shape and colour are both mapped to the same
# factor: mapping colour to the raw numeric column would produce a continuous
# colour gradient and a second, separate legend for a two-valued variable.
ggplot(data = ad3, mapping = aes(x = Age, y = Daily.Time.Spent.on.Site)) +
  geom_point(
    aes(
      shape = as.factor(Clicked.on.Ad),
      color = as.factor(Clicked.on.Ad)
    )
  )

#2. Based on this plot, would you expect a 50-year-old person who spends 60 minutes daily on the site to click on the ad or not?
#yes, I would expect them to click the ad
#Q.7
#Part (a) [2 points]
#Generate a correlation funnel (using the correlation funnel package) to see which of the variable in the dataset have the most correlation with having clicked the advert.
#NOTE: Here we are creating the correlation funnel in regards to HAVING clicked the advert, rather than not. This will lead to a minor distinction in your code between the 2 cases. However, it will not affect your results and subsequent variable selection.
# Work on a copy (ad4) so ad3 keeps its original column types, and turn Age,
# Male and Clicked.on.Ad into factors before binarizing.
ad4 <- ad3
factor_cols <- c("Age", "Male", "Clicked.on.Ad")
ad4[factor_cols] <- lapply(ad4[factor_cols], as.factor)

# Binarize every column, correlate against the Clicked.on.Ad__1 indicator and
# draw the interactive correlation funnel.
# NOTE(review): Clicked.on.Ad__1 is presumably the column binarize() creates
# for Clicked.on.Ad == 1 -- confirm against the correlationfunnel docs.
ad4 %>%
  mutate_if(is.numeric, as.numeric) %>%
  binarize() %>%
  correlate(Clicked.on.Ad__1) %>%
  plot_correlation_funnel(interactive = TRUE, alpha = 0.7)
## Warning: All elements of `...` must be named.
## Did you want `data = c(type, role, source)`?
#4 most covarying variables = age, daily internet usage, area income and daily time spent on site
#Part (b) [2 points]
#1. Based on the generated correlation funnel, choose the 4 most covarying variables (with having clicked the advert) and run a logistic regression model for Clicked.on.Ad using these 4 variables.
# Fit a binomial (logistic) regression of Clicked.on.Ad on the four selected
# predictors, using ad3. The fitted model is bound to two names:
# dvertising_logistic_regression and logitm.
dvertising_logistic_regression <- glm(
  Clicked.on.Ad ~ Age + Area.Income + Daily.Internet.Usage +
    Daily.Time.Spent.on.Site,
  family = 'binomial',
  data = ad3
)
logitm <- dvertising_logistic_regression
#2. Output the summary of this model.
summary(logitm)
##
## Call:
## glm(formula = Clicked.on.Ad ~ Age + Area.Income + Daily.Internet.Usage +
## Daily.Time.Spent.on.Site, family = "binomial", data = ad3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4578 -0.1341 -0.0333 0.0167 3.1961
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.713e+01 2.714e+00 9.995 < 2e-16 ***
## Age 1.709e-01 2.568e-02 6.655 2.83e-11 ***
## Area.Income -1.354e-04 1.868e-05 -7.247 4.25e-13 ***
## Daily.Internet.Usage -6.391e-02 6.745e-03 -9.475 < 2e-16 ***
## Daily.Time.Spent.on.Site -1.919e-01 2.066e-02 -9.291 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1386.3 on 999 degrees of freedom
## Residual deviance: 182.9 on 995 degrees of freedom
## AIC: 192.9
##
## Number of Fisher Scoring iterations: 8
#Q.8 Now that we have created our logistic regression model using variables of significance, we must test the model.
# Score every row of ad3 with the fitted model (type = "response" gives the
# predicted probability) and classify a row as 1 when that probability
# exceeds 0.8, otherwise 0.
ad3$predictreg <- predict(logitm, ad3, type = "response")
ad3$predictvalue <- ifelse(ad3$predictreg > 0.8, 1, 0)

# caret::confusionMatrix() reads a table as predictions in ROWS and the
# reference (actual) classes in COLUMNS, and by default treats the first level
# as the positive class. The table is therefore built as (Predicted, Actual)
# and the positive class is set to "1" (clicked); otherwise sensitivity,
# specificity, prevalence and the predictive values are reported for the wrong
# axis and the wrong class.
# Fixing the factor levels to c(0, 1) keeps the table 2x2 even if one class is
# never predicted.
xtab <- table(
  Predicted = factor(ad3$predictvalue, levels = c(0, 1)),
  Actual = factor(ad3$Clicked.on.Ad, levels = c(0, 1))
)
confusionMatrix(xtab, positive = "1")
## Confusion Matrix and Statistics
##
##
## 0 1
## 0 497 3
## 1 36 464
##
## Accuracy : 0.961
## 95% CI : (0.9471, 0.9721)
## No Information Rate : 0.533
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.922
## Mcnemar's Test P-Value : 2.99e-07
##
## Sensitivity : 0.9325
## Specificity : 0.9936
## Pos Pred Value : 0.9940
## Neg Pred Value : 0.9280
## Prevalence : 0.5330
## Detection Rate : 0.4970
## Detection Prevalence : 0.5000
## Balanced Accuracy : 0.9630
##
## 'Positive' Class : 0
##
#How many false-negative occurrences do you observe?
#There are 36 false negatives