HW #3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.5.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v stringr 1.4.0
## v tidyr   1.0.2     v forcats 0.4.0
## v readr   1.3.1

## Warning: package 'ggplot2' was built under R version 3.5.3

## Warning: package 'tibble' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'readr' was built under R version 3.5.3

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.3

## Warning: package 'forcats' was built under R version 3.5.3

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

getwd()

## [1] "C:/Users/alexl/Desktop/MGT 6203/HW3"

# Load the ad-campaign data set, keeping text columns as character vectors
# instead of converting them to factors.
data <- read.csv(
  file = "KAG.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# print(data)

#Q.1 Which ad (provide ad_id as the answer) among the ads that have the least CPC led to the most impressions?

# Two-level sort: ascending CPC, then descending Impressions within equal CPC,
# so the first row is the lowest-CPC ad that received the most impressions.
# head() is applied to the sorted result itself: arrange() does not modify
# `data` in place, so calling head(data) afterwards would show the original,
# unsorted row order.
data %>%
  arrange(CPC, desc(Impressions)) %>%
  head()

# Recorded answer: ad_id 708746
# NOTE(review): this answer was written down while head() was being called on
# the unsorted `data`; re-run and confirm it against the first row printed here.

#Q.2 What campaign (provide campaign_id as the answer) had spent least efficiently on brand awareness on an average(i.e. most Cost per mille or CPM: use total cost for the campaign / total impressions in thousands)?

# Build data2 with per-ad efficiency metrics:
#   CTR                  - clicks per impression as a percentage (4 decimals)
#   CPC                  - spend per click (4 decimals); Spent when Clicks is 0
#   CostPerConv_Total    - spend per total conversion (4 decimals);
#                          Spent when Total_Conversion is 0
#   CostPerConv_Approved - spend per approved conversion (4 decimals);
#                          Spent when Approved_Conversion is 0
#   CPM                  - spend per thousand impressions (2 decimals)
data2 <- mutate(
  data,
  CTR = round(100 * (Clicks / Impressions), 4),
  CPC = ifelse(Clicks == 0, Spent, round(Spent / Clicks, 4)),
  CostPerConv_Total = ifelse(
    Total_Conversion == 0,
    Spent,
    round(Spent / Total_Conversion, 4)
  ),
  CostPerConv_Approved = ifelse(
    Approved_Conversion == 0,
    Spent,
    round(Spent / Approved_Conversion, 4)
  ),
  CPM = round(1000 * (Spent / Impressions), 2)
)

data2 %>% head()

# Campaign-level CPM exactly as the question defines it: total campaign spend
# divided by total campaign impressions expressed in thousands. Taking the mean
# of the per-ad CPM values instead weights a tiny ad the same as a huge one and
# is not the campaign's overall cost per mille.
data2 %>%
  group_by(campaign_id) %>%
  summarise(
    n_ads = n(),
    campaign_CPM = sum(Spent) / (sum(Impressions) / 1000)
  ) %>%
  arrange(desc(campaign_CPM))

# Recorded answer: campaign 936 spent the least efficiently (highest campaign_CPM).
# NOTE(review): the previously recorded campaign_CPM of 0.936 presumably came
# from the mean of per-ad CPM; re-run and update the figure from this table.

#Q.3 Assume each conversion ('Total_Conversion') is worth $5 and each approved conversion ('Approved_Conversion') is worth $50. ROAS (return on advertising spend) is revenue as a percentage of the advertising spend. Calculate ROAS and round it to two decimals.
#ROAS = (5 * Total_Conversion + 50 * Approved_Conversion) / Spent

# Add ROAS = (5 * Total_Conversion + 50 * Approved_Conversion) / Spent, rounded
# to two decimal places as the question requires. round(x, 2) is used because
# signif(x, digits = 3) keeps three significant figures, which is a different
# rounding rule (e.g. 123.456 -> 123, 0.012345 -> 0.0123).
# Rows containing Inf or NaN in any column are then removed; for ROAS those
# values arise when Spent is 0.
data4 <- data %>%
  mutate(
    ROAS = round((5 * Total_Conversion + 50 * Approved_Conversion) / Spent, 2)
  ) %>%
  filter_all(all_vars(!is.infinite(.))) %>%
  filter_all(all_vars(!is.nan(.)))

# print(data4)
# Restrict to interests 15, 21 and 101 and keep only the columns the plot uses.
data5 <- data4 %>%
  filter(interest %in% c("15", "21", "101")) %>%
  select(interest, gender, ROAS)
# print(data5)

# Box plot of ROAS on a log10 y-axis for each selected interest, with one box
# per gender inside every interest group.
ggplot(
  data5,
  aes(x = as.factor(interest), y = ROAS, fill = as.factor(gender))
) +
  geom_boxplot() +
  scale_y_log10() +
  labs(x = "Interest", y = "ROAS") +
  guides(fill = guide_legend(title = "Gender"))

#Q.4 Summarize the median and mean of ROAS by genders when campaign_id == 1178.
# One output row per gender, with the mean and median ROAS of campaign 1178.
roas1178 <- summarise(
  group_by(filter(data4, campaign_id == 1178), gender),
  MEAN = mean(ROAS),
  Median = median(ROAS)
)
roas1178

if (!require(readr)) install.packages("readr")
library(readr)

if (!require(correlationfunnel)) install.packages("correlationfunnel")

## Loading required package: correlationfunnel

## Warning: package 'correlationfunnel' was built under R version 3.5.3

## == Using correlationfunnel? ====================================================
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>

library(correlationfunnel)

if (!require(DataExplorer)) install.packages("DataExplorer")

## Loading required package: DataExplorer

## Warning: package 'DataExplorer' was built under R version 3.5.3

library(DataExplorer)

if (!require(WVPlots)) install.packages("WVPlots")

## Loading required package: WVPlots

## Warning: package 'WVPlots' was built under R version 3.5.3

library(WVPlots)

if (!require(ggthemes)) install.packages("ggthemes")

## Loading required package: ggthemes

## Warning: package 'ggthemes' was built under R version 3.5.3

library(ggthemes)

if (!require(ROCR)) install.packages("ROCR")

## Loading required package: ROCR

## Warning: package 'ROCR' was built under R version 3.5.3

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.5.3

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(ROCR)

if (!require(caret)) install.packages("caret")

## Loading required package: caret

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(caret)

if (!require(e1071)) install.packages("e1071")

## Loading required package: e1071

## Warning: package 'e1071' was built under R version 3.5.3

library(e1071)

if (!require(corrplot)) install.packages("corrplot")

## Loading required package: corrplot

## Warning: package 'corrplot' was built under R version 3.5.3

## corrplot 0.84 loaded

library(corrplot)

getwd()

## [1] "C:/Users/alexl/Desktop/MGT 6203/HW3"

# Read the advertising data set; the file is declared as UTF-8 with a
# byte-order mark via fileEncoding. Show the first rows as a sanity check.
ad3 <- read.csv(
  file = "advertising1.csv",
  header = TRUE,
  fileEncoding = "UTF-8-BOM"
)
head(ad3)

#a) We aim to explore the dataset so that we can better choose a model to implement. Plot histograms for at least 2 of the continuous variables in the dataset. Note it is acceptable to plot more than 2. [1 point]
# Histogram of the Daily.Internet.Usage column, using hist() defaults for
# binning, title and axis labels.
hist(ad3$Daily.Internet.Usage)

# Histogram of the Area.Income column, again with hist() defaults.
hist(ad3$Area.Income)

#b) Again on the track of exploring the dataset, plot at least 2 bar charts reflecting the counts of different values for different variables. Note it is acceptable to plot more than 2. 
# Number of rows for each distinct Age value.
ggplot(data = ad3, mapping = aes(x = Age)) +
  geom_bar() +
  labs(title = "Bar Plot of Age")

# Number of rows for each distinct Country value.
ggplot(data = ad3, mapping = aes(x = Country)) +
  geom_bar() +
  labs(title = "Bar Plot of Country")

#c) Plot boxplots for Age, Area.Income, Daily.Internet.Usage and Daily.Time.Spent.on.Site separated by the variable Clicked.on.Ad. To clarify, we want to create 4 plots, each of which has 2 boxplots: 1 for people who clicked on the ad, one for those who didn't. [2 points]
# Four box plots, each comparing one numeric variable between rows where
# Clicked.on.Ad is 0 and rows where it is 1 (Clicked.on.Ad is turned into a
# factor so every plot gets exactly two boxes).

# Age by click outcome (y-axis keeps its default label).
ggplot(ad3, aes(y = Age, x = as.factor(Clicked.on.Ad))) +
  geom_boxplot() +
  labs(title = "Age vs Clicked ", x = "Clicked")

# Area income by click outcome.
ggplot(ad3, aes(y = Area.Income, x = as.factor(Clicked.on.Ad))) +
  geom_boxplot() +
  labs(title = "Area Income vs Clicked", x = "Clicked ", y = "Area Income")

# Daily internet usage by click outcome.
ggplot(ad3, aes(y = Daily.Internet.Usage, x = as.factor(Clicked.on.Ad))) +
  geom_boxplot() +
  labs(
    title = "Daily Internet Usage vs Clicked",
    x = "Clicked",
    y = "Daily Internet Usage"
  )

# Daily time spent on site by click outcome.
ggplot(ad3, aes(y = Daily.Time.Spent.on.Site, x = as.factor(Clicked.on.Ad))) +
  geom_boxplot() +
  labs(title = "Daily Time  vs Clicked", x = "Clicked ", y = "Daily Time ")

#d) Based on our preliminary boxplots, would you expect an older person to be more likely to click on the ad than someone younger? [2 points]
#Answer: Yes, based on the boxplot it is more likely that someone older clicks on the ad than someone younger.

#Q.6

#Part (a) [3 points]

#1. Make a scatter plot for Area.Income against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.

# Scatter plot of Area.Income against Age. Clicked.on.Ad is converted to a
# factor for BOTH shape and colour: mapping the raw 0/1 column to colour gives
# a continuous gradient scale and a second, separate legend, whereas two
# matching discrete mappings with the same title share one legend.
ggplot(data = ad3, mapping = aes(x = Age, y = Area.Income)) +
  geom_point(
    aes(shape = as.factor(Clicked.on.Ad), color = as.factor(Clicked.on.Ad))
  ) +
  labs(shape = "Clicked on Ad", color = "Clicked on Ad")

#2. Based on this plot, would you expect a 31-year-old person with an Area income of $62,000 to click on the ad or not?
#No, I would not expect them to have clicked on the ad
#Part (b) [3 points]

#1. Similar to part a), create a scatter plot for Daily.Time.Spent.on.Site against Age. Separate the datapoints by different shapes based on if the datapoint has clicked on the ad or not.
# Scatter plot of Daily.Time.Spent.on.Site against Age. Clicked.on.Ad is
# converted to a factor for BOTH shape and colour: mapping the raw 0/1 column
# to colour gives a continuous gradient scale and a second, separate legend,
# whereas two matching discrete mappings with the same title share one legend.
ggplot(data = ad3, mapping = aes(x = Age, y = Daily.Time.Spent.on.Site)) +
  geom_point(
    aes(shape = as.factor(Clicked.on.Ad), color = as.factor(Clicked.on.Ad))
  ) +
  labs(shape = "Clicked on Ad", color = "Clicked on Ad")

#2. Based on this plot, would you expect a 50-year-old person who spends 60 minutes daily on the site to click on the ad or not?
#yes, I would expect them to click the ad


#Q.7

#Part (a) [2 points]

#Generate a correlation funnel (using the correlation funnel package) to see which of the variable in the dataset have the most correlation with having clicked the advert.
#NOTE: Here we are creating the correlation funnel in regards to HAVING clicked the advert, rather than not. This will lead to a minor distinction in your code between the 2 cases. However, it will not affect your results and subsequent variable selection.
# Work on a copy so ad3 keeps its original column types. Age, Male and
# Clicked.on.Ad are converted to factors before binarization.
ad4 <- ad3
ad4[c("Age", "Male", "Clicked.on.Ad")] <- lapply(
  ad4[c("Age", "Male", "Clicked.on.Ad")],
  as.factor
)

# Binarize every column, correlate the binary features with the
# "Clicked.on.Ad == 1" indicator, and draw the interactive funnel.
ad4 %>%
  mutate_if(is.numeric, as.numeric) %>%
  binarize() %>%
  correlate(Clicked.on.Ad__1) %>%
  plot_correlation_funnel(interactive = TRUE, alpha = 0.7)

## Warning: All elements of `...` must be named.
## Did you want `data = c(type, role, source)`?

#4 most covarying variables = age, daily internet usage, area income and daily time spent on site


#Part (b) [2 points]


#1. Based on the generated correlation funnel, choose the 4 most covarying variables (with having clicked the advert) and run a logistic regression model for Clicked.on.Ad using these 4 variables.
# Logistic regression of Clicked.on.Ad on the four selected predictors.
# The fitted model is bound to both names that the original chained
# assignment created.
dvertising_logistic_regression <- glm(
  formula = Clicked.on.Ad ~ Age + Area.Income + Daily.Internet.Usage +
    Daily.Time.Spent.on.Site,
  family = "binomial",
  data = ad3
)
logitm <- dvertising_logistic_regression


#2. Output the summary of this model.
summary(logitm)

## 
## Call:
## glm(formula = Clicked.on.Ad ~ Age + Area.Income + Daily.Internet.Usage + 
##     Daily.Time.Spent.on.Site, family = "binomial", data = ad3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4578  -0.1341  -0.0333   0.0167   3.1961  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)               2.713e+01  2.714e+00   9.995  < 2e-16 ***
## Age                       1.709e-01  2.568e-02   6.655 2.83e-11 ***
## Area.Income              -1.354e-04  1.868e-05  -7.247 4.25e-13 ***
## Daily.Internet.Usage     -6.391e-02  6.745e-03  -9.475  < 2e-16 ***
## Daily.Time.Spent.on.Site -1.919e-01  2.066e-02  -9.291  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1386.3  on 999  degrees of freedom
## Residual deviance:  182.9  on 995  degrees of freedom
## AIC: 192.9
## 
## Number of Fisher Scoring iterations: 8

# Q.8 Now that we have created our logistic regression model using variables of
# significance, we must test the model.
# Score every row with the fitted model, then label a row as "clicked" (1)
# when its predicted probability exceeds the 0.8 threshold.
ad3$predictreg <- predict(logitm, ad3, type = "response")
ad3$predictvalue <- ifelse(ad3$predictreg > 0.8, 1, 0)

# caret::confusionMatrix() interprets a table as predictions in rows and
# reference (actual) values in columns, so the table is built in that
# orientation. Explicit factor levels keep the table 2x2 even if one class is
# never predicted. positive = "1" makes "clicked" the positive class, so
# sensitivity, specificity and false negatives (predicted 0, actual 1) all
# refer to clicks.
xtab <- table(
  Prediction = factor(ad3$predictvalue, levels = c(0, 1)),
  Reference = factor(ad3$Clicked.on.Ad, levels = c(0, 1))
)
confusionMatrix(xtab, positive = "1")
# NOTE: any captured output following this call was produced with the earlier
# orientation (actual values in rows, positive class 0); re-knit to refresh it.

## Confusion Matrix and Statistics
## 
##    
##       0   1
##   0 497   3
##   1  36 464
##                                           
##                Accuracy : 0.961           
##                  95% CI : (0.9471, 0.9721)
##     No Information Rate : 0.533           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.922           
##  Mcnemar's Test P-Value : 2.99e-07        
##                                           
##             Sensitivity : 0.9325          
##             Specificity : 0.9936          
##          Pos Pred Value : 0.9940          
##          Neg Pred Value : 0.9280          
##              Prevalence : 0.5330          
##          Detection Rate : 0.4970          
##    Detection Prevalence : 0.5000          
##       Balanced Accuracy : 0.9630          
##                                           
##        'Positive' Class : 0               
##

#How many false-negative occurrences do you observe?
#There are 36 false negatives

HW #3

Alex MacGregor

13/04/2020