#Data Science for Marketing

#I do not own any rights to the data and code in the exercises below, as they were used entirely for practice & learning purposes. Enjoy!

##1. Data, Exploratory Analysis, Performance Analysis

#Import the CSV file into a data frame.
MyExploratoryData <- read.csv("C:/Data Science for Marketing/Exercise_Files/02_02/exploratory-r.csv")

#Get a snapshot of the data. This shows the first rows of the data.
head(MyExploratoryData)

#Visualize one specific column of the data, in this case Cost Per Acquisition (cpa).
hist(MyExploratoryData$cpa)

#Name the rows after their keyword, so that performance can be measured
#against the keywords.
row.names(MyExploratoryData) <- MyExploratoryData$keyword

#After the assignment above, the rows are named by keyword and no longer numbered.
head(MyExploratoryData)

#Transform into a numeric matrix, which heatmap() requires.
#The keyword column is left out: the row names already carry it, and
#data.matrix() would otherwise convert its text into arbitrary integer codes
#that appear in the heatmap as a meaningless extra column.
MyDataMatrix <- data.matrix(MyExploratoryData[names(MyExploratoryData) != "keyword"])

#Create a heatmap. Rows and columns keep their original order (no clustering)
#and values are scaled per column, so each metric is compared across keywords.
#This gives a clear picture of the performance of each keyword.
heatmap(MyDataMatrix, Rowv = NA, Colv = NA, scale = "column")

##2. Regression in R

#Import the CSV file into a data frame.
RegressionData <- read.csv("C:/Data Science for Marketing/Exercise_Files/03_02/regression-r.csv")

#Plot our data (broadcast vs. net sales). This gives us a scatterplot.
plot(RegressionData$BROADCAST, RegressionData$NET.SALES)

#Fit a line by using the 'lm' function.
#The columns are named in the formula and the data frame is passed via
#'data =', so the coefficients get plain column names and the model can be
#reused with predict(Mylm, newdata = ...).
Mylm <- lm(NET.SALES ~ BROADCAST, data = RegressionData)

#Visualize the regression line, i.e. the line of best fit.
#Note that this will not run on its own, at least if using R notebooks:
#it draws onto the scatterplot created above.
#abline() takes the intercept and slope straight from the model, so it still
#works when lm() has dropped rows with missing values.
abline(Mylm)

#Show our coefficients.
#The positive slope indicates a positive relationship between broadcasting and sales.
coef(Mylm)

## (Intercept)   BROADCAST 
##   133108.78    12141.94

##3. Prediction with R.

#Connect to our data.
#Text columns are read as factors explicitly: since R 4.0.0 read.csv() keeps
#them as character by default, while the classification tree fitted further
#down needs a factor response (sales.classification).
PredictionData <- read.csv(
  "C:/Data Science for Marketing/Exercise_Files/04_02/prediction-r.csv",
  stringsAsFactors = TRUE
)

#Count the observations in each sales classification so we can see them.
table(PredictionData$sales.classification)

## 
##   A   B   C 
##  85 138  77

#List the column names of the prediction data for easy reference.
colnames(PredictionData)

##  [1] "sales"                        "sales.classification"        
##  [3] "capita"                       "drive.by.traffic"            
##  [5] "complimentary.establishments" "competition"                 
##  [7] "weather"                      "unemployment.rate"           
##  [9] "var1"                         "var2"                        
## [11] "var3"

#Load the tree package (it must already be installed).
library(tree)

#Configure the algorithm that creates the tree:
#response (sales.classification) ~ predictors, taken from PredictionData.
DecisionTree <- tree(
  sales.classification ~ capita + drive.by.traffic +
    complimentary.establishments + competition + weather +
    unemployment.rate + var1 + var2 + var3,
  data = PredictionData
)

#Plot the tree. This draws only the branch structure, without any labels.
plot(DecisionTree)
#Label the tree that was just drawn.
text(DecisionTree)

#The result below best advises what conditions are to be met in order for a new store / business to be fruitful in a certain location. This is based on the question we were trying to solve in this exercise.

#Prune the tree down to its best 3 terminal nodes, which identifies the
#biggest predictors.
PrunedTree <- prune.tree(tree = DecisionTree, best = 3)
#Plot the pruned tree.
plot(PrunedTree)
#Label the pruned tree.
text(PrunedTree)

##4. Cluster Analysis in R

#Load the cluster library (provides clusplot()).
library("cluster")

#Connect to the data.
ClusterData <- read.csv("C:/Data Science for Marketing/Exercise_Files/05_02/cluster-r.csv")

#Review the data.
head(ClusterData)

#Standardize the data so that every variable carries equal weight.
#The first column is removed, as the algorithm only works on numerical data.
ClusterDataStdzd <- scale(ClusterData[-1])

#Run kmeans with 3 clusters on the standardised data.
#kmeans() starts from randomly chosen centres, so the seed is fixed to make
#the grouping reproducible, and several random starts are tried (nstart) so
#that the best of them is kept rather than a single, possibly poor, solution.
#Note: cluster numbering is arbitrary, so sizes may print in a different order
#than in a previously recorded run.
set.seed(123)
Groups <- kmeans(ClusterDataStdzd, centers = 3, nstart = 25)

#Visualize the clusters.
clusplot(ClusterDataStdzd, Groups$cluster)

#Summarise the data: number of observations in each cluster.
Groups$size

## [1]  95 115  90

##5. Conjoint Analysis in R

#Connect to the three conjoint inputs: the respondents' ratings, the matrix of
#profiles, and the names of the attribute levels.
ConjointData <- read.csv(
  "C:/Data Science for Marketing/Exercise_Files/06_02/conjoint-r.csv"
)
ConjointRprofiles <- read.csv(
  "C:/Data Science for Marketing/Exercise_Files/06_02/conjoint-r-profiles-matrix.csv"
)
ConjointDataLevels <- read.csv(
  "C:/Data Science for Marketing/Exercise_Files/06_02/conjoint-r-level-names.csv"
)

#Load the conjoint package. If it is not installed yet, run the line below once.
#install.packages("conjoint")
library(conjoint)

#Model some of our data:
#we only want to look at the first row (the first respondent).
caUtilities(
  y = ConjointData[1, ],
  x = ConjointRprofiles,
  z = ConjointDataLevels
)

## 
## Call:
## lm(formula = frml)
## 
## Residuals:
## ALL 6 residuals are 0: no residual degrees of freedom!
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)
## (Intercept)                2,667e+00         NA      NA       NA
## factor(x$photos)1          6,667e-01         NA      NA       NA
## factor(x$photos)2         -3,333e-01         NA      NA       NA
## factor(x$content.source)1  9,065e-17         NA      NA       NA
## factor(x$content.type)1    1,667e+00         NA      NA       NA
## factor(x$content.type)2   -3,333e-01         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 5 and 0 DF,  p-value: NA

## [1]  2.666667e+00  6.666667e-01 -3.333333e-01 -3.333333e-01  9.064933e-17
## [6] -9.064933e-17  1.666667e+00 -3.333333e-01 -1.333333e+00

#From the utilities reported for this respondent, we see that this individual
#is more interested in the editing function than in the filter or collage features.

#Show the first respondent's ratings of each profile.
ConjointData[1, ]

#Here, we see that profile 4 has been considered to have a higher value than the rest.

#Model the whole dataset (all respondents).
caUtilities(
  y = ConjointData,
  x = ConjointRprofiles,
  z = ConjointDataLevels
)

## 
## Call:
## lm(formula = frml)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1,992 -1,056 -0,056  0,944  2,082 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                2,52667    0,02702  93,520  < 2e-16 ***
## factor(x$photos)1         -0,43533    0,03417 -12,739  < 2e-16 ***
## factor(x$photos)2          0,42667    0,05403   7,896 4,00e-15 ***
## factor(x$content.source)1 -0,00200    0,03625  -0,055    0,956    
## factor(x$content.type)1    0,45867    0,09042   5,073 4,16e-07 ***
## factor(x$content.type)2   -0,03333    0,03417  -0,975    0,329    
## ---
## Signif. codes:  0 '***' 0,001 '**' 0,01 '*' 0,05 '.' 0,1 ' ' 1
## 
## Residual standard error: 1,146 on 2994 degrees of freedom
## Multiple R-squared:  0,09312,    Adjusted R-squared:  0,0916 
## F-statistic: 61,48 on 5 and 2994 DF,  p-value: < 2,2e-16

## [1]  2.526666667 -0.435333333  0.426666667  0.008666667 -0.002000000
## [6]  0.002000000  0.458666667 -0.033333333 -0.425333333