require(ggthemes)
library(tidyverse)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(fpp2)   
library(forecast)
library(ggpubr)
library(boot)
library(plotly)

Loading Data

# Read Videos.csv into `df`. Cells equal to "NA", a single space, or the empty
# string are treated as missing; surrounding whitespace is stripped from
# unquoted fields; text columns are kept as character vectors, not factors.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
df <- read.csv(
  file = "Videos.csv",
  sep = ",",
  na.strings = c("NA", " ", ""),
  strip.white = TRUE,
  stringsAsFactors = FALSE,
  header = TRUE
)

# Add one 0/1 indicator column per observed gender, named "Gender.<value>".
# Missing genders are excluded from the values looped over: comparing a column
# against NA yields NA for every row, which would otherwise create a
# meaningless all-NA `Gender.NA` column.
# Rows whose Gender is NA still receive NA in every indicator (the `==`
# comparison propagates it), so lm()/aov() keep dropping them as incomplete.
for (unique_value in unique(df$Gender[!is.na(df$Gender)])) {
  df[[paste("Gender", unique_value, sep = ".")]] <-
    ifelse(df$Gender == unique_value, 1, 0)
}

# Preview the first rows, including the new indicator columns.
head(df)
##   Customer State    City Gender FirstChoice SecondChoice Purchases DollarAmt
## 1        1    AZ Phoenix      F      Comedy       Action        42       251
## 2        2    AZ Phoenix      F       SciFi        Drama        43       276
## 3        3    AZ Phoenix      F      Action       Comedy        22       156
## 4        4    AZ Phoenix      F      Action       Comedy        31       191
## 5        5    AZ Phoenix      F      Action        SciFi        20       115
## 6        6    AZ Phoenix      F       SciFi       Comedy        21       126
##   Gender.F Gender.M
## 1        1        0
## 2        1        0
## 3        1        0
## 4        1        0
## 5        1        0
## 6        1        0

Scatter Plots - checking for linear relationships

## Warning: Ignoring 1 observations
# Baseline model: simple linear regression of Purchases on DollarAmt only,
# i.e. without the female indicator variable (Gender.F).
model1 <- lm(Purchases ~ DollarAmt, data = df)
# Print the call, residual quantiles, coefficient table, R-squared and
# F-statistic for the baseline model.
summary(model1)
## 
## Call:
## lm(formula = Purchases ~ DollarAmt, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5911 -0.9688 -0.0602  0.9349  7.2534 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.5316229  0.0398891   13.33   <2e-16 ***
## DollarAmt   0.1506180  0.0002229  675.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.516 on 9995 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9786, Adjusted R-squared:  0.9786 
## F-statistic: 4.567e+05 on 1 and 9995 DF,  p-value: < 2.2e-16
# Extended model: the same regression with the female indicator (Gender.F)
# added as a second predictor alongside DollarAmt.
model <- lm(Purchases ~ DollarAmt+ Gender.F, data = df)
# Print the fit summary so it can be compared with the baseline model1.
summary(model)
## 
## Call:
## lm(formula = Purchases ~ DollarAmt + Gender.F, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6185 -0.9747 -0.0612  0.9372  7.2190 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.5013003  0.0411715  12.176  < 2e-16 ***
## DollarAmt   0.1505208  0.0002252 668.380  < 2e-16 ***
## Gender.F    0.0906241  0.0306521   2.957  0.00312 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.515 on 9994 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.9786, Adjusted R-squared:  0.9786 
## F-statistic: 2.285e+05 on 2 and 9994 DF,  p-value: < 2.2e-16
# Analysis of variance of Purchases against the dollar amount spent
# (predictor variable #1)
res.aov <- aov(Purchases ~ DollarAmt, data = df)
# Print the ANOVA table for this fit
summary(res.aov)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## DollarAmt      1 1049728 1049728  456708 <2e-16 ***
## Residuals   9995   22973       2                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Diagnostic panel 1 of the fitted model: check homogeneity of variances
plot(res.aov, which = 1)

# Diagnostic panel 2 of the fitted model: check normality of the residuals
plot(res.aov, which = 2)

# Analysis of variance of Purchases against the female indicator
# (predictor variable #2); this overwrites the earlier `res.aov`
res.aov <- aov(Purchases ~ Gender.F, data = df)
# Print the ANOVA table for this fit
summary(res.aov)
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## Gender.F       1   23744   23744   226.2 <2e-16 ***
## Residuals   9995 1048957     105                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Diagnostic panel 1 of the fitted model: check homogeneity of variances
plot(res.aov, which = 1)

# Diagnostic panel 2 of the fitted model: check normality of the residuals
plot(res.aov, which = 2)

Summary and Conclusion:


- In this case, the amount spent by customers was, on its own, a good enough predictor of the number of purchases.
- For one, the scatter plots showed strong linear relationships between the predictors and the response variable.
- The statistical diagnostics (F-statistic, t-statistics, and p-values) confirmed the significance of the slope, and the conditions for a linear regression model were met, with or without the added female dummy variable.

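In fact, the residual analysis showed that the amount spent was a much better predictor than the added second predictor, the female dummy variable: the residual mean square was about 2 for the dollar-amount model versus 105 for the female-only model.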