require(ggthemes)
library(tidyverse)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(fpp2)
library(forecast)
library(ggpubr)
library(boot)
library(plotly)
# Load the customer purchase data. Fields equal to "NA", a single space, or
# the empty string are read as NA; leading/trailing white space is stripped
# from unquoted character fields; strings stay character (not factor).
df <- read.csv(
  file = "Videos.csv",
  sep = ",",
  na.strings = c("NA", " ", ""),
  strip.white = TRUE,
  stringsAsFactors = FALSE,
  header = TRUE
)

# Add one 1/0 indicator column per observed gender value, named
# "Gender.<value>" (e.g. Gender.F, Gender.M).
# The missing value is deliberately excluded from the levels: `x == NA` is NA
# for every row, so looping over it only produced an all-NA "Gender.NA" column.
# Rows whose Gender is NA still get NA in each indicator (the comparison is NA
# there), so model fitting drops them exactly as before.
gender_levels <- unique(df$Gender[!is.na(df$Gender)])
for (unique_value in gender_levels) {
  df[[paste("Gender", unique_value, sep = ".")]] <-
    ifelse(df$Gender == unique_value, 1, 0)
}
head(df)
## Customer State City Gender FirstChoice SecondChoice Purchases DollarAmt
## 1 1 AZ Phoenix F Comedy Action 42 251
## 2 2 AZ Phoenix F SciFi Drama 43 276
## 3 3 AZ Phoenix F Action Comedy 22 156
## 4 4 AZ Phoenix F Action Comedy 31 191
## 5 5 AZ Phoenix F Action SciFi 20 115
## 6 6 AZ Phoenix F SciFi Comedy 21 126
## Gender.F Gender.M
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 1 0
## 6 1 0
## Warning: Ignoring 1 observations
# Baseline linear regression: Purchases explained by DollarAmt alone
# (the Gender.F dummy is left out of this fit)
model1 <- lm(formula = Purchases ~ DollarAmt, data = df)
summary(object = model1)
##
## Call:
## lm(formula = Purchases ~ DollarAmt, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.5911 -0.9688 -0.0602 0.9349 7.2534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5316229 0.0398891 13.33 <2e-16 ***
## DollarAmt 0.1506180 0.0002229 675.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.516 on 9995 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9786, Adjusted R-squared: 0.9786
## F-statistic: 4.567e+05 on 1 and 9995 DF, p-value: < 2.2e-16
# Extended linear regression: same response and DollarAmt predictor, with the
# Gender.F dummy column added as a second predictor
model <- lm(formula = Purchases ~ DollarAmt + Gender.F, data = df)
summary(object = model)
##
## Call:
## lm(formula = Purchases ~ DollarAmt + Gender.F, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6185 -0.9747 -0.0612 0.9372 7.2190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5013003 0.0411715 12.176 < 2e-16 ***
## DollarAmt 0.1505208 0.0002252 668.380 < 2e-16 ***
## Gender.F 0.0906241 0.0306521 2.957 0.00312 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.515 on 9994 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.9786, Adjusted R-squared: 0.9786
## F-statistic: 2.285e+05 on 2 and 9994 DF, p-value: < 2.2e-16
# Analysis of variance of Purchases on dollar amount spent
# (predictor variable #1)
res.aov <- aov(formula = Purchases ~ DollarAmt, data = df)
# Print the ANOVA table for this fit
summary(object = res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## DollarAmt 1 1049728 1049728 456708 <2e-16 ***
## Residuals 9995 22973 2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Diagnostic 1: residuals vs fitted values, to check homogeneity of variances
plot(res.aov, which = 1)
# Diagnostic 2: normal Q-Q plot of the residuals, to check normality
plot(res.aov, which = 2)
# Analysis of variance of Purchases on the female dummy, Gender.F
# (predictor variable #2); this overwrites the previous res.aov fit
res.aov <- aov(formula = Purchases ~ Gender.F, data = df)
# Print the ANOVA table for this fit
summary(object = res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## Gender.F 1 23744 23744 226.2 <2e-16 ***
## Residuals 9995 1048957 105
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1 observation deleted due to missingness
# Diagnostic 1: residuals vs fitted values, to check homogeneity of variances
plot(res.aov, which = 1)
# Diagnostic 2: normal Q-Q plot of the residuals, to check normality
plot(res.aov, which = 2)
# Summary and Conclusion:
# - In this case, the amount spent by customers was a good enough predictor of the number of purchases.
# - For one, the scatter plots showed that there are strong linear relationships between the predictors and the response variable.
# - The statistical diagnostics (F-statistic, t-statistics and p-values) revealed the significance of the slope, and the conditions for a linear regression model were all met, with or without the added female dummy variable.
# In fact, the residual analysis showed that the amount spent was a much better predictor than the added second predictor, the female dummy variable.