Data: SKP_FashionBig data set contains 1000 observations of 4 variables. The predictor variables are Age, Income and Months_subbed, and the response variable is Upgrade.
Objective:
The company wants to understand the effect of age, income and the length of subscription on upgrading to premium fashion service and make prediction for other customers using this information.
Data Preparation
#Loading necessary libraries
library(tidyverse) # For data manipulation
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(caret) # confusion matrix
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
#PREPARING WORK SPAcE
# Clear the workspace:
rm(list = ls())
# Load data
df <- read.csv("SKP_fashionBIG.csv", header = TRUE)
head(df)
## age income months_subbed upgrade
## 1 22 55890 14 0
## 2 32 86030 57 0
## 3 38 49220 37 1
## 4 14 92710 51 1
## 5 33 94060 37 0
## 6 34 41880 76 1
nrow(df)
## [1] 1000
colnames(df)
## [1] "age" "income" "months_subbed" "upgrade"
#Summary Statistics
summary(df)
## age income months_subbed upgrade
## Min. : 6.0 Min. : 17570 Min. : 1.00 Min. :0.000
## 1st Qu.:25.0 1st Qu.: 67658 1st Qu.:21.00 1st Qu.:0.000
## Median :30.0 Median : 80270 Median :41.00 Median :1.000
## Mean :29.8 Mean : 80290 Mean :40.96 Mean :0.579
## 3rd Qu.:34.0 3rd Qu.: 93468 3rd Qu.:61.00 3rd Qu.:1.000
## Max. :52.0 Max. :143360 Max. :80.00 Max. :1.000
sapply(df, sd)
## age income months_subbed upgrade
## 6.985110e+00 1.962395e+04 2.310230e+01 4.939666e-01
#Data Structure
str(df)
## 'data.frame': 1000 obs. of 4 variables:
## $ age : int 22 32 38 14 33 34 26 26 26 24 ...
## $ income : int 55890 86030 49220 92710 94060 41880 98780 75510 66520 88920 ...
## $ months_subbed: int 14 57 37 51 37 76 71 62 5 51 ...
## $ upgrade : int 0 0 1 1 0 1 0 0 0 1 ...
All variables are integer and gpa is numeric because it has decimal value.We need to convert admit and rank to categorical variable.
#Convert to Categorical Variable
newdf <- df %>%
mutate_at(vars(upgrade), funs(factor))
str(newdf)
## 'data.frame': 1000 obs. of 4 variables:
## $ age : int 22 32 38 14 33 34 26 26 26 24 ...
## $ income : int 55890 86030 49220 92710 94060 41880 98780 75510 66520 88920 ...
## $ months_subbed: int 14 57 37 51 37 76 71 62 5 51 ...
## $ upgrade : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 1 1 2 ...
#Pairs plot
ggpairs(newdf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
As we see on pairs plot, none of the predictors have a strong correlation with the response variable. So, we cannot expect high accuracy rate on our model, we can investigate deeper to understand each ralationship between the predicotr and response variable.
The Effect of Age
In order to understand th effect of age, we can plot it first.
# Scatter plot by group
ggplot(df, aes(x = age, y = upgrade)) +
geom_point()
age_df <- df %>%
mutate(Age_cat = case_when(age <10 ~ "0-10",
age <20 ~ "10-20",
age <30 ~ "20-30",
age <40 ~ "30-40",
age <50 ~ "40-50",
age <60 ~ "50-60" ))
head(age_df)
## age income months_subbed upgrade Age_cat
## 1 22 55890 14 0 20-30
## 2 32 86030 57 0 30-40
## 3 38 49220 37 1 30-40
## 4 14 92710 51 1 10-20
## 5 33 94060 37 0 30-40
## 6 34 41880 76 1 30-40
#Two-way table
xtabs(~upgrade + Age_cat, data=age_df)
## Age_cat
## upgrade 0-10 10-20 20-30 30-40 40-50 50-60
## 0 1 28 227 136 28 1
## 1 1 31 198 279 68 2
new_age<-age_df %>%
group_by(Age_cat) %>%
summarise(Proportion = mean(upgrade))
new_age
## # A tibble: 6 x 2
## Age_cat Proportion
## <chr> <dbl>
## 1 0-10 0.5
## 2 10-20 0.525
## 3 20-30 0.466
## 4 30-40 0.672
## 5 40-50 0.708
## 6 50-60 0.667
# Scatter plot by group
ggplot(new_age, aes(x = Age_cat, y = Proportion)) +
geom_point()
We can notice the age effect on upgrading to the premium service. The grapg clearly indicates that customers who is older than 30 years old are more likely to upgrade their premium service than the customers who is younger than 30 years old.
The Effect of Income
# Scatter plot by group
ggplot(newdf, aes(x = income, y = upgrade)) +
geom_point()
income_df <- df %>%
mutate(Income_cat = case_when(income < 20000 ~ "10-20",
income < 30000 ~ "20-30",
income < 40000 ~ "30-40",
income < 50000 ~ "40-50",
income < 60000 ~ "50-60",
income < 70000 ~ "60-70",
income < 80000 ~ "70-80",
income < 90000 ~ "80-90",
income < 100000 ~ "90-100",
income >= 100000 ~ "100+"))
head(income_df)
## age income months_subbed upgrade Income_cat
## 1 22 55890 14 0 50-60
## 2 32 86030 57 0 80-90
## 3 38 49220 37 1 40-50
## 4 14 92710 51 1 90-100
## 5 33 94060 37 0 90-100
## 6 34 41880 76 1 40-50
new_income<-income_df %>%
group_by(Income_cat) %>%
summarise(Proportion = mean(upgrade))
new_income
## # A tibble: 10 x 2
## Income_cat Proportion
## <chr> <dbl>
## 1 10-20 0
## 2 100+ 0.562
## 3 20-30 0.5
## 4 30-40 0.737
## 5 40-50 0.578
## 6 50-60 0.55
## 7 60-70 0.566
## 8 70-80 0.577
## 9 80-90 0.604
## 10 90-100 0.581
#Two-way table
xtabs(~upgrade + Income_cat, data=income_df)
## Income_cat
## upgrade 10-20 100+ 20-30 30-40 40-50 50-60 60-70 70-80 80-90 90-100
## 0 1 71 2 5 19 36 62 85 78 62
## 1 0 91 2 14 26 44 81 116 119 86
# Scatter plot by group
ggplot(new_income, aes(x = Income_cat, y = Proportion)) +
geom_point()
As we can see from the graph, income do not effect on upgrading to the premium service.
The Effect of Length of Subscription
# Scatter plot by group
ggplot(newdf, aes(x = months_subbed , y = upgrade)) +
geom_point()
sub_df <- df %>%
mutate(Sub_cat = case_when(months_subbed <10 ~ "0-10",
months_subbed <20 ~ "10-20",
months_subbed <30 ~ "20-30",
months_subbed <40 ~ "30-40",
months_subbed <50 ~ "40-50",
months_subbed <60 ~ "50-60",
months_subbed >=60 ~ "60+"))
head(sub_df)
## age income months_subbed upgrade Sub_cat
## 1 22 55890 14 0 10-20
## 2 32 86030 57 0 50-60
## 3 38 49220 37 1 30-40
## 4 14 92710 51 1 50-60
## 5 33 94060 37 0 30-40
## 6 34 41880 76 1 60+
new_sub<-sub_df %>%
group_by(Sub_cat) %>%
summarise(Proportion = mean(upgrade))
new_sub
## # A tibble: 7 x 2
## Sub_cat Proportion
## <chr> <dbl>
## 1 0-10 0.543
## 2 10-20 0.570
## 3 20-30 0.602
## 4 30-40 0.591
## 5 40-50 0.552
## 6 50-60 0.624
## 7 60+ 0.573
#Two-way table
xtabs(~upgrade + Sub_cat, data=sub_df)
## Sub_cat
## upgrade 0-10 10-20 20-30 30-40 40-50 50-60 60+
## 0 48 55 45 56 56 47 114
## 1 57 73 68 81 69 78 153
# Scatter plot by group
ggplot(new_sub, aes(x = Sub_cat, y = Proportion)) +
geom_point()
As we can see from the graph, there is no linear relationship between subsctiption and upgrading.
Logistic Regression Model
Scaling
newdf <- df %>%
mutate_at(vars("age","income","months_subbed"), funs(scale))
head(newdf)
## age income months_subbed upgrade
## 1 -1.1159451 -1.2433923 -1.1669835 0
## 2 0.3156715 0.2924859 0.6943033 0
## 3 1.1746414 -1.5832831 -0.1714115 1
## 4 -2.2612384 0.6328863 0.4345888 1
## 5 0.4588331 0.7016798 -0.1714115 0
## 6 0.6019948 -1.9573158 1.5167323 1
Data Partition
ind <- sample(2, nrow(newdf), replace=T, prob =c(0.8, 0.2))
train <- newdf[ind==1,]
test <- newdf[ind==2,]
nrow(train)
## [1] 803
nrow(test)
## [1] 197
Fit Model
fit_glm <- glm(upgrade ~ age+income+months_subbed, data=train, family= binomial(link= logit) )
summary(fit_glm)
##
## Call:
## glm(formula = upgrade ~ age + income + months_subbed, family = binomial(link = logit),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7845 -1.2157 0.8602 1.0482 1.6195
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.28780 0.07267 3.960 7.48e-05 ***
## age 0.39146 0.07498 5.221 1.78e-07 ***
## income -0.04718 0.07303 -0.646 0.518
## months_subbed 0.02576 0.07281 0.354 0.723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1097.8 on 802 degrees of freedom
## Residual deviance: 1068.7 on 799 degrees of freedom
## AIC: 1076.7
##
## Number of Fisher Scoring iterations: 4
p1 <- predict(fit_glm, data=train, type= 'response')
#By Caret library
pred1 <- ifelse(p1>0.5,1,0)
pred1 <- as.factor(pred1)
upgrade1<- as.factor(train$upgrade)
result_train <-confusionMatrix(data=pred1,reference=upgrade1)
print(result_train)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 102 77
## 1 244 380
##
## Accuracy : 0.6002
## 95% CI : (0.5654, 0.6343)
## No Information Rate : 0.5691
## P-Value [Acc > NIR] : 0.04008
##
## Kappa : 0.1342
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.2948
## Specificity : 0.8315
## Pos Pred Value : 0.5698
## Neg Pred Value : 0.6090
## Prevalence : 0.4309
## Detection Rate : 0.1270
## Detection Prevalence : 0.2229
## Balanced Accuracy : 0.5632
##
## 'Positive' Class : 0
##
Prediction for Test data set
ptest <- predict(fit_glm, test, type='response')
#Confusion Matrix::
pred_test <- ifelse(ptest>0.5,1,0)
pred_test<- as.factor(pred_test)
admit_test<- as.factor(test$upgrade)
result_test <-confusionMatrix(data=pred_test,reference=admit_test)
print(result_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 18 23
## 1 57 99
##
## Accuracy : 0.5939
## 95% CI : (0.5218, 0.6631)
## No Information Rate : 0.6193
## P-Value [Acc > NIR] : 0.7907321
##
## Kappa : 0.0564
##
## Mcnemar's Test P-Value : 0.0002247
##
## Sensitivity : 0.24000
## Specificity : 0.81148
## Pos Pred Value : 0.43902
## Neg Pred Value : 0.63462
## Prevalence : 0.38071
## Detection Rate : 0.09137
## Detection Prevalence : 0.20812
## Balanced Accuracy : 0.52574
##
## 'Positive' Class : 0
##
Results
We see that our predictor variables have very low correlation with the response variable, therefore we have not gotten very satisfactory result.