Customer Analytics Project

library(caret)
library(corrplot)
library(clustMixType)
library(dplyr)
library(GGally)
library(ggplot2)
library(gridExtra)
library(lexicon)
library(mice)
library(pdp)
library(ROCR)
library(rtweet)
library(scales)
library(sentimentr)
library(stringr)
library(tidytext)
library(tm)


## PROBLEM STATEMENT 1

# DATA PREPROCESSING

setwd("C:/Users/ADMIN/Downloads")

custdat <- read.csv("term3 (1) (1).csv")

str(custdat)

custdat <- custdat[-1]

colnames(custdat)[11] <- "Reached_on_Time"
custdat$Reached_on_Time <- 1 - custdat$Reached_on_Time

custdat$Warehouse_block <- as.factor(custdat$Warehouse_block)
custdat$Mode_of_Shipment <- as.factor(custdat$Mode_of_Shipment)
custdat$Customer_rating <- as.factor(custdat$Customer_rating)
custdat$Product_importance <- as.factor(custdat$Product_importance)
custdat$Gender <- as.factor(custdat$Gender)
custdat$Reached_on_Time <- as.factor(custdat$Reached_on_Time)

# CHECKING FOR MISSING DATA

md.pattern(custdat)

# MULTICOLLINEARITY

ggcorr(custdat[sapply(custdat, is.numeric)], label = T, label_round = 2)

ggpairs(custdat[sapply(custdat, is.numeric)],lower = list(continuous=wrap("points", position="jitter",
alpha = 0.2)),mapping=ggplot2::aes(colour = custdat$Reached_on_Time), 
diag = list(continuous=wrap("densityDiag", alpha=0.5)))

# EXPLORATORY DATA ANALYSIS

ggplot(custdat, aes(y =Customer_care_calls, x = "")) + geom_boxplot() + labs(x = NULL)
ggplot(custdat, aes(y =Cost_of_the_Product, x = "")) + geom_boxplot() + labs(x = NULL)
ggplot(custdat, aes(y =Weight_in_gms, x = "")) + geom_boxplot() + labs(x = NULL)

ggplot(custdat, aes(x = Warehouse_block, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Warehouse_block, fill = Reached_on_Time)) + geom_bar(position="fill")
table(as.factor(custdat$Warehouse_block), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Warehouse_block), custdat$Reached_on_Time), 1)

ggplot(custdat, aes(x = Mode_of_Shipment, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Mode_of_Shipment, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Mode_of_Shipment), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Mode_of_Shipment), custdat$Reached_on_Time), 1)

ggplot(custdat, aes(x = Customer_rating, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Customer_rating, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Customer_rating), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Customer_rating), custdat$Reached_on_Time), 1)

ggplot(custdat, aes(x = Product_importance, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Product_importance, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Product_importance), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Product_importance), custdat$Reached_on_Time), 1)

ggplot(custdat, aes(x = Gender, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Gender, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Gender), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Gender), custdat$Reached_on_Time), 1)

ggplot(custdat, aes(x = as.factor(Customer_care_calls), fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = as.factor(Customer_care_calls), fill = Reached_on_Time)) + geom_bar(position="fill") + labs(x = "Customer Care Calls",y = "Proportion")

ggplot(custdat, aes(x = as.factor(Prior_purchases), fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = as.factor(Prior_purchases), fill = Reached_on_Time)) + geom_bar(position="fill") + labs(x = "Prior Purchases",y = "Proportion")

ggplot(custdat, aes(x=Cost_of_the_Product, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)
ggplot(custdat, aes(x=Discount_offered, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)
ggplot(custdat, aes(x=Weight_in_gms, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)

# TRAINING/TESTING SPLIT

set.seed(100)

custindex <- createDataPartition(custdat$Reached_on_Time, p = 0.8, times = 1, list = F)
custtrain <- custdat[custindex,]
custtest <- custdat[-custindex,]

# MODEL TRAINING

fitControl <- trainControl(method = "cv", number = 10, verboseIter = T)

# LOGISTIC REGRESSION

logicv <- train(custtrain[,-11],custtrain[,11], method = "glmStepAIC", direction = "both", trControl = fitControl)

# RANDOM FOREST

rfcv <- train(custtrain[,-11],custtrain[,11], method = "rf", trControl = fitControl, tuneGrid = expand.grid(mtry = c(2,4,6,8)))

# SUPPORT VECTOR MACHINES

svmcv <- train(Reached_on_Time ~., data = custtrain, method = "svmRadial", trControl = fitControl, 
               tuneGrid = expand.grid(sigma = 10^(-3:3), C =10^(-3:3)) ,preProcess = c("center","scale"))

# ADABOOST

adacv <- train(Reached_on_Time ~., data = custtrain, method = "AdaBoost.M1", trControl = fitControl)

# GRADIENT BOOSTING

gradcv <- train(Reached_on_Time ~., data = custtrain, method = "xgbTree", trControl = fitControl,
                tuneGrid = expand.grid(gamma = c(0,0.1,0.5,1,3,5), nrounds = c(50,100,200), max_depth =c(1,2,4,6,8),
                                       eta=c(0.01,0.1,0.5), colsample_bytree = c(0.6,0.8), min_child_weight = 1, subsample = c(0.5,0.75,1)))

# MODEL EVALUATION

results <- resamples(list(Logistic = logicv, Random_Forest = rfcv, Adaboost = adacv, SVM = svmcv, Gradient_Boosting = gradcv))

summary(results)
bwplot(results)
dotplot(results)

confusionMatrix(predict(logicv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(rfcv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(svmcv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(adacv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(gradcv, custtest),custtest$Reached_on_Time)

# ROC/AUROC CALCULATION

logmod <- predict(logicv, custtest, type = "prob")[,2]
rocpredlog <- prediction(logmod, custtest$Reached_on_Time)
rocperflog <- performance(rocpredlog, "tpr", "fpr")
auclog <- performance(rocpredlog, "auc")

rfmod <- predict(rfcv, custtest, type = "prob")[,2]
rocpredrf <- prediction(rfmod, custtest$Reached_on_Time)
rocperfrf <- performance(rocpredrf, "tpr", "fpr")
aucrf <- performance(rocpredrf, "auc")

adamod <- predict(adacv, custtest, type = "prob")[,2]
rocpredada <- prediction(adamod, custtest$Reached_on_Time)
rocperfada <- performance(rocpredada, "tpr", "fpr")
aucada <- performance(rocpredada, "auc")

xgbmod <- predict(gradcv, custtest, type = "prob")[,2]
rocpredxgb <- prediction(xgbmod, custtest$Reached_on_Time)
rocperfxgb <- performance(rocpredxgb, "tpr", "fpr")
aucxgb <- performance(rocpredxgb, "auc")

plot(rocperflog, col = "green", lwd = 2.5)  
plot(rocperfrf, add=T, col = "blue", lwd = 2.5)
plot(rocperfada, add=T, col = "red", lwd = 2.5)
plot(rocperfxgb, add=T, col = "orange", lwd = 2.5)
abline(0,1, col = "Red", lwd = 2.5, lty = 2)
title('ROC Curve')
legend(0.48,0.4,c(paste0("Logistic Regression AUC - ",round(auclog@y.values[[1]]*100,1)),
paste0("Random Forest AUC - ",round(aucrf@y.values[[1]]*100,1)), paste0("Adaboost AUC - ",
round(aucada@y.values[[1]]*100,1)), paste0("Gradient Boosting AUC - ",round(aucxgb@y.values[[1]]*100,
1))),cex = 0.8, lty = c(1,1), lwd = c(1,1), col = c("green", "blue", "red", "orange"), bty = "n" )

# MODEL INTERPRETATION

plot(varImp(gradcv))

partial(gradcv,pred.var = "Discount_offered", which.class = "1", plot = T, rug = T)
partial(gradcv,pred.var = "Weight_in_gms", which.class = "1", plot = T, rug = T)




## PROBLEM STATEMENT 2

# CUSTOMER RATING PROPORTION

custdat %>%
  group_by(Customer_rating) %>%
  count(Customer_rating, Reached.on.Time) %>%
  mutate(pct = prop.table(n)) %>%
  ggplot(aes(x = Customer_rating, y = pct, fill = Reached.on.Time, label = scales::percent(pct))) + 
  geom_col(position = 'fill') + 
  geom_text(vjust = 2,  size = 4, position = "stack") + scale_y_continuous(labels = scales::percent) + labs(y = NULL)

# CUSTOMER RATING COUNT

custdat %>%
  group_by(Customer_rating) %>%
  count(Customer_rating, Reached.on.Time) %>%
  ggplot(aes(x = Customer_rating, y = n, fill = Reached.on.Time, label = n, group = Reached.on.Time)) + 
  geom_bar(position = 'dodge', stat = "identity") + 
  geom_text(vjust = -0.5,  size = 4, hjust = 0.5, position = position_dodge(width = 0.8)) + labs(y = NULL)

# PRIOR PURCHASES PROPORTION

custdat %>%
  group_by(Prior_purchases) %>%
  count(Prior_purchases, Reached.on.Time) %>%
  mutate(pct = prop.table(n)) %>%
  ggplot(aes(x = factor(Prior_purchases), y = pct, fill = Reached.on.Time, label = scales::percent(pct))) + 
  geom_col(position = 'fill') + 
  geom_text(vjust = 2,  size = 4, position = "stack") + scale_y_continuous(labels = scales::percent) + labs(y = NULL, x = "Prior Purchases")

# PRIOR PURCHASES COUNT

custdat %>%
  group_by(Prior_purchases) %>%
  count(Prior_purchases, Reached.on.Time) %>%
  ggplot(aes(x = factor(Prior_purchases), y = n, fill = Reached.on.Time, label = n, group = Reached.on.Time)) + 
  geom_bar(position = 'dodge', stat = "identity") + 
  geom_text(vjust = -0.5,  size = 4, hjust = 0.5, position = position_dodge(width = 0.8)) + labs(y = NULL, x = "Prior Purchases")

# COST OF THE PRODUCT

ggplot(custdat, aes(x = Cost_of_the_Product, fill = Reached.on.Time)) + geom_density(alpha = 0.5) + theme(
  axis.text.y = element_blank(),axis.ticks = element_blank())

ggplot(custdat, aes(x = Cost_of_the_Product, fill = Reached.on.Time)) + geom_histogram(binwidth = 20, colour = "black")

ggplot(custdat, aes(y = Cost_of_the_Product,x="", fill = Reached.on.Time)) + geom_boxplot() + facet_grid(~Reached.on.Time) 




## PROBLEM STATEMENT 3

# DATA SELECTION & SCALING

custtrain2 <- filter(custdat, Reached.on.Time == 0)
custtrain2 <- custtrain2[-11]
custtrain2[sapply(custtrain2, is.numeric)] <- sapply(custtrain2[sapply(custtrain2, is.numeric)]
                                                     ,rescale,to = c(-1,1))

# FINDING OPTIMAL NUMBER OF CLUSTERS

Es <- numeric(10)
for(i in 1:10){
  kpres <- kproto(custtrain2, k = i, nstart = 5)
  Es[i] <- kpres$tot.withinss
}

plot(1:10, Es, type = "b", ylab = "Within cluster sum of squares", xlab = "No. of Clusters",
     main = "Scree Plot")

kclustmix1 <- kproto(custtrain2, verbose = T, k = 3, nstart = 10)
kclustmix2 <- kproto(custtrain2, verbose = T, k = 4, nstart = 10)
kclustmix3 <- kproto(custtrain2, verbose = T, k = 5, nstart = 10)

validation_kproto(method = "silhouette", kclustmix1, custtrain2) #0.266553
validation_kproto(method = "silhouette", kclustmix2, custtrain2) #0.214979
validation_kproto(method = "silhouette", kclustmix3, custtrain2) #0.189281

# CLUSTER PROFILES

kclustmix <- kproto(custtrain2, verbose = T, k = 3, nstart = 10)

summary(kclustmix)
clprofiles(kclustmix, custtrain2)




## PROBLEM STATEMENT 4

# TOKEN CREATION

app_name = 'SentimentAnalysis0208'
consumer_key = "HiddenForPrivacy"
consumer_secret = "HiddenForPrivacy"
access_token = "HiddenForPrivacy"
access_secret = "HiddenForPrivacy"

token <- create_token(app = app_name,
                      consumer_key = consumer_key,
                      consumer_secret = consumer_secret,
                      access_token = access_token,
                      access_secret = access_secret)

# FETCHING TWEETS

amazon <- search_tweets("#amazonindia OR @amazonIN", n = 18000, include_rts = F, retryonratelimit = T)
amazon <- distinct(amazon, user_id, .keep_all = T)

flipkart <- search_tweets("#flipkart OR @flipkart", n = 18000, include_rts = F, retryonratelimit = T)
flipkart <- distinct(flipkart, user_id, .keep_all = T)

snapdeal <- search_tweets("#snapdeal OR @snapdeal", n = 18000, include_rts = F, retryonratelimit = T)
snapdeal <- distinct(snapdeal, user_id, .keep_all = T)

nrow(amazon)
nrow(flipkart)
nrow(snapdeal)

amazondf <- amazon[5]
flipkartdf <- flipkart[5]
snapdealdf <- snapdeal[5]

# DATA PROCESSING - AMAZON

amazondf$textproc <- str_replace_all(amazondf$text, "https\\S+","")
amazondf$textproc <- gsub('[[:punct:] ]+',' ',amazondf$textproc)
amazondf$textproc <- stripWhitespace(amazondf$textproc)
amazondf$textproc <- tolower(amazondf$textproc)
amazondf$textproc <- removeWords(amazondf$textproc, stopwords("english"))
amazondf$textproc <- stemDocument(amazondf$textproc)
amzsplit <- unnest_tokens(amazondf, word, textproc)


amzsent <- amzsplit %>%
  inner_join(get_sentiments("bing")) %>%
  count(word,sentiment, sort = T)

# DATA PROCESSING - FLIPKART

flipkartdf$textproc <- str_replace_all(flipkartdf$text, "https\\S+","")
flipkartdf$textproc <- gsub('[[:punct:] ]+',' ',flipkartdf$textproc)
flipkartdf$textproc <- stripWhitespace(flipkartdf$textproc)
flipkartdf$textproc <- tolower(flipkartdf$textproc)
flipkartdf$textproc <- removeWords(flipkartdf$textproc, stopwords("english"))
flipkartdf$textproc <- stemDocument(flipkartdf$textproc)
flpkrtsplit <- unnest_tokens(flipkartdf, word, textproc)


flpkrtsent <- flpkrtsplit %>%
  inner_join(get_sentiments("bing")) %>%
  count(word,sentiment, sort = T)

# DATA PROCESSING - SNAPDEAL

snapdealdf$textproc <- str_replace_all(snapdealdf$text, "https\\S+","")
snapdealdf$textproc <- gsub('[[:punct:] ]+',' ',snapdealdf$textproc)
snapdealdf$textproc <- stripWhitespace(snapdealdf$textproc)
snapdealdf$textproc <- tolower(snapdealdf$textproc)
snapdealdf$textproc <- removeWords(snapdealdf$textproc, stopwords("english"))
snapdealdf$textproc <- stemDocument(snapdealdf$textproc)
snapdealsplit <- unnest_tokens(snapdealdf, word, textproc)


snapdealsent <- snapdealsplit %>%
  inner_join(get_sentiments("bing")) %>%
  count(word,sentiment, sort = T)

# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - AMAZON

amzsent$sentiment <- ifelse(amzsent$word == "refund", "negative", amzsent$sentiment)

amzsent %>%
  mutate(perc = (n/sum(n)*100)) %>%
  group_by(sentiment) %>%
  summarise(percent = round(sum(perc),1)) %>%
  ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") + 
  labs(x = NULL, y = NULL) + geom_text(aes(label = paste0(percent," %")),size = 7, position = 
                                         position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())

amzsent %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment", y = NULL)

# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - FLIPKART

flpkrtsent$sentiment <- ifelse(flpkrtsent$word == "refund", "negative", flpkrtsent$sentiment)

flpkrtsent %>%
  mutate(perc = (n/sum(n)*100)) %>%
  group_by(sentiment) %>%
  summarise(percent = round(sum(perc),1)) %>%
  ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") + labs(x = NULL, y = NULL) + 
  geom_text(aes(label = paste0(percent," %")),size = 14, position = position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())

flpkrtsent %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - SNAPDEAL

snapdealsent$sentiment <- ifelse(snapdealsent$word == "refund", "negative", snapdealsent$sentiment)

snapdealsent %>%
  mutate(perc = (n/sum(n)*100)) %>%
  group_by(sentiment) %>%
  summarise(percent = round(sum(perc),1)) %>%
  ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") + labs(x = NULL, y = NULL) + 
  geom_text(aes(label = paste0(percent," %")),size = 14, position = position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())

snapdealsent %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10, with_ties = F) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS

sentiment("refund", polarity_dt = lexicon::hash_sentiment_jockers_rinker)

mysentiment <- lexicon::hash_sentiment_jockers_rinker
mysentiment$y[mysentiment$x %in% "refund"] <- -0.8

sentiment("refund", polarity_dt = mysentiment)

# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - AMAZON

amzsentvalshift <- sentiment_by(amazondf$textproc, polarity_dt = mysentiment)

summary(amzsentvalshift$ave_sentiment)

ggplot(amzsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 15, color = "black", boundary = 0) + theme_minimal()

amznegdens <- amzsentvalshift %>%
  filter(ave_sentiment < 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none") 

amzposdens <- amzsentvalshift %>%
  filter(ave_sentiment > 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none") 

grid.arrange(amznegdens, amzposdens, ncol = 1, nrow = 2)

highlight(amzsentvalshift)

# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - FLIPKART

flpkrtsentvalshift <- sentiment_by(flipkartdf$textproc, polarity_dt = mysentiment)

summary(flpkrtsentvalshift$ave_sentiment)

flipkartdf$textproc[2126]
#The aforementioned entry is an outlier that has an extreme negative score of -3.37 due to the user just
#writing the same negative words - "Cheater/Fraud" repeatedly. This entry could be considered for removal.

ggplot(flpkrtsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 15, color = "black", boundary = 0) + theme_minimal()

flpkrtnegdens <- flpkrtsentvalshift %>%
  filter(ave_sentiment < 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none") 

flpkrtposdens <- flpkrtsentvalshift %>%
  filter(ave_sentiment > 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none") 

grid.arrange(flpkrtnegdens, flpkrtposdens, ncol = 1, nrow = 2)

highlight(flpkrtsentvalshift)

# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - SNAPDEAL

snapdealsentvalshift <- sentiment_by(snapdealdf$textproc, polarity_dt = mysentiment)

summary(snapdealsentvalshift$ave_sentiment)

ggplot(snapdealsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 10, color = "black", boundary = 0) + theme_minimal()

snpdealnegdens <- snapdealsentvalshift %>%
  filter(ave_sentiment < 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none") 

snpdealposdens <- snapdealsentvalshift %>%
  filter(ave_sentiment > 0) %>%
  ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none")

grid.arrange(snpdealnegdens, snpdealposdens, ncol = 1, nrow = 2)

highlight(snapdealsentvalshift)
Customer Analytics Project

Simarpreet Singh