library(caret)
library(corrplot)
library(clustMixType)
library(dplyr)
library(GGally)
library(ggplot2)
library(gridExtra)
library(lexicon)
library(mice)
library(pdp)
library(ROCR)
library(rtweet)
library(scales)
library(sentimentr)
library(stringr)
library(tidytext)
library(tm)
## PROBLEM STATEMENT 1
# DATA PREPROCESSING
setwd("C:/Users/ADMIN/Downloads")
custdat <- read.csv("term3 (1) (1).csv")
str(custdat)
custdat <- custdat[-1]
colnames(custdat)[11] <- "Reached_on_Time"
custdat$Reached_on_Time <- 1 - custdat$Reached_on_Time
custdat$Warehouse_block <- as.factor(custdat$Warehouse_block)
custdat$Mode_of_Shipment <- as.factor(custdat$Mode_of_Shipment)
custdat$Customer_rating <- as.factor(custdat$Customer_rating)
custdat$Product_importance <- as.factor(custdat$Product_importance)
custdat$Gender <- as.factor(custdat$Gender)
custdat$Reached_on_Time <- as.factor(custdat$Reached_on_Time)
# CHECKING FOR MISSING DATA
md.pattern(custdat)
# MULTICOLLINEARITY
ggcorr(custdat[sapply(custdat, is.numeric)], label = T, label_round = 2)
ggpairs(custdat[sapply(custdat, is.numeric)],lower = list(continuous=wrap("points", position="jitter",
alpha = 0.2)),mapping=ggplot2::aes(colour = custdat$Reached_on_Time),
diag = list(continuous=wrap("densityDiag", alpha=0.5)))
# EXPLORATORY DATA ANALYSIS
ggplot(custdat, aes(y =Customer_care_calls, x = "")) + geom_boxplot() + labs(x = NULL)
ggplot(custdat, aes(y =Cost_of_the_Product, x = "")) + geom_boxplot() + labs(x = NULL)
ggplot(custdat, aes(y =Weight_in_gms, x = "")) + geom_boxplot() + labs(x = NULL)
ggplot(custdat, aes(x = Warehouse_block, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Warehouse_block, fill = Reached_on_Time)) + geom_bar(position="fill")
table(as.factor(custdat$Warehouse_block), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Warehouse_block), custdat$Reached_on_Time), 1)
ggplot(custdat, aes(x = Mode_of_Shipment, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Mode_of_Shipment, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Mode_of_Shipment), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Mode_of_Shipment), custdat$Reached_on_Time), 1)
ggplot(custdat, aes(x = Customer_rating, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Customer_rating, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Customer_rating), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Customer_rating), custdat$Reached_on_Time), 1)
ggplot(custdat, aes(x = Product_importance, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Product_importance, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Product_importance), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Product_importance), custdat$Reached_on_Time), 1)
ggplot(custdat, aes(x = Gender, fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = Gender, fill = Reached_on_Time)) + geom_bar(position="fill") + labs(y = "Proportion")
table(as.factor(custdat$Gender), custdat$Reached_on_Time)
prop.table(table(as.factor(custdat$Gender), custdat$Reached_on_Time), 1)
ggplot(custdat, aes(x = as.factor(Customer_care_calls), fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = as.factor(Customer_care_calls), fill = Reached_on_Time)) + geom_bar(position="fill") + labs(x = "Customer Care Calls",y = "Proportion")
ggplot(custdat, aes(x = as.factor(Prior_purchases), fill = Reached_on_Time)) + geom_bar(position="dodge")
ggplot(custdat, aes(x = as.factor(Prior_purchases), fill = Reached_on_Time)) + geom_bar(position="fill") + labs(x = "Prior Purchases",y = "Proportion")
ggplot(custdat, aes(x=Cost_of_the_Product, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)
ggplot(custdat, aes(x=Discount_offered, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)
ggplot(custdat, aes(x=Weight_in_gms, fill=Reached_on_Time))+geom_density()+ facet_grid(Reached_on_Time ~ .)
# TRAINING/TESTING SPLIT
set.seed(100)
custindex <- createDataPartition(custdat$Reached_on_Time, p = 0.8, times = 1, list = F)
custtrain <- custdat[custindex,]
custtest <- custdat[-custindex,]
# MODEL TRAINING
fitControl <- trainControl(method = "cv", number = 10, verboseIter = T)
# LOGISTIC REGRESSION
logicv <- train(custtrain[,-11],custtrain[,11], method = "glmStepAIC", direction = "both", trControl = fitControl)
# RANDOM FOREST
rfcv <- train(custtrain[,-11],custtrain[,11], method = "rf", trControl = fitControl, tuneGrid = expand.grid(mtry = c(2,4,6,8)))
# SUPPORT VECTOR MACHINES
svmcv <- train(Reached_on_Time ~., data = custtrain, method = "svmRadial", trControl = fitControl,
tuneGrid = expand.grid(sigma = 10^(-3:3), C =10^(-3:3)) ,preProcess = c("center","scale"))
# ADABOOST
adacv <- train(Reached_on_Time ~., data = custtrain, method = "AdaBoost.M1", trControl = fitControl)
# GRADIENT BOOSTING
gradcv <- train(Reached_on_Time ~., data = custtrain, method = "xgbTree", trControl = fitControl,
tuneGrid = expand.grid(gamma = c(0,0.1,0.5,1,3,5), nrounds = c(50,100,200), max_depth =c(1,2,4,6,8),
eta=c(0.01,0.1,0.5), colsample_bytree = c(0.6,0.8), min_child_weight = 1, subsample = c(0.5,0.75,1)))
# MODEL EVALUATION
results <- resamples(list(Logistic = logicv, Random_Forest = rfcv, Adaboost = adacv, SVM = svmcv, Gradient_Boosting = gradcv))
summary(results)
bwplot(results)
dotplot(results)
confusionMatrix(predict(logicv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(rfcv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(svmcv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(adacv, custtest),custtest$Reached_on_Time)
confusionMatrix(predict(gradcv, custtest),custtest$Reached_on_Time)
# ROC/AUROC CALCULATION
logmod <- predict(logicv, custtest, type = "prob")[,2]
rocpredlog <- prediction(logmod, custtest$Reached_on_Time)
rocperflog <- performance(rocpredlog, "tpr", "fpr")
auclog <- performance(rocpredlog, "auc")
rfmod <- predict(rfcv, custtest, type = "prob")[,2]
rocpredrf <- prediction(rfmod, custtest$Reached_on_Time)
rocperfrf <- performance(rocpredrf, "tpr", "fpr")
aucrf <- performance(rocpredrf, "auc")
adamod <- predict(adacv, custtest, type = "prob")[,2]
rocpredada <- prediction(adamod, custtest$Reached_on_Time)
rocperfada <- performance(rocpredada, "tpr", "fpr")
aucada <- performance(rocpredada, "auc")
xgbmod <- predict(gradcv, custtest, type = "prob")[,2]
rocpredxgb <- prediction(xgbmod, custtest$Reached_on_Time)
rocperfxgb <- performance(rocpredxgb, "tpr", "fpr")
aucxgb <- performance(rocpredxgb, "auc")
plot(rocperflog, col = "green", lwd = 2.5)
plot(rocperfrf, add=T, col = "blue", lwd = 2.5)
plot(rocperfada, add=T, col = "red", lwd = 2.5)
plot(rocperfxgb, add=T, col = "orange", lwd = 2.5)
abline(0,1, col = "Red", lwd = 2.5, lty = 2)
title('ROC Curve')
legend(0.48,0.4,c(paste0("Logistic Regression AUC - ",round(auclog@y.values[[1]]*100,1)),
paste0("Random Forest AUC - ",round(aucrf@y.values[[1]]*100,1)), paste0("Adaboost AUC - ",
round(aucada@y.values[[1]]*100,1)), paste0("Gradient Boosting AUC - ",round(aucxgb@y.values[[1]]*100,
1))),cex = 0.8, lty = c(1,1), lwd = c(1,1), col = c("green", "blue", "red", "orange"), bty = "n" )
# MODEL INTERPRETATION
plot(varImp(gradcv))
partial(gradcv,pred.var = "Discount_offered", which.class = "1", plot = T, rug = T)
partial(gradcv,pred.var = "Weight_in_gms", which.class = "1", plot = T, rug = T)
## PROBLEM STATEMENT 2
# CUSTOMER RATING PROPORTION
custdat %>%
group_by(Customer_rating) %>%
count(Customer_rating, Reached.on.Time) %>%
mutate(pct = prop.table(n)) %>%
ggplot(aes(x = Customer_rating, y = pct, fill = Reached.on.Time, label = scales::percent(pct))) +
geom_col(position = 'fill') +
geom_text(vjust = 2, size = 4, position = "stack") + scale_y_continuous(labels = scales::percent) + labs(y = NULL)
# CUSTOMER RATING COUNT
custdat %>%
group_by(Customer_rating) %>%
count(Customer_rating, Reached.on.Time) %>%
ggplot(aes(x = Customer_rating, y = n, fill = Reached.on.Time, label = n, group = Reached.on.Time)) +
geom_bar(position = 'dodge', stat = "identity") +
geom_text(vjust = -0.5, size = 4, hjust = 0.5, position = position_dodge(width = 0.8)) + labs(y = NULL)
# PRIOR PURCHASES PROPORTION
custdat %>%
group_by(Prior_purchases) %>%
count(Prior_purchases, Reached.on.Time) %>%
mutate(pct = prop.table(n)) %>%
ggplot(aes(x = factor(Prior_purchases), y = pct, fill = Reached.on.Time, label = scales::percent(pct))) +
geom_col(position = 'fill') +
geom_text(vjust = 2, size = 4, position = "stack") + scale_y_continuous(labels = scales::percent) + labs(y = NULL, x = "Prior Purchases")
# PRIOR PURCHASES COUNT
custdat %>%
group_by(Prior_purchases) %>%
count(Prior_purchases, Reached.on.Time) %>%
ggplot(aes(x = factor(Prior_purchases), y = n, fill = Reached.on.Time, label = n, group = Reached.on.Time)) +
geom_bar(position = 'dodge', stat = "identity") +
geom_text(vjust = -0.5, size = 4, hjust = 0.5, position = position_dodge(width = 0.8)) + labs(y = NULL, x = "Prior Purchases")
# COST OF THE PRODUCT
ggplot(custdat, aes(x = Cost_of_the_Product, fill = Reached.on.Time)) + geom_density(alpha = 0.5) + theme(
axis.text.y = element_blank(),axis.ticks = element_blank())
ggplot(custdat, aes(x = Cost_of_the_Product, fill = Reached.on.Time)) + geom_histogram(binwidth = 20, colour = "black")
ggplot(custdat, aes(y = Cost_of_the_Product,x="", fill = Reached.on.Time)) + geom_boxplot() + facet_grid(~Reached.on.Time)
## PROBLEM STATEMENT 3
# DATA SELECTION & SCALING
custtrain2 <- filter(custdat, Reached.on.Time == 0)
custtrain2 <- custtrain2[-11]
custtrain2[sapply(custtrain2, is.numeric)] <- sapply(custtrain2[sapply(custtrain2, is.numeric)]
,rescale,to = c(-1,1))
# FINDING OPTIMAL NUMBER OF CLUSTERS
Es <- numeric(10)
for(i in 1:10){
kpres <- kproto(custtrain2, k = i, nstart = 5)
Es[i] <- kpres$tot.withinss
}
plot(1:10, Es, type = "b", ylab = "Within cluster sum of squares", xlab = "No. of Clusters",
main = "Scree Plot")
kclustmix1 <- kproto(custtrain2, verbose = T, k = 3, nstart = 10)
kclustmix2 <- kproto(custtrain2, verbose = T, k = 4, nstart = 10)
kclustmix3 <- kproto(custtrain2, verbose = T, k = 5, nstart = 10)
validation_kproto(method = "silhouette", kclustmix1, custtrain2) #0.266553
validation_kproto(method = "silhouette", kclustmix2, custtrain2) #0.214979
validation_kproto(method = "silhouette", kclustmix3, custtrain2) #0.189281
# CLUSTER PROFILES
kclustmix <- kproto(custtrain2, verbose = T, k = 3, nstart = 10)
summary(kclustmix)
clprofiles(kclustmix, custtrain2)
## PROBLEM STATEMENT 4
# TOKEN CREATION
app_name = 'SentimentAnalysis0208'
consumer_key = "HiddenForPrivacy"
consumer_secret = "HiddenForPrivacy"
access_token = "HiddenForPrivacy"
access_secret = "HiddenForPrivacy"
token <- create_token(app = app_name,
consumer_key = consumer_key,
consumer_secret = consumer_secret,
access_token = access_token,
access_secret = access_secret)
# FETCHING TWEETS
amazon <- search_tweets("#amazonindia OR @amazonIN", n = 18000, include_rts = F, retryonratelimit = T)
amazon <- distinct(amazon, user_id, .keep_all = T)
flipkart <- search_tweets("#flipkart OR @flipkart", n = 18000, include_rts = F, retryonratelimit = T)
flipkart <- distinct(flipkart, user_id, .keep_all = T)
snapdeal <- search_tweets("#snapdeal OR @snapdeal", n = 18000, include_rts = F, retryonratelimit = T)
snapdeal <- distinct(snapdeal, user_id, .keep_all = T)
nrow(amazon)
nrow(flipkart)
nrow(snapdeal)
amazondf <- amazon[5]
flipkartdf <- flipkart[5]
snapdealdf <- snapdeal[5]
# DATA PROCESSING - AMAZON
amazondf$textproc <- str_replace_all(amazondf$text, "https\\S+","")
amazondf$textproc <- gsub('[[:punct:] ]+',' ',amazondf$textproc)
amazondf$textproc <- stripWhitespace(amazondf$textproc)
amazondf$textproc <- tolower(amazondf$textproc)
amazondf$textproc <- removeWords(amazondf$textproc, stopwords("english"))
amazondf$textproc <- stemDocument(amazondf$textproc)
amzsplit <- unnest_tokens(amazondf, word, textproc)
amzsent <- amzsplit %>%
inner_join(get_sentiments("bing")) %>%
count(word,sentiment, sort = T)
# DATA PROCESSING - FLIPKART
flipkartdf$textproc <- str_replace_all(flipkartdf$text, "https\\S+","")
flipkartdf$textproc <- gsub('[[:punct:] ]+',' ',flipkartdf$textproc)
flipkartdf$textproc <- stripWhitespace(flipkartdf$textproc)
flipkartdf$textproc <- tolower(flipkartdf$textproc)
flipkartdf$textproc <- removeWords(flipkartdf$textproc, stopwords("english"))
flipkartdf$textproc <- stemDocument(flipkartdf$textproc)
flpkrtsplit <- unnest_tokens(flipkartdf, word, textproc)
flpkrtsent <- flpkrtsplit %>%
inner_join(get_sentiments("bing")) %>%
count(word,sentiment, sort = T)
# DATA PROCESSING - SNAPDEAL
snapdealdf$textproc <- str_replace_all(snapdealdf$text, "https\\S+","")
snapdealdf$textproc <- gsub('[[:punct:] ]+',' ',snapdealdf$textproc)
snapdealdf$textproc <- stripWhitespace(snapdealdf$textproc)
snapdealdf$textproc <- tolower(snapdealdf$textproc)
snapdealdf$textproc <- removeWords(snapdealdf$textproc, stopwords("english"))
snapdealdf$textproc <- stemDocument(snapdealdf$textproc)
snapdealsplit <- unnest_tokens(snapdealdf, word, textproc)
snapdealsent <- snapdealsplit %>%
inner_join(get_sentiments("bing")) %>%
count(word,sentiment, sort = T)
# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - AMAZON
amzsent$sentiment <- ifelse(amzsent$word == "refund", "negative", amzsent$sentiment)
amzsent %>%
mutate(perc = (n/sum(n)*100)) %>%
group_by(sentiment) %>%
summarise(percent = round(sum(perc),1)) %>%
ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") +
labs(x = NULL, y = NULL) + geom_text(aes(label = paste0(percent," %")),size = 7, position =
position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())
amzsent %>%
group_by(sentiment) %>%
slice_max(n, n = 10, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) + geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment", y = NULL)
# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - FLIPKART
flpkrtsent$sentiment <- ifelse(flpkrtsent$word == "refund", "negative", flpkrtsent$sentiment)
flpkrtsent %>%
mutate(perc = (n/sum(n)*100)) %>%
group_by(sentiment) %>%
summarise(percent = round(sum(perc),1)) %>%
ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") + labs(x = NULL, y = NULL) +
geom_text(aes(label = paste0(percent," %")),size = 14, position = position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())
flpkrtsent %>%
group_by(sentiment) %>%
slice_max(n, n = 10, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
# SENTIMENT ANALYSIS (UNIGRAMS) - VISUALISATION - SNAPDEAL
snapdealsent$sentiment <- ifelse(snapdealsent$word == "refund", "negative", snapdealsent$sentiment)
snapdealsent %>%
mutate(perc = (n/sum(n)*100)) %>%
group_by(sentiment) %>%
summarise(percent = round(sum(perc),1)) %>%
ggplot(aes(y = percent, x = "", fill = sentiment)) + geom_bar(stat = "identity") + labs(x = NULL, y = NULL) +
geom_text(aes(label = paste0(percent," %")),size = 14, position = position_stack(vjust=0.5)) + theme(axis.text.y = element_blank(), axis.ticks = element_blank())
snapdealsent %>%
group_by(sentiment) %>%
slice_max(n, n = 10, with_ties = F) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS
sentiment("refund", polarity_dt = lexicon::hash_sentiment_jockers_rinker)
mysentiment <- lexicon::hash_sentiment_jockers_rinker
mysentiment$y[mysentiment$x %in% "refund"] <- -0.8
sentiment("refund", polarity_dt = mysentiment)
# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - AMAZON
amzsentvalshift <- sentiment_by(amazondf$textproc, polarity_dt = mysentiment)
summary(amzsentvalshift$ave_sentiment)
ggplot(amzsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 15, color = "black", boundary = 0) + theme_minimal()
amznegdens <- amzsentvalshift %>%
filter(ave_sentiment < 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none")
amzposdens <- amzsentvalshift %>%
filter(ave_sentiment > 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none")
grid.arrange(amznegdens, amzposdens, ncol = 1, nrow = 2)
highlight(amzsentvalshift)
# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - FLIPKART
flpkrtsentvalshift <- sentiment_by(flipkartdf$textproc, polarity_dt = mysentiment)
summary(flpkrtsentvalshift$ave_sentiment)
flipkartdf$textproc[2126]
#The aforementioned entry is an outlier that has an extreme negative score of -3.37 due to the user just
#writing the same negative words - "Cheater/Fraud" repeatedly. This entry could be considered for removal.
ggplot(flpkrtsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 15, color = "black", boundary = 0) + theme_minimal()
flpkrtnegdens <- flpkrtsentvalshift %>%
filter(ave_sentiment < 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none")
flpkrtposdens <- flpkrtsentvalshift %>%
filter(ave_sentiment > 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none")
grid.arrange(flpkrtnegdens, flpkrtposdens, ncol = 1, nrow = 2)
highlight(flpkrtsentvalshift)
# SENTIMENT ANALYSIS WITH VALENCE SHIFTERS - SNAPDEAL
snapdealsentvalshift <- sentiment_by(snapdealdf$textproc, polarity_dt = mysentiment)
summary(snapdealsentvalshift$ave_sentiment)
ggplot(snapdealsentvalshift, aes(ave_sentiment)) + geom_histogram(bins = 10, color = "black", boundary = 0) + theme_minimal()
snpdealnegdens <- snapdealsentvalshift %>%
filter(ave_sentiment < 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "tomato1") + theme(legend.position = "none")
snpdealposdens <- snapdealsentvalshift %>%
filter(ave_sentiment > 0) %>%
ggplot(aes(ave_sentiment)) + geom_density(fill = "turquoise3") + theme(legend.position = "none")
grid.arrange(snpdealnegdens, snpdealposdens, ncol = 1, nrow = 2)
highlight(snapdealsentvalshift)