library(ggplot2) # Make nice plots
library(factoextra) # Make plots for clusters
library(gridExtra) # Stack several plots
library(knitr) # Create html table
library(dplyr) # Data manipulation
library(tidyr) # Tidy Data
reviews_data <- read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/00484/tripadvisor_review.csv')
Rename columns according to attributes
new_names <- c('UserId', 'ArtGalleries', 'DanceClubs', 'JuiceBars',
'Restaurants', 'Museums', 'Resorts', 'ParksPicnics',
'Beaches', 'Theaters', 'Religious')
colnames(reviews_data) <- new_names
Drop the UserId column and scale the values, even though they all fall in the same rating range.
reviews_data <- scale(reviews_data[,-1])
summary(reviews_data)
## ArtGalleries DanceClubs JuiceBars Restaurants
## Min. :-1.6922 Min. :-2.8281 Min. :-1.1201 Min. :-1.3674
## 1st Qu.:-0.6827 1st Qu.:-0.5700 1st Qu.:-0.9426 1st Qu.:-0.4379
## Median :-0.1933 Median :-0.1518 Median :-0.2451 Median :-0.1162
## Mean : 0.0000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.3879 3rd Qu.: 0.4336 3rd Qu.: 0.7091 3rd Qu.: 0.1698
## Max. : 7.1175 Max. : 4.7825 Max. : 3.3054 Max. :10.3939
## Museums Resorts ParksPicnics Beaches
## Min. :-2.01114 Min. :-3.15621 Min. :-2.676 Min. :-3.0185
## 1st Qu.:-0.68522 1st Qu.:-0.70968 1st Qu.:-0.120 1st Qu.:-0.6913
## Median :-0.09084 Median :-0.07951 Median :-0.120 Median :-0.1095
## Mean : 0.00000 Mean : 0.00000 Mean : 0.000 Mean : 0.0000
## 3rd Qu.: 0.59499 3rd Qu.: 0.66187 3rd Qu.:-0.120 3rd Qu.: 0.5450
## Max. : 5.39576 Max. : 3.55323 Max. : 3.714 Max. : 4.0358
## Theaters Religious
## Min. :-2.27474 Min. :-2.05123
## 1st Qu.:-0.71151 1st Qu.:-0.80660
## Median :-0.08074 Median :-0.05982
## Mean : 0.00000 Mean : 0.00000
## 3rd Qu.: 0.52262 3rd Qu.: 0.74919
## Max. : 4.38955 Max. : 2.67837
Trying k = 2
set.seed(44)
kmReviews <- kmeans(reviews_data, centers = 2)
Plot the data points according to the first two principal components that explain most of the variance.
fviz_cluster(kmReviews, data = reviews_data, geom = 'point')
Run k-means for different k values, giving each run 25 random starts in order to get a more stable result. This lets the algorithm try multiple initial configurations and report the best one.
k_values <- seq(2, 10, 1)
set.seed(668822544)
withins_sum <- sapply(k_values, function(k){
kmeans(reviews_data, centers = k, nstart = 25)$tot.withinss
})
“Elbow” method - Pick the k at which the decrease in the within-cluster sum of squares (WCSS) levels off (the “elbow”).
plot(k_values, withins_sum, type='b', xlab="Number of clusters", ylab="Within groups sum of squares", main="WCSS vs K")
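As a cross-check, factoextra can produce the same elbow plot directly with fviz_nbclust; a minimal sketch (k.max and nstart here are illustrative choices, not taken from the run above):
# Elbow plot: total within-cluster sum of squares for k = 1..10
fviz_nbclust(reviews_data, kmeans, method = "wss", k.max = 10, nstart = 25)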
A good choice of k seems to lie between 3 and 6. Since the plot does not show a clear elbow, try k-means with each of these values.
set.seed(668822544)
km3 <- kmeans(reviews_data, centers = 3, nstart = 25)
km4 <- kmeans(reviews_data, centers = 4, nstart = 25)
km5 <- kmeans(reviews_data, centers = 5, nstart = 25)
km6 <- kmeans(reviews_data, centers = 6, nstart = 25)
p3 <- fviz_cluster(km3, reviews_data, geom='point') + ggtitle("3 Clusters")
p4 <- fviz_cluster(km4, reviews_data, geom='point') + ggtitle("4 Clusters")
p5 <- fviz_cluster(km5, reviews_data, geom='point') + ggtitle("5 Clusters")
p6 <- fviz_cluster(km6, reviews_data, geom='point') + ggtitle("6 Clusters")
grid.arrange(p3, p4, p5, p6, ncol = 2)
With k=6 the between-cluster sum of squares is highest, meaning the clusters are the most separated from one another, which is desirable.
btw_df <- data.frame(k = 3:6, betweenss = sapply(list(km3, km4, km5, km6), function(km) km$betweenss))
kable(btw_df)
| k | betweenss |
|---|---|
| 3 | 2699.348 |
| 4 | 3333.904 |
| 5 | 3804.358 |
| 6 | 4184.636 |
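The absolute between-cluster sum of squares always grows with k, so a fairer comparison is the fraction of the total sum of squares it explains; a quick sketch using the fitted objects above:
# Proportion of total sum of squares explained by each clustering
sapply(list(k3 = km3, k4 = km4, k5 = km5, k6 = km6),
       function(km) round(km$betweenss / km$totss, 3))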
However, looking at the decrease in WCSS, k=4 seems a reasonable choice. We don’t want too many or too few clusters for this problem, and there aren’t that many travel categories (just 10).
kable(data.frame(k=3:10, wcss_decrease=round(abs(diff(withins_sum)), 2)))
| k | wcss_decrease |
|---|---|
| 3 | 668.82 |
| 4 | 634.00 |
| 5 | 471.01 |
| 6 | 380.28 |
| 7 | 304.23 |
| 8 | 245.95 |
| 9 | 213.39 |
| 10 | 163.04 |
Checking cluster means for scaled data
km4_centers <- as.data.frame(round(km4$centers, 3))
km4_centers <- km4_centers %>% gather(Activity, Value)
km4_centers$Cluster <- rep(paste0("Cluster", " ", 1:4), 10)
km4_centers <- arrange(km4_centers, Activity)
ggplot(km4_centers, aes(Activity, Value, group=Cluster, fill=Cluster)) +
geom_bar(stat="identity", position = position_dodge()) +
theme(axis.text.x = element_text(hjust = 1, angle = 60)) +
labs(y = "Scaled Mean Rating", fill="") +
ggtitle("Cluster means")
Get the Euclidean distance matrix
distances <- dist(reviews_data, method = "euclidean")
Perform hierarchical clustering with Ward’s method.
reviews.hclust <- hclust(distances, method = "ward.D")
Plot dendrogram
plot(reviews.hclust)
Cut dendrogram/tree to obtain 4 clusters
reviews.hclusters <- cutree(reviews.hclust, k=4)
Split data by cluster
spl <- split(as.data.frame(reviews_data), reviews.hclusters)
hclustDf <- as.data.frame(round(sapply(spl, colMeans), 3))
names(hclustDf) <- c("Cluster1", "Cluster2", "Cluster3", "Cluster4")
hclustDf$Activity <- rownames(hclustDf)
# Delete row names
rownames(hclustDf) <- NULL
# Reorder columns
hclustDf <- hclustDf[,c(5, 1:4)]
hclustDf <- gather(hclustDf, Cluster, Value, -Activity) %>% arrange(Activity)
ggplot(hclustDf, aes(Activity, Value, group=Cluster, fill=Cluster)) +
geom_bar(stat="identity", position = position_dodge()) +
theme(axis.text.x = element_text(hjust = 1, angle = 60)) +
labs(y = "Scaled Mean Rating", fill="") +
ggtitle("Cluster means")
This exploration showed that k-means and hierarchical clustering (or other clustering methods) do not necessarily agree with each other. However, both methods clearly distinguished the group of users most devoted to religious sites and produced clusters that differ noticeably from one another.
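As a quick check of how much the two assignments actually overlap, the k-means and hierarchical labels can be cross-tabulated (cluster numbers are arbitrary, so agreement shows up as one dominant cell per row rather than a clean diagonal):
# Contingency table of k-means (k = 4) vs. hierarchical (k = 4) assignments
table(KMeans = km4$cluster, HClust = reviews.hclusters)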
| Variable | Description |
|---|---|
| IDLink (numeric) | Unique identifier of news items |
| Title (string) | Title of the news item according to the official media sources |
| Headline (string) | Headline of the news item according to the official media sources |
| Source (string) | Original news outlet that published the news item |
| Topic (string) | Query topic used to obtain the items in the official media sources |
| PublishDate (timestamp) | Date and time of the news item's publication |
| SentimentTitle (numeric) | Sentiment score of the text in the news item's title |
| SentimentHeadline (numeric) | Sentiment score of the text in the news item's headline |
| Facebook (numeric) | Final value of the news item's popularity according to the social media source Facebook |
| GooglePlus (numeric) | Final value of the news item's popularity according to the social media source Google+ |
| LinkedIn (numeric) | Final value of the news item's popularity according to the social media source LinkedIn |
library(tm) # Text mining
library(SnowballC) # Word Stemming
library(caret) # Split data into train and test sets
library(lubridate) # Handle dates
library(tidyverse) # Data manipulation, plotting, etc.
library(glmnet) # Lasso regression
library(adabag) # Boosting for classification trees
library(ipred) # Bagging for regression trees
news_data <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/00432/Data/News_Final.csv",
stringsAsFactors = FALSE)
Remove Id column
news_data <- news_data[, -1]
Set the system locale to English for the current session
Sys.setlocale("LC_ALL", "C")
## [1] "C"
Take smaller subset/sample from original data file.
set.seed(441110560)
index <- sample(nrow(news_data), nrow(news_data)*0.25, replace = FALSE)
news_data <- news_data[index,]
Parse PublishDate, convert Topic and Source to factors, and add a WeekDay variable.
news_data <- mutate(news_data,
                    PublishDate = ymd_hms(PublishDate),
                    Topic = as.factor(Topic),
                    Source = as.factor(Source),
                    WeekDay = weekdays(PublishDate))
Convert WeekDay to factor.
news_data$WeekDay <- as.factor(news_data$WeekDay)
news_data <- subset(news_data, year(PublishDate) %in% 2015:2016)
Sentiment title distribution by day of the week - the day of the week does not seem to influence the news sentiment.
news_data %>% ggplot(aes(x = SentimentTitle, group = WeekDay, fill = WeekDay)) +
geom_density(position = "stack", alpha = 0.4)
news_data %>% ggplot(aes(x = SentimentHeadline, group = WeekDay, fill = WeekDay)) +
geom_density(position = "stack", alpha = 0.4)
Helper function makeCorpus to create the corpus and perform basic cleaning.
makeCorpus <- function(var){
# Convert var to corpus
corpus <- VCorpus(VectorSource(var))
# Convert documents to lowercase
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove stop words (words like the, I, etc)
corpus <- tm_map(corpus, removeWords, stopwords('english'))
# Stem documents (represent words with different endings as the same word)
corpus <- tm_map(corpus, stemDocument)
# Return corpus
corpus
}
Corpus for Title variable
titleCorpus <- makeCorpus(news_data$Title)
Create word frequency matrix
frequencies <- DocumentTermMatrix(titleCorpus)
frequencies
## <<DocumentTermMatrix (documents: 23309, terms: 13617)>>
## Non-/sparse entries: 151078/317247575
## Sparsity : 100%
## Maximal term length: 26
## Weighting : term frequency (tf)
Inspect matrix
inspect(frequencies[10:14, 2000:2003])
## <<DocumentTermMatrix (documents: 5, terms: 4)>>
## Non-/sparse entries: 0/20
## Sparsity : 100%
## Maximal term length: 5
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs bmws bnm boao board
## 10 0 0 0 0
## 11 0 0 0 0
## 12 0 0 0 0
## 13 0 0 0 0
## 14 0 0 0 0
Take a look at the most frequent words (that appear at least 500 times).
findFreqTerms(frequencies, lowfreq = 500)
## [1] "2016" "china" "econom" "economi" "global" "microsoft"
## [7] "new" "obama" "palestin" "presid" "say" "will"
## [13] "window"
Not surprisingly, the words that appear most often are related to the news topics.
table(news_data$Topic)
##
## economy microsoft obama palestine
## 8557 5479 7101 2172
Get rid of sparse terms that don’t appear very often - keep only terms present in at least 1% of the news titles.
sparse.title <- removeSparseTerms(frequencies, 0.99)
sparse.title
## <<DocumentTermMatrix (documents: 23309, terms: 59)>>
## Non-/sparse entries: 40367/1334864
## Sparsity : 97%
## Maximal term length: 11
## Weighting : term frequency (tf)
Convert to data frame (df)
sparseTitles <- as.data.frame(as.matrix(sparse.title))
Change names to conform to R naming standards
names(sparseTitles) <- make.names(names(sparseTitles))
Add dependent variable
sparseTitles <- sparseTitles %>%
mutate(SentimentTitle = news_data$SentimentTitle)
Since this process will be repeated for the Headline column, two more helper functions are created:
getDtm: Create the document-term matrix (dtm) and drop sparse terms.
getDtm <- function(corpus, sparseness = .99){
  removeSparseTerms(DocumentTermMatrix(corpus), sparseness)
}
asDf: Convert the dtm to a data frame (df) and append the outcome.
asDf <- function(dtm, outcome){
df <- as.data.frame(as.matrix(dtm))
# Friendly names
colnames(df) <- make.names(colnames(df))
# Append outcome
df[, c(outcome)] <- news_data[, c(outcome)]
df
}
Split data: 60% to train set and the rest for testing.
set.seed(1203160122)
train_index.title <- createDataPartition(sparseTitles$SentimentTitle, times = 1, p = 0.6, list = FALSE)
trainTitles <- sparseTitles[train_index.title,]
testTitles <- sparseTitles[-train_index.title,]
Pass an x matrix of predictors and a y vector because glmnet does not accept a data frame as input. Setting alpha = 1 applies the lasso penalty.
lasso_fit <- glmnet(x = as.matrix(trainTitles[, -c(which(colnames(trainTitles) == 'SentimentTitle'))]),
y = trainTitles$SentimentTitle,
alpha = 1)
Lambda = 0.01 - Lower lambda values apply less shrinkage, so more coefficients stay non-zero and they are larger.
coef(lasso_fit, s = 0.01)
## 60 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.005796573
## X2016 .
## app .
## back 0.003143269
## bank .
## barack .
## boost .
## brexit .
## busi .
## call .
## can .
## china 0.021419408
## clinton .
## cloud .
## court 0.144839942
## data .
## econom .
## economi .
## first 0.019540219
## get .
## global .
## grow .
## growth -0.073788373
## gun .
## help .
## hous .
## india .
## israel .
## job .
## make .
## market .
## may .
## meet .
## microsoft .
## new .
## now .
## obama .
## one .
## palestin .
## palestinian .
## plan .
## presid .
## rate .
## report .
## say .
## share .
## show .
## state .
## support -0.014206188
## surfac 0.021327171
## take .
## trump .
## updat .
## visit .
## white .
## will .
## window .
## world .
## xbox .
## year .
Lambda = 5 - Increasing lambda shrinks the coefficients; here everything except the intercept is set to zero.
coef(lasso_fit, s = 5)
## 60 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.004791821
## X2016 .
## app .
## back .
## bank .
## barack .
## boost .
## brexit .
## busi .
## call .
## can .
## china .
## clinton .
## cloud .
## court .
## data .
## econom .
## economi .
## first .
## get .
## global .
## grow .
## growth .
## gun .
## help .
## hous .
## india .
## israel .
## job .
## make .
## market .
## may .
## meet .
## microsoft .
## new .
## now .
## obama .
## one .
## palestin .
## palestinian .
## plan .
## presid .
## rate .
## report .
## say .
## share .
## show .
## state .
## support .
## surfac .
## take .
## trump .
## updat .
## visit .
## white .
## will .
## window .
## world .
## xbox .
## year .
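To see how every coefficient shrinks toward zero as lambda grows, the full regularization path can be plotted; a minimal sketch using glmnet's built-in plot method:
# Coefficient paths: each curve is one term's coefficient as a function of log(lambda)
plot(lasso_fit, xvar = "lambda", label = TRUE)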
Rather than picking lambda by hand, use 5-fold cross-validation to select it:
cv_lasso.title <- cv.glmnet(x = as.matrix(trainTitles[, -c(which(colnames(trainTitles) == 'SentimentTitle'))]),
y = trainTitles$SentimentTitle,
alpha = 1,
nfolds = 5)
plot(cv_lasso.title)
With this lambda value the training error may be slightly higher, but the model generalizes better to the test set (better prediction accuracy).
# Best lambda (or s) that minimized error on validation set
cv_lasso.title$lambda.min
## [1] 0.0004139912
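cv.glmnet also reports lambda.1se, the largest lambda whose cross-validated error is within one standard error of the minimum; a more conservative (sparser) alternative if a simpler model is preferred:
# More regularized alternative to lambda.min
cv_lasso.title$lambda.1se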
Make predictions on test set for the selected lambda
preds.title <- predict(lasso_fit,
as.matrix(testTitles[, -c(which(colnames(testTitles) == 'SentimentTitle'))]),
s = cv_lasso.title$lambda.min)
Another helper function to calculate RMSE.
RMSE <- function(pred, label){
sqrt(mean((pred - label)^2))
}
RMSE on train set = 0.12667
preds.title.train <- predict(lasso_fit,
as.matrix(trainTitles[, -c(which(colnames(trainTitles) == 'SentimentTitle'))]),
s = cv_lasso.title$lambda.min)
RMSE(as.vector(preds.title.train), trainTitles$SentimentTitle)
## [1] 0.12667
RMSE on testing = 0.1259853
RMSE(as.vector(preds.title), testTitles$SentimentTitle)
## [1] 0.1259853
Fit a tree for each bootstrap sample, and then aggregate the predicted values from all these trees.
bagged.title <- bagging(SentimentTitle ~ ., data = trainTitles, nbagg=100)
bagged.title
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = SentimentTitle ~ ., data = trainTitles,
## nbagg = 100)
RMSE on train set = 0.131326
RMSE(predict(bagged.title), trainTitles$SentimentTitle)
## [1] 0.131326
Predict on test set
bag.pred.title <- predict(bagged.title, newdata = testTitles)
RMSE on test set = 0.1303784
RMSE(bag.pred.title, testTitles$SentimentTitle)
## [1] 0.1303784
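For a side-by-side view of the two models on the title-sentiment task, the RMSE values computed above can be collected into a small table (values copied from the results above):
title_rmse <- data.frame(Model = c("Lasso", "Bagged trees"),
                         TrainRMSE = c(0.12667, 0.131326),
                         TestRMSE = c(0.1259853, 0.1303784))
kable(title_rmse)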
headlineCorpus <- makeCorpus(news_data$Headline)
sparse.headline <- getDtm(headlineCorpus)
sparseHeadlines <- asDf(sparse.headline, "SentimentHeadline")
Split data
set.seed(1203160122)
train_index.headline <- createDataPartition(sparseHeadlines$SentimentHeadline, times = 1, p = 0.6, list = FALSE)
trainHeadlines <- sparseHeadlines[train_index.headline,]
testHeadlines <- sparseHeadlines[-train_index.headline,]
Fit the lasso for the Headline sentiment, again using 5-fold cross-validation to select lambda.
lasso.headlines <- cv.glmnet(x = as.matrix(trainHeadlines[, -c(which(colnames(trainHeadlines) == "SentimentHeadline"))]),
                             y = trainHeadlines$SentimentHeadline,
                             alpha = 1,
                             nfolds = 5)
RMSE train = 0.1300101
pred.headlines.train <- predict(lasso.headlines,
as.matrix(trainHeadlines[, -c(which(colnames(trainHeadlines) == "SentimentHeadline"))]),
s = lasso.headlines$lambda.min)
RMSE(pred.headlines.train, trainHeadlines$SentimentHeadline)
## [1] 0.1300101
RMSE test = 0.1322504
pred.headlines.test <- predict(lasso.headlines,
as.matrix(testHeadlines[, -c(which(colnames(testHeadlines) == 'SentimentHeadline'))]),
s = lasso.headlines$lambda.min)
RMSE(pred.headlines.test, testHeadlines$SentimentHeadline)
## [1] 0.1322504
Remove outcomes
sparseTitles$SentimentTitle <- NULL
sparseHeadlines$SentimentHeadline <- NULL
Change the column names of the sparse data frames for both titles and headlines.
colnames(sparseTitles) <- paste0("T", colnames(sparseTitles))
colnames(sparseHeadlines) <- paste0("H", colnames(sparseHeadlines))
Join the Title and Headline sparse matrices to the data as features.
final_data <- cbind(news_data, sparseTitles, sparseHeadlines)
final_data <- select(final_data, -c(Title, Headline, Source, PublishDate, WeekDay, SentimentTitle, SentimentHeadline))
Split data
set.seed(882)
train_index <- createDataPartition(final_data$Topic, times = 1, p = 0.6, list = FALSE)
final_data.train <- final_data[train_index,]
final_data.test <- final_data[-train_index,]
knn.topic <- knn3(Topic ~., data=final_data.train, k=10)
Since false positives and false negatives carry no different cost here (we care about specificity and sensitivity equally), accuracy serves as a good measure of model performance.
confusionMatrix(predict(knn.topic, final_data.train, type="class"),
final_data.train$Topic)
## Confusion Matrix and Statistics
##
## Reference
## Prediction economy microsoft obama palestine
## economy 4520 234 159 35
## microsoft 198 2841 58 11
## obama 336 141 4026 191
## palestine 81 72 18 1067
##
## Overall Statistics
##
## Accuracy : 0.8903
## 95% CI : (0.885, 0.8955)
## No Information Rate : 0.3671
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.845
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Statistics by Class:
##
## Class: economy Class: microsoft Class: obama
## Sensitivity 0.8802 0.8641 0.9448
## Specificity 0.9517 0.9750 0.9313
## Pos Pred Value 0.9135 0.9141 0.8577
## Neg Pred Value 0.9320 0.9589 0.9747
## Prevalence 0.3671 0.2351 0.3046
## Detection Rate 0.3231 0.2031 0.2878
## Detection Prevalence 0.3537 0.2222 0.3356
## Balanced Accuracy 0.9159 0.9195 0.9381
## Class: palestine
## Sensitivity 0.81825
## Specificity 0.98652
## Pos Pred Value 0.86187
## Neg Pred Value 0.98141
## Prevalence 0.09322
## Detection Rate 0.07628
## Detection Prevalence 0.08850
## Balanced Accuracy 0.90238
Test set accuracy = 0.8702929
confusionMatrix(predict(knn.topic, final_data.test, type="class"),
final_data.test$Topic)$overall["Accuracy"]
## Accuracy
## 0.8702929
Using 90 trees and bootstrapped samples.
boost.topic <- boosting(Topic ~ ., data = final_data.train, boos = TRUE, mfinal = 90)
Plotting variable importance according to the boosting model - whether or not the word Obama appears in the title (Tobama) strongly influences the topic assignment.
importanceplot(boost.topic)
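The same information is available numerically; adabag stores the relative importance of each predictor in the fitted object, so the top terms can be listed directly:
# Ten most important predictors according to the boosting model
sort(boost.topic$importance, decreasing = TRUE)[1:10]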
Training Accuracy = 0.993137
pred.boost.topic <- predict(boost.topic, final_data.train)
confusionMatrix(factor(pred.boost.topic$class), final_data.train$Topic)$overall["Accuracy"]
## Accuracy
## 0.993137
Testing Accuracy = 0.9897007
pred.boost.topic <- predict(boost.topic, final_data.test)
confusionMatrix(factor(pred.boost.topic$class), final_data.test$Topic)$overall["Accuracy"]
## Accuracy
## 0.9897007
In general, both models reach relatively high accuracy. Boosting clearly outperformed KNN because it combines the weighted votes of many trees, each of which focuses on the observations the previous trees misclassified.
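Collecting the accuracies reported above into one table makes the comparison explicit (values copied from the results above):
topic_acc <- data.frame(Model = c("KNN (k = 10)", "Boosting (90 trees)"),
                        TrainAccuracy = c(0.8903, 0.993137),
                        TestAccuracy = c(0.8702929, 0.9897007))
kable(topic_acc)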