searchkeys <- read.csv("~/Downloads/Unsaved report (2).csv", stringsAsFactors=FALSE)
df <- read.csv("~/Downloads/AdWords Analysis from Aug 1 - Aug 14, 2016 - Sheet4.csv")
names(searchkeys)[1] <- "keyword"
## Summarize cost by country
library(dplyr)
cost_by_country <- df %>%
  group_by(Country.Territory) %>%
  summarise(Cost = sum(Cost))
## Summarize clicks by country
clicks_by_country <- df %>%
  group_by(Country.Territory) %>%
  summarise(Clicks = sum(Clicks))
## Summarize cost by day
cost_by_day <- df %>%
  group_by(Day.of.week) %>%
  summarise(Cost = sum(Cost))
## Summarize clicks by day
clicks_by_day <- df %>%
  group_by(Day.of.week) %>%
  summarise(Clicks = sum(Clicks))
write.csv(cost_by_country, file = "~/Downloads/CostByCountry.csv", row.names = FALSE)
write.csv(clicks_by_country, file = "~/Downloads/ClicksByCountry.csv", row.names = FALSE)
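Cost and clicks can also be summarised in a single pass per grouping variable, which keeps both metrics in one data frame; a minimal sketch using the same column names as above:
## Cost and clicks by country in one summarise() call
cost_clicks_by_country <- df %>%
  group_by(Country.Territory) %>%
  summarise(Cost = sum(Cost), Clicks = sum(Clicks))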
Text analytics of the keywords used in the search queries that led to a click on the Foundational Research Institute website
#install.packages("tm")
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
#install.packages("SnowballC")
library(SnowballC)
# Create corpus
corpus = Corpus(VectorSource(searchkeys$keyword))
# Look at corpus
corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 435
corpus[[1]]$content
## [1] "moral disagreement"
# Convert to lower-case
corpus = tm_map(corpus, tolower)
corpus = tm_map(corpus, PlainTextDocument) # re-wrap documents as plain text after tolower
# Remove punctuation
corpus = tm_map(corpus, removePunctuation)
corpus[[1]]$content
## [1] "moral disagreement"
# Look at stop words
stopwords("english")[1:10]
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
# Remove stopwords
corpus = tm_map(corpus, removeWords, c(stopwords("english")))
corpus[[1]]$content
## [1] "moral disagreement"
# Stem words
corpus = tm_map(corpus, stemDocument)
corpus[[1]]$content
## [1] "moral disagr"
# Create matrix
frequencies = DocumentTermMatrix(corpus)
frequencies
## <<DocumentTermMatrix (documents: 435, terms: 220)>>
## Non-/sparse entries: 975/94725
## Sparsity : 99%
## Maximal term length: 14
## Weighting : term frequency (tf)
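The reported sparsity is simply the share of empty cells in the matrix; a quick check with the numbers above:
# 94725 empty cells out of 435 * 220 = 95700 cells -> ~99% sparse
94725 / (435 * 220)
## [1] 0.9898119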
# Look at matrix
inspect(frequencies[10:15,50:55])
## <<DocumentTermMatrix (documents: 6, terms: 6)>>
## Non-/sparse entries: 0/36
## Sparsity : 100%
## Maximal term length: 7
## Weighting : term frequency (tf)
##
## Terms
## Docs crisi crucial cruel cruelti cultur danger
## character(0) 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0
## character(0) 0 0 0 0 0 0
# Check for sparsity: list the terms that appear at least 5 times across all queries
findFreqTerms(frequencies, lowfreq=5)
## [1] "abus" "altruism" "anim" "articl" "artifici"
## [6] "chariti" "chicken" "circus" "cooper" "cruel"
## [11] "cruelti" "essay" "ethic" "factori" "farm"
## [16] "futur" "happi" "help" "intellig" "intern"
## [21] "life" "moral" "natur" "philosophi" "protect"
## [26] "rescu" "research" "risk" "save" "scienc"
## [31] "societi" "stop" "suffer" "technolog" "test"
## [36] "utilitarian" "valu" "vegan" "welfar" "wild"
## [41] "wildlif"
# Remove sparse terms, i.e. terms that occur in less than roughly 1% of the 435 search queries
sparse = removeSparseTerms(frequencies, .99)
sparse
## <<DocumentTermMatrix (documents: 435, terms: 41)>>
## Non-/sparse entries: 687/17148
## Sparsity : 96%
## Maximal term length: 11
## Weighting : term frequency (tf)
# Convert to a data frame
keywordsSparse = as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(keywordsSparse) = make.names(colnames(keywordsSparse))
rownames(keywordsSparse) <- 1:nrow(keywordsSparse)
keywordsFreq <- apply(keywordsSparse, 2, sum)
keywordsFreq <- data.frame(keywords = names(keywordsFreq),n = keywordsFreq)
Plot frequency
keywordsFreq <- transform(keywordsFreq, keywords = reorder(keywords, n))
ggplot(data = keywordsFreq, aes(y = n, x = keywords)) +
  geom_bar(stat = "identity", color = "black", fill = "green") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 100, 25)) +
  geom_text(aes(keywords, n, label = n), size = 3, show.legend = FALSE, hjust = -.35) +
  ggtitle("Most frequent keywords in Google search queries\nthat led to ad clicks")
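If the chart should sit next to the exported CSVs, ggsave() writes the most recent ggplot to disk; a sketch (the file name and dimensions are assumptions):
# Save the last plot as a PNG alongside the CSV exports (illustrative path and size)
ggsave("~/Downloads/KeywordFrequency.png", width = 8, height = 6)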

Set the sparsity threshold to 95%, keeping only terms that appear in at least roughly 5% of queries
sparse = removeSparseTerms(frequencies, 0.95)
sparse
## <<DocumentTermMatrix (documents: 435, terms: 7)>>
## Non-/sparse entries: 358/2687
## Sparsity : 88%
## Maximal term length: 8
## Weighting : term frequency (tf)
# Convert to a data frame
keywordsSparse = as.data.frame(as.matrix(sparse))
# Make all variable names R-friendly
colnames(keywordsSparse) = make.names(colnames(keywordsSparse))
rownames(keywordsSparse) <- 1:nrow(keywordsSparse)
keywordsFreq <- apply(keywordsSparse, 2, sum)
keywordsFreq <- data.frame(keywords = names(keywordsFreq),n = keywordsFreq)
Plot frequency
keywordsFreq <- transform(keywordsFreq, keywords = reorder(keywords, n))
ggplot(data = keywordsFreq, aes(y = n, x = keywords)) +
  geom_bar(stat = "identity", color = "black", fill = "green") +
  coord_flip() +
  scale_y_continuous(breaks = seq(0, 100, 25)) +
  geom_text(aes(keywords, n, label = n), size = 3, show.legend = FALSE, hjust = -.35) +
  ggtitle("Most frequent keywords in Google search queries\nthat led to ad clicks")

Create a fancy word cloud
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
m <- TermDocumentMatrix(corpus, control = list(minWordLength = 1))
m <- as.matrix(m)
# calculate the frequency of words
v <- sort(rowSums(m), decreasing=TRUE)
myNames <- names(v)
# Replace two stemmed terms with readable full words for the cloud
a <- which(myNames == "anim")
myNames[a] <- "animal"
i <- which(myNames == "intellig")
myNames[i] <- "intelligence"
d <- data.frame(word=myNames, freq=v)
pal2 <- brewer.pal(8, "Dark2")
wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 1,
          max.words = Inf, random.order = FALSE, rot.per = .15, colors = pal2)
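The cloud layout is partly random (e.g. which words get rotated), so the figure changes between runs; seeding the RNG before the call makes it reproducible. A sketch:
# Seed the RNG for a reproducible word-cloud layout, then redraw
set.seed(123)
wordcloud(d$word, d$freq, scale = c(8, .2), min.freq = 1,
          max.words = Inf, random.order = FALSE, rot.per = .15, colors = pal2)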
