Introduction

The purpose of this analysis is to use three data sets, gathered from different text sources, to build a text prediction algorithm. The sources of the text were news, blogs, and Twitter.

First, the data was read into R by downloading it directly to a temporary file. This was a fairly straightforward step, although rather time consuming because of the size of the files.
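
The code in this report assumes that the usual tidy text packages have been loaded in a setup chunk that is not shown here; a minimal sketch of that setup:

# assumed setup (the original setup chunk is not shown)
library(tidyverse)   # dplyr, ggplot2, tidyr, stringr, purrr, tibble, forcats
library(tidytext)    # unnest_tokens, stop_words, get_sentiments, bind_tf_idf
library(igraph)      # graph_from_data_frame for the bigram network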

temp <- tempfile()            # makes a temp file to hold the SwiftKey zip archive
download.file(Corpus, temp)   # downloads the zip file (URL stored in Corpus) to the temp file
unzip(temp)                   # extracts the archive into the working directory
file_names <- list.files("./final", recursive = TRUE)  # lists the extracted text files
#knitr::opts_knit$set(root.dir = "./Coursera-SwiftKey/final")

# reads in the lines of data from each source as a character vector
blogData <-readLines("./final/en_US/en_US.blogs.txt")
newsData <-readLines("./final/en_US/en_US.news.txt")
## Warning in readLines("./final/en_US/en_US.news.txt"): incomplete final line
## found on './final/en_US/en_US.news.txt'
twitterData<-readLines("./final/en_US/en_US.twitter.txt")
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 167155 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 268547 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1274086 appears to
## contain an embedded nul
## Warning in readLines("./final/en_US/en_US.twitter.txt"): line 1759032 appears to
## contain an embedded nul
# Convert each source to a tibble and name the text column "word" for consistency
#########



newsData_c <-as_tibble(newsData)
colnames(newsData_c) <-c("word")
twitterData_c <-as_tibble(twitterData)
colnames(twitterData_c) <-c("word")
blogData_c <-as_tibble(blogData)
colnames(blogData_c) <-c("word")

## Preprocessing

Preprocessing began by removing all of the stop words. This step was done first because removing apostrophes from words can be cumbersome and not always effective, and removing special characters has different constraints depending on which platform is being used.
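
As a toy illustration (not part of the original analysis) of what anti_join does with tidytext's stop_words lexicon, rows whose word column matches a stop word are dropped:

# toy example: anti_join keeps only rows whose "word" is not in the stop_words lexicon
tibble(word = c("the", "prediction", "and", "algorithm")) %>%
  anti_join(stop_words, by = "word")
# returns a tibble containing only "prediction" and "algorithm"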

########################
rm(blogData, twitterData)   # free memory; the raw newsData vector is kept for the n-gram step later
newsData_c <-newsData_c %>%
  anti_join(stop_words)
## Joining, by = "word"
twitterData_c <-twitterData_c%>%
  anti_join(stop_words)
## Joining, by = "word"
blogData_c<-blogData_c%>%
  anti_join(stop_words)
## Joining, by = "word"

Next, each corpus was tokenized, which removes the punctuation and transforms any capital letters to lowercase. Then any residual punctuation not removed with the stop words was cleaned away, and last, all of the digits were removed.

newsData_c <- unnest_tokens(tbl = newsData_c, input = "word", output = "word")

twitterData_c <- unnest_tokens(tbl = twitterData_c, input = "word", output = "word")

blogData_c <- unnest_tokens(tbl = blogData_c, input = "word", output = "word")


newsData_c$word<- str_remove_all(newsData_c$word, "[^[:alnum:]]")
twitterData_c$word<-str_remove_all(twitterData_c$word, "[^[:alnum:]]")
blogData_c$word<-str_remove_all(blogData_c$word, "[^[:alnum:]]")


newsData_c$word<-str_remove_all( newsData_c$word, "\\d")
twitterData_c$word<-str_remove_all( twitterData_c$word, "\\d")
blogData_c$word<-str_remove_all( blogData_c$word, "\\d")
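
As a quick sanity check (illustrative only, not in the original analysis), running a single made-up line through the same steps shows the tokenizer lowercasing the text and stripping punctuation, with the digit removal leaving an empty string behind:

# illustrative check on a made-up line (hypothetical input, not from the corpora)
tibble(word = "The Quick, Brown Fox #123!") %>%
  unnest_tokens(output = "word", input = "word") %>%
  mutate(word = str_remove_all(word, "[^[:alnum:]]"),
         word = str_remove_all(word, "\\d"))
# tokens: "the" "quick" "brown" "fox" "" (the numeric token is reduced to an empty string)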

To summarize the preprocessing: the stop words and foreign characters were taken out, the digits were removed, and the tibbles were combined into one data frame by source type. Stop words are defined as frequently seen words that add no meaning to the sentiment of the text; examples include articles, pronouns, and prepositions. Then any numbers and foreign characters were removed, and finally the words were "stemmed" by stripping endings like -ing, -ed, -s, and -es.
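
The stemming step is not shown in the code above; a minimal sketch using the SnowballC stemmer (an assumption about which stemmer is used) would look like this:

library(SnowballC)                       # Snowball/Porter stemmer (assumed; not shown in the original code)
newsData_c$word    <- wordStem(newsData_c$word,    language = "english")
twitterData_c$word <- wordStem(twitterData_c$word, language = "english")
blogData_c$word    <- wordStem(blogData_c$word,    language = "english")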

## Word Frequency

Next, the word frequency was examined by source type (blog, news, Twitter).

#########Combine all sources by type#############
AllData <- bind_rows(mutate(blogData_c, source_type = "Blog"),
           mutate(newsData_c, source_type = "News"),
           mutate(twitterData_c, source_type = "Twitter"))

### count occurrences of words and filter out one- and two-letter words
AllData$word <-str_trim(AllData$word)
AllData$word <-str_remove_all(AllData$word,"[^[:alnum:]]" )
AllData$word <-str_remove_all(AllData$word ,"\\w*[Ac]+\\w*\\s*")


AllData1 <-AllData%>%
  anti_join(stop_words)%>%
  group_by(source_type)%>%
  count(word, sort=TRUE)
## Joining, by = "word"
AllData1 <-AllData1%>%
  filter(nchar(word)>2)%>%
  select(source_type, word, n)%>%
  filter(n>250)

frequency <- AllData1%>% 
   count(source_type, word) %>%
   group_by(source_type) %>%
   mutate(proportion = n / sum(n)) %>% 
   select(-n) %>% 
   pivot_wider(names_from = source_type, values_from = proportion) %>%
   pivot_longer(`News`:`Twitter`,
                names_to = "source_type", values_to = "proportion")
   

colnames(frequency) <-c("Word", "Blog", "Source_Type", "Proportion")
library(scales)

###############graph freq##################
# expect a warning about rows with missing values being removed
g1 <-ggplot(frequency, aes(x = Proportion, y = Blog, fill = Source_Type, color = abs(Blog - Proportion))) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 1.5, width = 0.5, height = 0.5) +
  geom_text(aes(label = Word),color="black", check_overlap = TRUE, vjust = 2.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits=c(0.5, 1.0), low = "darkslategray4", high = "gray75") +
  facet_wrap(~Source_Type, ncol = 2) +
  theme(legend.position="none")





g1
## Warning: Removed 9735 rows containing missing values (geom_point).
## Warning: Removed 9735 rows containing missing values (geom_text).

## Sentiment Analysis

With sentiment analysis, we examine the negative and positive connotations of the words used, grouped by source type.
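
The bing lexicon from tidytext labels individual words as positive or negative; for example (a quick illustration, not part of the original code):

# quick look at the bing lexicon used below
get_sentiments("bing") %>%
  filter(word %in% c("happy", "terrible"))
# "happy" is labeled positive, "terrible" is labeled negative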

#################
colnames(AllData1) <-c( "source_type","word", "n")
AllSentiment<-AllData1%>%
  inner_join(get_sentiments("bing")) %>%
  count(word, index = n, sentiment) %>%
  group_by(source_type) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>% 
  mutate(sentiment = positive - negative)
## Joining, by = "word"
 ggplot(AllSentiment, aes(index, sentiment, fill=source_type))+
  geom_col(show.legend = FALSE) +
   xlim(250, 1000)+
  facet_wrap(~source_type, nrow=3, scales="free") +
  geom_hline(yintercept=0, color = 'black')
## Warning: Removed 537 rows containing missing values (position_stack).

## Joining, by = "source_type"

Anything above zero is a positive sentiment and anything below zero is a negative sentiment. The percent of negative sentiment was actually lower in the news group (42%) than in the blog and Twitter groups (roughly 55-56%).
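
A quick way to check those percentages (a sketch, assuming the same bing join as above; not part of the original code) is to compute the share of negative word counts within each source:

# share of negative sentiment words by source (sketch)
AllData1 %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  group_by(source_type, sentiment) %>%
  summarize(total = sum(n), .groups = "drop_last") %>%
  mutate(percent = 100 * total / sum(total)) %>%
  filter(sentiment == "negative")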

## Term Frequency and Inverse Document Frequency

Term frequency-inverse document frequency (tf-idf) highlights words that are frequent within one source but rare across the other sources.

  
TotalBySource <-AllData1 %>%
              group_by(source_type)%>%
             summarize(Total=sum(n))


CompleteData <-left_join(AllData1, TotalBySource)
## Joining, by = "source_type"
g3 <-ggplot(CompleteData, aes(n/Total, fill = source_type)) +
  geom_histogram(show.legend = FALSE, bins = 100) +
  xlim(0.001, 0.005) +
  facet_wrap(~source_type, nrow = 3, scales = "free")


g3
## Warning: Removed 10771 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing missing values (geom_bar).

CompleteDataByRank<- CompleteData %>% 
  group_by(source_type) %>% 
  mutate(rank = row_number(), 
         term_frequency = n/Total) %>%
  ungroup()

rank_subset <- CompleteDataByRank %>% 
  filter(rank < 500,
         rank > 10)

lm_line <-lm(log10(term_frequency) ~ log10(rank), data = rank_subset)

CompleteDataByRank %>% 
  ggplot(aes(rank, term_frequency, color = source_type)) +
  geom_abline(intercept = lm_line$coef[1], slope = lm_line$coef[2], 
              color = "gray50", linetype = 2) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) + 
  scale_x_log10() +
  scale_y_log10()
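
The dashed line is the least-squares fit on the log-log scale; printing its coefficients (a small addition, not in the original code) shows the exponent of the fitted power law, which Zipf's law predicts to be close to -1:

coef(lm_line)   # the slope is the fitted Zipf exponent; values near -1 indicate a classic Zipf distribution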

CompleteData_tf_itf<-CompleteData%>%
  bind_tf_idf(word, source_type, n)


CompleteData_tf_itf<-CompleteData_tf_itf%>%
select(-Total) %>%
  arrange(desc(tf_idf))
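
For reference, this is what bind_tf_idf computes under the hood (a manual sketch of the standard definition: tf is the word's share of its source, idf is the log of the number of sources divided by the number of sources containing the word):

# manual tf-idf for comparison (sketch; bind_tf_idf does this internally)
n_sources <- n_distinct(CompleteData$source_type)             # three source types
manual_tf_idf <- CompleteData %>%
  group_by(word) %>%
  mutate(docs_with_word = n_distinct(source_type)) %>%        # sources that contain the word
  ungroup() %>%
  mutate(tf     = n / Total,                                  # term frequency within the source
         idf    = log(n_sources / docs_with_word),            # inverse document frequency
         tf_idf = tf * idf)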




CompleteData_tf_itf %>%
  group_by(source_type) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = source_type)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~source_type, nrow  = 3, scales = "free") 

“An n-gram model is a type of probabilistic language model for predicting the next item in such a sequence in the form of a (n − 1)–order Markov model.[2] n-gram models are now widely used in probability, communication theory, computational linguistics (for instance, statistical natural language processing), computational biology (for instance, biological sequence analysis), and data compression. Two benefits of n-gram models (and algorithms that use them) are simplicity and scalability – with larger n, a model can store more context with a well-understood space–time tradeoff, enabling small experiments to scale up efficiently.” (quoted from the Wikipedia article on n-grams)

################ News N-gram###############


newsTibble <-as_tibble(newsData)
newsTibble$value<-iconv(newsTibble$value, from = '', to = 'ASCII//TRANSLIT')

newsTibble$value<-str_replace_all(newsTibble$value, "[^[:alnum:]]"," ")
newsTibble$value<-tolower(newsTibble$value )
newsTibble$value<-str_remove_all(newsTibble$value,"\\d")

set.seed(12345)
newsData_ngram<-newsTibble%>%
  unnest_tokens(bigram, value, token = "ngrams", n = 2)
newsNgramCount<-newsData_ngram%>%
  count(bigram, sort = TRUE)
newsNgramCount<-newsNgramCount%>%
  separate(bigram, c("word1", "word2"), sep = " ")


newsbigrams_filtered <- newsNgramCount %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
newsbigram_counts <- newsbigrams_filtered %>%
  count(word1, word2, sort = TRUE)

#install.packages("igraph")
newsbigram_graph <-newsbigram_counts %>%
  slice(-c(1:200))%>%
  filter(nchar(word1)>3)%>%
  graph_from_data_frame()
newsbigram_graph
## IGRAPH 61a013e DN-- 61652 318453 -- 
## + attr: name (v/c), n (e/n)
## + edges from 61a013e (vertex names):
##  [1] abbreviated->length      abbreviated->monticello  abbreviated->retrial    
##  [4] abby       ->chapel      abby       ->frerotte    abby       ->maas       
##  [7] abby       ->martone     abby       ->raising     abby       ->wambach    
## [10] abdel      ->maged       abdenour   ->wrote       abdi       ->abdirahman 
## [13] abdomen    ->protuberant abdominal  ->cramping    abdominal  ->cramps     
## [16] abdominal  ->injury      abdominal  ->pain        abdominal  ->strain     
## [19] abdominal  ->ultrasounds abdul      ->aziz        abdul      ->boyd       
## [22] abdul      ->ghany       abdul      ->haaq        abdul      ->isn        
## + ... omitted several edges
#install.packages("ggraph")

In order to do a more thorough analysis, text features will be added to the n-gram and Markov chain models to better understand how to perform predictive text in the Shiny app.
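
As a first step toward that predictor, a minimal sketch (an illustration, not the final model) of a bigram next-word lookup built from the newsbigram_counts table could look like this:

# hypothetical helper: return the most frequent continuations of a word (first-order Markov prediction)
predict_next_word <- function(current_word, bigram_counts, top_n = 3) {
  bigram_counts %>%
    filter(word1 == tolower(current_word)) %>%
    mutate(probability = n / sum(n)) %>%        # maximum-likelihood estimate of P(word2 | word1)
    arrange(desc(probability)) %>%
    slice_head(n = top_n) %>%
    pull(word2)
}

predict_next_word("abdominal", newsbigram_counts)   # suggests the most frequent words following "abdominal"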