The goal of this first analysis is to see which words are the most used in posts and articles.
Import all the libraries needed.
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(NLP)
library(ngram)
library(tm)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
Import the data needed.
The data comes from news articles, blogs, and tweets.
# Directory that holds the en_US text files (Windows-style path).
fileName <- "C:\\Users\\rache\\Downloads\\Coursera-SwiftKey\\final\\en_US"
Read only the first 1000 lines of each file, because the full files are too big.
# Read the first 1000 lines of each source file. Each connection is closed
# right after it has been read, so no file handle is leaked when `con` is
# reassigned to the next file.
con <- file(paste0(fileName, "\\en_US.blogs.txt"), open = "r")
lineblogus <- readLines(con, 1000)
close(con)

con <- file(paste0(fileName, "\\en_US.news.txt"), open = "r")
linenewsus <- readLines(con, 1000)
close(con)

con <- file(paste0(fileName, "\\en_US.twitter.txt"), open = "r")
linetwitteus <- readLines(con, 1000)
close(con)

# Pool the three samples into one character vector (blogs, news, twitter).
allSources <- c(lineblogus, linenewsus, linetwitteus)
Convert the text to lowercase and delete punctuation, numbers, and English stop words, because they are not useful for the analysis.
# Collapse every line into one space-separated string and wrap it in a corpus.
data <- Corpus(VectorSource(paste(allSources, collapse = " ")))

# Clean the corpus step by step: lowercase, then strip punctuation, digits
# and English stop words.
data <- tm_map(data, content_transformer(tolower))
data <- tm_map(data, removePunctuation)
data <- tm_map(data, removeNumbers)
data <- tm_map(data, removeWords, stopwords("english"))
# Inspect the first document of the cleaned corpus. The head() preview is
# left disabled; only the structural summary is printed.
#head(data[[1]])
summary(data[[1]])
## Length Class Mode
## content 1 -none- character
## meta 7 TextDocumentMeta list
Function to count the number of occurrences of each word in the whole text.
# Kept so that any code referencing `word_list` still finds it; countWord()
# no longer reads this global.
word_list <- c()

#' Count how often each word occurs in a text.
#'
#' The text is split on single spaces; empty tokens (produced by consecutive
#' spaces) and missing values are ignored. Every element of `sentence` is
#' processed, not only the first one.
#'
#' @param sentence A character vector, or an object that `as.character()`
#'   turns into text (presumably the tm document passed by the caller --
#'   TODO confirm).
#' @return A named numeric vector of counts, one entry per distinct word, in
#'   order of first appearance. An empty named numeric vector when the input
#'   contains no words.
countWord <- function(sentence) {
  words <- unlist(
    strsplit(as.character(sentence), " ", fixed = TRUE),
    use.names = FALSE
  )
  words <- words[!is.na(words) & nzchar(words)]

  if (length(words) == 0L) {
    return(stats::setNames(numeric(0), character(0)))
  }

  # Map each word to the index of its first occurrence and tabulate those
  # indices: one vectorized pass instead of growing a vector word by word.
  distinct_words <- unique(words)
  counts <- tabulate(match(words, distinct_words),
                     nbins = length(distinct_words))

  # as.numeric() keeps the counts as doubles, as the original loop produced.
  stats::setNames(as.numeric(counts), distinct_words)
}
# Word frequencies for the cleaned document, as a named vector and as a
# one-column data frame (column `counted`).
counted <- countWord(data[[1]])
counted2 <- data.frame(counted = counted)

# Plot the counts against the words, with the words treated as a factor.
plot(factor(names(counted)), counted2[["counted"]])
Word cloud of the words that appear most often in the texts.
# Draw the word cloud: up to 1000 words, no minimum frequency, most frequent
# words placed first, 35% of the words rotated, 8-colour "Dark2" palette.
wordcloud(
  words = names(counted),
  freq = counted,
  min.freq = 0,
  max.words = 1000,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)