Introduction

The goal of this first analysis is to see which words are the most used in posts and articles.

Library

Load all the libraries needed.

library(stringr)


library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(NLP)
library(ngram)
library(tm)

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)

Get data

Import the data needed.

The data comes from news articles, blogs and tweets.

# Directory that holds the en_US corpus files read below.
# NOTE(review): machine-specific absolute Windows path; adjust it before
# running the report on another computer.
fileName <- "C:\\Users\\rache\\Downloads\\Coursera-SwiftKey\\final\\en_US"

Clean data

Read only a part of the data, because the files are too big to load in full.

# Read only the first 1000 lines of each source file. The path is handed
# directly to readLines(), which opens the file for the duration of the call
# and closes it again, so no connection is left open afterwards.
lineblogus <- readLines(paste0(fileName, "\\en_US.blogs.txt"), n = 1000L)
linenewsus <- readLines(paste0(fileName, "\\en_US.news.txt"), n = 1000L)
linetwitteus <- readLines(paste0(fileName, "\\en_US.twitter.txt"), n = 1000L)

# Pool the three samples (blogs, news, tweets) into one character vector.
allSources <- c(lineblogus, linenewsus, linetwitteus)

Convert the text to lower case and delete punctuation, numbers and English stop words, because they are not useful for the analysis.

# Collapse all sampled lines into one document and normalise its text.
# Stop words are removed BEFORE punctuation: stopwords("english") contains
# contractions such as "don't" and "i'm", which can no longer match once
# removePunctuation has stripped the apostrophes from the text.
data <- allSources %>%
  paste(collapse = " ") %>%
  VectorSource() %>%
  Corpus() %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeNumbers)

Summary

# Uncomment the next line to inspect the cleaned text itself.
#head(data[[1]])

# Show the structure of the first document of the corpus; all lines were
# pasted into a single string above, so this is the whole cleaned text.
summary(data[[1]])
##         Length Class            Mode     
## content 1      -none-           character
## meta    7      TextDocumentMeta list

Count the words

Function to count the number of occurrences of each word in the whole text.

# Retained so that any code referring to `word_list` keeps working;
# countWord() itself no longer reads or depends on this global.
word_list <- c()

# Count how often each word occurs in a text.
#
# sentence: a single string, or an object that as.character() converts to
#   one, whose words are separated by single spaces. Only the first element
#   is used.
#
# Returns a named numeric vector. The names are the distinct words in order
# of first appearance and the values are their counts. Empty tokens (from
# leading, trailing or repeated spaces) and NA are ignored; an input without
# any word yields an empty named numeric vector.
countWord <- function(sentence) {
  text <- as.character(sentence)
  if (length(text) == 0L) {
    return(stats::setNames(numeric(0), character(0)))
  }

  words <- strsplit(text[[1]], " ", fixed = TRUE)[[1]]
  words <- words[!is.na(words) & nzchar(words)]

  # Map every word to the index of its first occurrence and tally the
  # indices in one vectorised pass instead of growing a vector word by word.
  vocabulary <- unique(words)
  counts <- tabulate(match(words, vocabulary), nbins = length(vocabulary))
  stats::setNames(as.numeric(counts), vocabulary)
}

# Count the words of the first document of the cleaned corpus.
counted <- countWord(data[[1]])

Make the plot

# Data-frame view of the counts (column `counted`), kept for inspection.
counted2 <- data.frame(counted)

# Bar chart of the 20 most frequent words. Plotting every word as a factor
# level draws one box per distinct word in alphabetical order, so the most
# used words cannot be read off the figure; sorting and truncating makes
# them visible.
barplot(
  head(sort(counted, decreasing = TRUE), 20L),
  las = 2,
  main = "20 most frequent words",
  ylab = "Number of occurrences"
)

Word cloud of the words that appear most often in the texts.

# Word cloud of the counted words: at most 1000 words, no minimum frequency,
# non-random ordering, rot.per set to 0.35 and the 8-colour "Dark2"
# RColorBrewer palette.
wordcloud(
  words = names(counted),
  freq = counted,
  min.freq = 0,
  max.words = 1000,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(8, "Dark2")
)