R Markdown

Load libraries and data

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.5.1
## Package version: 4.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
# Read the three source corpora (twitter, news, blogs) from CSV files,
# keeping text columns as character vectors rather than factors.

twitter <- read.csv(
  "C:/Users/molop/OneDrive/Desktop/Coursera/twitter.csv",
  stringsAsFactors = FALSE
)

news <- read.csv(
  "C:/Users/molop/OneDrive/Desktop/Coursera/news.csv",
  stringsAsFactors = FALSE
)

blogs <- read.csv(
  "C:/Users/molop/OneDrive/Desktop/Coursera/blogs.csv",
  stringsAsFactors = FALSE
)

# Count the rows that contain at least one missing value in each data set.
sum(!complete.cases(twitter))
## [1] 0
sum(!complete.cases(news))
## [1] 0
sum(!complete.cases(blogs))
## [1] 0

# 1. Tokenize twitter messages.
# Numbers, punctuation and symbols are dropped; hyphenated words are split
# (`split_hyphens` is the current name of the retired `remove_hyphens`).
twitter.tokens <- tokens(twitter$X1, what = "word",
                         remove_numbers = TRUE, remove_punct = TRUE,
                         remove_symbols = TRUE, split_hyphens = TRUE)

# Lower case the tokens.
twitter.tokens <- tokens_tolower(twitter.tokens)

# Remove stopwords.
twitter.tokens <- tokens_select(twitter.tokens, stopwords(),
                                selection = "remove")

# Perform stemming on the tokens.
twitter.tokens <- tokens_wordstem(twitter.tokens, language = "english")

# Create the bag-of-words model as a sparse document-feature matrix.
twitter.tokens.dfm <- dfm(twitter.tokens, tolower = FALSE)

# Inspect the top-left corner only. The sparse dfm is subset BEFORE it is
# converted, because a dense copy of the whole dfm cannot be allocated.
twitter.preview.matrix <- as.matrix(
  twitter.tokens.dfm[seq_len(min(20, ndoc(twitter.tokens.dfm))),
                     seq_len(min(100, nfeat(twitter.tokens.dfm)))]
)
View(twitter.preview.matrix)
# dim(twitter.tokens.dfm)

# Number of (stemmed, non-stopword) tokens in each document, computed
# directly on the sparse dfm.
twitterLength <- data.frame(counts = unname(ntoken(twitter.tokens.dfm)))

# Attach the counts to the source rows (the dfm keeps one row per input
# document, in input order) and sort from longest to shortest.
twitter.counts <- cbind(twitter, twitterLength)
sorted_tweet <- arrange(twitter.counts, desc(counts))

Current memory constraints prevent the development of the full (dense) word-frequency matrix; this issue is receiving attention.

The same process will be implemented for both the blog and the news documents.


# Tokenize blog posts.
# Numbers, punctuation and symbols are dropped; hyphenated words are split
# (`split_hyphens` is the current name of the retired `remove_hyphens`).
blog.tokens <- tokens(blogs$X1, what = "word",
                      remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, split_hyphens = TRUE)

# Lower case the tokens.
blog.tokens <- tokens_tolower(blog.tokens)

# Remove stopwords.
blog.tokens <- tokens_select(blog.tokens, stopwords(),
                             selection = "remove")

# Perform stemming on the tokens.
blog.tokens <- tokens_wordstem(blog.tokens, language = "english")

# Create the bag-of-words model as a sparse document-feature matrix.
blog.tokens.dfm <- dfm(blog.tokens, tolower = FALSE)

# Inspect the top-left corner only. The sparse dfm is subset BEFORE it is
# converted, because a dense copy of the whole dfm cannot be allocated.
blog.preview.matrix <- as.matrix(
  blog.tokens.dfm[seq_len(min(20, ndoc(blog.tokens.dfm))),
                  seq_len(min(100, nfeat(blog.tokens.dfm)))]
)
View(blog.preview.matrix)
# dim(blog.tokens.dfm)

# Number of (stemmed, non-stopword) tokens in each document, computed
# directly on the sparse dfm.
blogsLength <- data.frame(counts = unname(ntoken(blog.tokens.dfm)))

# Attach the counts to the source rows and sort from longest to shortest.
# The result gets its own name so it does not overwrite the twitter ranking.
blogs.counts <- cbind(blogs, blogsLength)
sorted_blogs <- arrange(blogs.counts, desc(counts))

The same process is applied to the news documents:

# Tokenize news articles (reads the text column of `news`, not `twitter`).
# Numbers, punctuation and symbols are dropped; hyphenated words are split
# (`split_hyphens` is the current name of the retired `remove_hyphens`).
news.tokens <- tokens(news$X1, what = "word",
                      remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, split_hyphens = TRUE)

# Lower case the tokens.
news.tokens <- tokens_tolower(news.tokens)

# Remove stopwords.
news.tokens <- tokens_select(news.tokens, stopwords(),
                             selection = "remove")

# Perform stemming on the tokens.
news.tokens <- tokens_wordstem(news.tokens, language = "english")

# Create the bag-of-words model as a sparse document-feature matrix.
news.tokens.dfm <- dfm(news.tokens, tolower = FALSE)

# Inspect the top-left corner only. The sparse dfm is subset BEFORE it is
# converted, because a dense copy of the whole dfm cannot be allocated.
news.preview.matrix <- as.matrix(
  news.tokens.dfm[seq_len(min(20, ndoc(news.tokens.dfm))),
                  seq_len(min(100, nfeat(news.tokens.dfm)))]
)
View(news.preview.matrix)
# dim(news.tokens.dfm)

# Number of (stemmed, non-stopword) tokens in each document, computed
# directly on the sparse dfm.
newsLength <- data.frame(counts = unname(ntoken(news.tokens.dfm)))

# Attach the counts to the source rows and look at the first few.
news.counts <- cbind(news, newsLength)
View(head(news.counts, 5))

# Sort from longest to shortest. The result gets its own name so it does
# not overwrite the twitter ranking.
sorted_news <- arrange(news.counts, desc(counts))

# NOTE(review): `cars` is the built-in demo data set from the default
# R Markdown template and is unrelated to this analysis — consider removing.
summary(cars)

The three datasets are to be merged together, and a predictor built to determine

whether a document containing certain words comes from one of the three outlets.

Only elementary exploration and plots are presented, because converting the matrices to dense form raises the following error:

Error: cannot allocate vector of size 1634.2 Gb In addition: Warning message: In asMethod(object) : sparse->dense coercion: allocating vector of size 1634.2 GiB