Load libraries and data
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(quanteda)
## Warning: package 'quanteda' was built under R version 4.5.1
## Package version: 4.3.1
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
# Load csv data
# All three corpora are read from the same folder; keeping that folder in a
# single variable means the script can be pointed at another location by
# editing one line instead of three.
data_dir <- "C:/Users/molop/OneDrive/Desktop/Coursera"
twitter <- read.csv(file.path(data_dir, "twitter.csv"),
                    stringsAsFactors = FALSE)
news <- read.csv(file.path(data_dir, "news.csv"),
                 stringsAsFactors = FALSE)
blogs <- read.csv(file.path(data_dir, "blogs.csv"),
                  stringsAsFactors = FALSE)

# Number of rows that contain at least one missing value, per data set.
sum(!complete.cases(twitter))
## [1] 0
sum(!complete.cases(news))
## [1] 0
sum(!complete.cases(blogs))
## [1] 0
# Tokenize twitter messages.
# `split_hyphens` is the current name of the option formerly called
# `remove_hyphens` (deprecated since quanteda 2.0); together with
# `remove_punct = TRUE` it splits hyphenated words and drops the hyphen.
twitter.tokens <- tokens(twitter$X1, what = "word",
                         remove_numbers = TRUE, remove_punct = TRUE,
                         remove_symbols = TRUE, split_hyphens = TRUE)
# Lower case the tokens.
twitter.tokens <- tokens_tolower(twitter.tokens)
# Remove stopwords
twitter.tokens <- tokens_select(twitter.tokens, stopwords(),
                                selection = "remove")
# Perform stemming on the tokens.
twitter.tokens <- tokens_wordstem(twitter.tokens, language = "english")
# Create our first bag-of-words model (a sparse document-feature matrix).
twitter.tokens.dfm <- dfm(twitter.tokens, tolower = FALSE)
# Inspect a small corner only. Calling as.matrix() on the full dfm forces a
# sparse->dense coercion (the error log in this document shows a 1634.2 GiB
# allocation failure), so subset the sparse object first and densify just the
# preview. min() guards against corpora smaller than the preview window.
twitter.preview.rows <- seq_len(min(20, ndoc(twitter.tokens.dfm)))
twitter.preview.cols <- seq_len(min(100, nfeat(twitter.tokens.dfm)))
twitter.tokens.preview <- as.matrix(
  twitter.tokens.dfm[twitter.preview.rows, twitter.preview.cols]
)
View(twitter.tokens.preview)
# dim(twitter.tokens.dfm)
# Number of words in a document, computed directly on the sparse dfm.
twitterLength <- data.frame(counts = ntoken(twitter.tokens.dfm))
# Attach the counts to the source data frame (one row per document) rather
# than to the tokens object, so that `twitter.tokens` stays a tokens object
# and the sort below has a real `counts` column to work with.
twitter$counts <- twitterLength$counts
sorted_tweet <- arrange(twitter, desc(counts))
Current space constraints prevent the development of the word frequency matrix; this issue is receiving attention.
# Tokenize blog entries.
# `split_hyphens` is the current name of the option formerly called
# `remove_hyphens` (deprecated since quanteda 2.0); together with
# `remove_punct = TRUE` it splits hyphenated words and drops the hyphen.
blog.tokens <- tokens(blogs$X1, what = "word",
                      remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, split_hyphens = TRUE)
# Lower case the tokens.
blog.tokens <- tokens_tolower(blog.tokens)
# Remove stopwords
blog.tokens <- tokens_select(blog.tokens, stopwords(),
                             selection = "remove")
# Perform stemming on the tokens.
blog.tokens <- tokens_wordstem(blog.tokens, language = "english")
# Create the bag-of-words model (a sparse document-feature matrix).
blog.tokens.dfm <- dfm(blog.tokens, tolower = FALSE)
# Inspect a small corner only. Calling as.matrix() on the full dfm forces a
# sparse->dense coercion (the error log in this document shows a 1634.2 GiB
# allocation failure), so subset the sparse object first and densify just the
# preview. min() guards against corpora smaller than the preview window.
blog.preview.rows <- seq_len(min(20, ndoc(blog.tokens.dfm)))
blog.preview.cols <- seq_len(min(100, nfeat(blog.tokens.dfm)))
blog.tokens.preview <- as.matrix(
  blog.tokens.dfm[blog.preview.rows, blog.preview.cols]
)
View(blog.tokens.preview)
# dim(blog.tokens.dfm)
# Number of words in a document, computed directly on the sparse dfm.
blogsLength <- data.frame(blogsLength = ntoken(blog.tokens.dfm))
# Attach the counts to the source data frame so the sort has a real
# `blogsLength` column; `blog.tokens` stays a tokens object.
blogs$blogsLength <- blogsLength$blogsLength
# Stored under its own name: the original assigned to `sorted_tweet`, which
# silently replaced the sorted twitter data.
sorted_blogs <- arrange(blogs, desc(blogsLength))
For news…!
# Tokenize news articles.
# The text must come from `news`, not `twitter`; the original tokenized
# twitter$X1 here, so every "news" result was really a second twitter run.
# NOTE(review): assumes news.csv keeps its text in column `X1` like the
# twitter and blogs files -- confirm against the CSV header.
# `split_hyphens` is the current name of the option formerly called
# `remove_hyphens` (deprecated since quanteda 2.0); together with
# `remove_punct = TRUE` it splits hyphenated words and drops the hyphen.
news.tokens <- tokens(news$X1, what = "word",
                      remove_numbers = TRUE, remove_punct = TRUE,
                      remove_symbols = TRUE, split_hyphens = TRUE)
# Lower case the tokens.
news.tokens <- tokens_tolower(news.tokens)
# Remove stopwords
news.tokens <- tokens_select(news.tokens, stopwords(),
                             selection = "remove")
# Perform stemming on the tokens.
news.tokens <- tokens_wordstem(news.tokens, language = "english")
# Create the bag-of-words model (a sparse document-feature matrix).
news.tokens.dfm <- dfm(news.tokens, tolower = FALSE)
# Inspect a small corner only. Calling as.matrix() on the full dfm forces a
# sparse->dense coercion (the error log in this document shows a 1634.2 GiB
# allocation failure), so subset the sparse object first and densify just the
# preview. min() guards against corpora smaller than the preview window.
news.preview.rows <- seq_len(min(20, ndoc(news.tokens.dfm)))
news.preview.cols <- seq_len(min(100, nfeat(news.tokens.dfm)))
news.tokens.preview <- as.matrix(
  news.tokens.dfm[news.preview.rows, news.preview.cols]
)
View(news.tokens.preview)
# dim(news.tokens.dfm)
# Number of words in a document, computed directly on the sparse dfm.
newsLength <- data.frame(newsLength = ntoken(news.tokens.dfm))
# Attach the counts to the source data frame so the sort has a real
# `newsLength` column; `news.tokens` stays a tokens object.
news$newsLength <- newsLength$newsLength
View(head(news, 5))
# Stored under its own name: the original assigned to `sorted_tweet`, which
# silently replaced the sorted twitter data.
sorted_news <- arrange(news, desc(newsLength))
# NOTE(review): `cars` is not created anywhere in this script; presumably it is
# R's built-in example dataset, which is unrelated to the text corpora. This
# call looks like a leftover from the default R Markdown template -- confirm
# and remove.
summary(cars)
Elementary exploration and plots are not shown, because converting the document-feature matrices to dense matrices fails with the following error:
Error: cannot allocate vector of size 1634.2 Gb In addition: Warning message: In asMethod(object) : sparse->dense coercion: allocating vector of size 1634.2 GiB