library(tidytext)
library(ggplot2)
library(tidyverse)
library(stringr)
library(R.utils)
library(ngram)
library(dplyr)
library(stringi)
library(tm)
library(RWeka)
This project uses US English text data from three sources: news, blogs and Twitter.
To understand the structure of the three datasets, each file is summarized in terms of its lines of text, number of words and number of characters.
In this exploratory data analysis the variables of interest are compared across the three files.
The variables of interest:
size of the three files
number of lines
total number of words
number of words per line
maximum number of words per line
number of characters
number of characters per word
Dataset files used:
“en_US.blogs.txt”,
“en_US.news.txt”,
“en_US.twitter.txt”.
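The analysis below assumes these three files are already in the working directory. As a minimal sketch (not part of the original analysis), they could be fetched from the Coursera SwiftKey archive; the URL and the final/en_US paths inside the zip are assumptions:
# Hypothetical download step, assuming the standard Coursera SwiftKey archive URL
zipURL  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zipFile <- "Coursera-SwiftKey.zip"
if (!file.exists("en_US.blogs.txt")) {
  download.file(zipURL, zipFile, mode = "wb")
  # extract only the US English files and drop the folder structure
  unzip(zipFile,
        files = paste0("final/en_US/en_US.", c("blogs", "news", "twitter"), ".txt"),
        junkpaths = TRUE)
}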
size<-file.info("en_US.blogs.txt")
kb<-size$size/1024
sizeBlogs<-kb/1024
size<-file.info("en_US.news.txt")
kb<-size$size/1024
sizeNews<-kb/1024
size<-file.info("en_US.twitter.txt")
kb<-size$size/1024
sizeTwitter<-kb/1024
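The three size calculations repeat the same arithmetic (bytes divided by 1024^2 gives megabytes); a more compact sketch of the same computation would be:
# compute all three file sizes in MB in one pass
files <- c(blogs = "en_US.blogs.txt", news = "en_US.news.txt", twitter = "en_US.twitter.txt")
sizesMB <- sapply(files, function(f) file.info(f)$size / 1024^2)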
nlinesBlogs <- countLines("en_US.blogs.txt")
nlinesTwitter <- countLines("en_US.twitter.txt")
nlinesNews <- countLines("en_US.news.txt")
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
nwordsBlogs <- wordcount(blogs, sep = " ")
nwordsTwitter <- wordcount(twitter, sep = " ")
nwordsNews <- wordcount(news, sep = " ")
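wordcount() from ngram counts tokens by splitting on the supplied separator; a roughly equivalent count based on ICU word boundaries, using the already loaded stringi package, would be:
# alternative word count; may differ slightly from space-splitting
nwordsBlogs_stri <- sum(stri_count_words(blogs))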
statsBlogs <- stri_stats_general(blogs)
statsTwitter <- stri_stats_general(twitter)
statsNews <- stri_stats_general(news)
maxwordsBlogs <- max(stri_count_words(blogs))
maxwordsTwitter <- max(stri_count_words(twitter))
maxwordsNews <- max(stri_count_words(news))
WordsBlogs <- stri_count_words(blogs)
WordsTwitter <- stri_count_words(twitter)
WordsNews <- stri_count_words(news)
charswordBlogs <- statsBlogs[3]/nwordsBlogs
charswordTwitter <- statsTwitter[3]/nwordsTwitter
charswordNews <- statsNews[3]/nwordsNews
chartotBlogs <- statsBlogs[3]
chartotTwitter <- statsTwitter[3]
chartotNews <- statsNews[3]
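The per-file blocks above repeat the same calls for each source; as a sketch, a small helper (corpus_stats is not part of the original code) could collect the same quantities:
corpus_stats <- function(x) {
  words <- stri_count_words(x)                  # words per line
  c(lines = length(x),                          # number of lines
    words = sum(words),                         # total words
    chars = stri_stats_general(x)[["Chars"]],   # total characters
    max.words.per.line = max(words))            # maximum words per line
}
# e.g. corpus_stats(blogs), corpus_stats(news), corpus_stats(twitter)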
DataSummary <- data.frame(source = c("blogs", "news", "twitter"),
                          file.size.MB = c(sizeBlogs, sizeNews, sizeTwitter),
                          num.lines = c(length(blogs), length(news), length(twitter)),          # number of lines
                          num.words = c(nwordsBlogs, nwordsNews, nwordsTwitter),                # total words
                          Total.Chars = c(chartotBlogs, chartotNews, chartotTwitter),           # total characters
                          Max.Words.Per.Line = c(maxwordsBlogs, maxwordsNews, maxwordsTwitter), # maximum words per line
                          Chars.Per.Word = c(charswordBlogs, charswordNews, charswordTwitter))  # characters per word
DataSummary
##    source file.size.MB num.lines num.words Total.Chars Max.Words.Per.Line
## 1   blogs     200.4242    899288  37334131   206824382               6726
## 2    news     196.2775   1010242  34372530   203223154               1796
## 3 twitter     159.3641   2360148  30373583   162096241                 47
##   Chars.Per.Word
## 1       5.539820
## 2       5.912371
## 3       5.336751
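In the knitted report the data frame prints as plain text; as an optional extra (not used in the original), knitr::kable() would render it as a formatted table:
knitr::kable(DataSummary, digits = 2, caption = "Summary of the three en_US files")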
hist(WordsBlogs, main="Histogram of words in blogs", xlab="No. of words per blog post", col="lightblue", breaks=100, xlim=c(0, 500))
hist(WordsNews, main="Histogram of words in news", xlab="No. of words per news post", col="lightblue", breaks=100, xlim=c(0, 500))
hist(WordsTwitter, main="Histogram of words in twitter", xlab="No. of words per twitter post", col="lightblue", breaks=20, xlim=c(0, 50))
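The same word-count distributions could also be drawn with ggplot2, which is already loaded; a sketch for the blogs histogram (bin width chosen arbitrarily, with values above 500 words cut off as in the base-graphics version):
ggplot(data.frame(words = WordsBlogs), aes(x = words)) +
  geom_histogram(binwidth = 5, fill = "lightblue", colour = "grey40") +
  xlim(0, 500) +
  labs(title = "Histogram of words in blogs", x = "No. of words per blog post", y = "Count")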
charsWordBlogs <- nchar(blogs)/WordsBlogs        # characters per word for each blog line
charsWordTwitter <- nchar(twitter)/WordsTwitter  # characters per word for each tweet
charsWordNews <- nchar(news)/WordsNews           # characters per word for each news line
hist(charsWordBlogs, main="Histogram of characters per word in blogs", xlab="Characters per word in blogs", col="lightgreen", breaks=20)
hist(charsWordNews, main="Histogram of characters per word in news", xlab="Characters per word in news", col="lightgreen", breaks=20)
hist(charsWordTwitter, main="Histogram of characters per word in twitter", xlab="Characters per word in twitter", col="lightgreen", breaks=20)
countsSummary <- tibble(counts = c(nwordsBlogs, nwordsTwitter, nwordsNews, nlinesBlogs, nlinesTwitter, nlinesNews),
                        class = as.factor(c(rep("words", 3), rep("lines", 3))),
                        medium = as.factor(rep(c("blogs", "twitter", "news"), 2)))
countsSummary$names <- paste(countsSummary$class, countsSummary$medium)
countsSummary
## # A tibble: 6 x 4
## counts class medium names
## <int> <fct> <fct> <chr>
## 1 37334131 words blogs words blogs
## 2 30373583 words twitter words twitter
## 3 34372530 words news words news
## 4 899288 lines blogs lines blogs
## 5 2360148 lines twitter lines twitter
## 6 1010242 lines news lines news
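The counts tibble above is only printed; as a sketch, it could also be plotted directly (log scale, because total words and total lines differ by more than an order of magnitude):
ggplot(countsSummary, aes(x = medium, y = counts, fill = class)) +
  geom_col(position = "dodge") +
  scale_y_log10() +
  labs(title = "Lines and words per source", x = "Source", y = "Count (log scale)")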
WordsLinesRatio <- tibble(WordsLinesRatio = c(nwordsBlogs/nlinesBlogs, nwordsNews/nlinesNews, nwordsTwitter/nlinesTwitter),
                          MediaType = as.factor(c("Blogs", "News", "Twitter")))
ggplot(data = WordsLinesRatio, aes(x = MediaType, y = WordsLinesRatio, fill = MediaType)) + geom_col() + ggtitle("Words/Lines Ratio") + ylab("Words/Lines")
The initial exploratory analysis of the datasets shows that blogs have the smallest number of lines but the largest number of words, total characters and maximum words per line.
The news articles have the largest number of characters per word, indicating that the language used in that text is more complex than in blogs and Twitter.