Libraries used for the exploratory analysis

library(tidytext)
library(ggplot2)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  2.1.3     ✓ dplyr   0.8.5
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.3
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(stringr)
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.8.0 (2020-02-14 07:10:20 UTC) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.23.0 successfully loaded. See ?R.oo for help.
## 
## Attaching package: 'R.oo'
## The following object is masked from 'package:R.methodsS3':
## 
##     throw
## The following objects are masked from 'package:methods':
## 
##     getClasses, getMethods
## The following objects are masked from 'package:base':
## 
##     attach, detach, load, save
## R.utils v2.9.2 successfully loaded. See ?R.utils for help.
## 
## Attaching package: 'R.utils'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:utils':
## 
##     timestamp
## The following objects are masked from 'package:base':
## 
##     cat, commandArgs, getOption, inherits, isOpen, nullfile, parse,
##     warnings
library(ngram)
library(dplyr)
library(stringi)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(RWeka)

Introduction

This project uses US English text data from news, blogs, and Twitter.

To understand the structure of the three datasets, each file will be summarized in terms of lines of text, number of words, and number of characters.

In this exploratory data analysis, the variables of interest will be compared across the three files.

The variables of interest: file size, number of lines, number of words, total characters, maximum words per line, and average characters per word.

Dataset files used: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
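
The files are distributed in the Coursera-SwiftKey archive. A minimal sketch for obtaining them (the URL and in-archive paths are assumptions based on the usual course bundle, not taken from this report):

# Download and extract the three en_US files if they are not already present
# (URL and paths inside the zip are assumed)
url <- "https://d396qusza40orc.cloudfront.net/dstraining/data/Coursera-SwiftKey.zip"
if (!file.exists("en_US.blogs.txt")) {
  download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
  unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
        files = c("final/en_US/en_US.blogs.txt",
                  "final/en_US/en_US.news.txt",
                  "final/en_US/en_US.twitter.txt"))
}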

Data Exploration

Size of the files:

sizeBlogs   <- file.info("en_US.blogs.txt")$size / 1024^2    # bytes -> MB
sizeNews    <- file.info("en_US.news.txt")$size / 1024^2     # bytes -> MB
sizeTwitter <- file.info("en_US.twitter.txt")$size / 1024^2  # bytes -> MB

Number of lines of text in each file:

# countLines() from R.utils counts lines in chunks, without reading the whole file into memory
nlinesBlogs <- countLines("en_US.blogs.txt")
nlinesTwitter <- countLines("en_US.twitter.txt")
nlinesNews <- countLines("en_US.news.txt")

Number of words:

# readLines() opens and closes the connection itself when given a file path
blogs <- readLines("en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
# wordcount() from ngram totals the space-separated tokens in each vector
nwordsBlogs <- wordcount(blogs, sep = " ")
nwordsTwitter <- wordcount(twitter, sep = " ")
nwordsNews <- wordcount(news, sep = " ")

General string statistics (via stringi):

statsBlogs <- stri_stats_general(blogs)
statsTwitter <- stri_stats_general(twitter)
statsNews <- stri_stats_general(news)
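
For reference, stri_stats_general() returns a named integer vector; the third element indexed below ("Chars") is the total character count:

names(statsBlogs)
## [1] "Lines"       "LinesNEmpty" "Chars"       "CharsNWhite"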

Maximum words per line:

maxwordsBlogs <- max(stri_count_words(blogs))
maxwordsTwitter <- max(stri_count_words(twitter))
maxwordsNews <- max(stri_count_words(news))

Words per line:

WordsBlogs <- stri_count_words(blogs)
WordsTwitter <- stri_count_words(twitter)
WordsNews <- stri_count_words(news)
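
Note that stri_count_words() segments text with ICU word-boundary rules, so its totals can differ slightly from the space-separated wordcount() used above; the two are easy to compare:

# ICU word segmentation vs. simple space splitting may disagree on punctuation-heavy text
sum(WordsBlogs)   # stringi's total word count for blogs
nwordsBlogs       # ngram's space-separated count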

Average characters per word:

charswordBlogs <- statsBlogs[3]/nwordsBlogs     # total characters / total words
charswordTwitter <- statsTwitter[3]/nwordsTwitter
charswordNews <- statsNews[3]/nwordsNews

Total characters:

chartotBlogs <- statsBlogs[3]
chartotTwitter <- statsTwitter[3]
chartotNews <- statsNews[3]

Summary of the data sets:

DataSummary <- data.frame(source = c("blogs", "news", "twitter"),
           file.size.MB = c(sizeBlogs, sizeNews, sizeTwitter),
           num.lines = c(length(blogs), length(news), length(twitter)),          # lines of text
           num.words = c(nwordsBlogs, nwordsNews, nwordsTwitter),                # total words
           Total.Chars = c(chartotBlogs, chartotNews, chartotTwitter),           # total characters
           Max.Words.Per.Line = c(maxwordsBlogs, maxwordsNews, maxwordsTwitter), # maximum words per line
           Chars.Per.Word = c(charswordBlogs, charswordNews, charswordTwitter))  # average characters per word
DataSummary
##    source file.size.MB num.lines num.words Total.Chars Max.Words.Per.Line Chars.Per.Word
## 1   blogs     200.4242    899288  37334131   206824382               6726       5.539820
## 2    news     196.2775   1010242  34372530   203223154               1796       5.912371
## 3 twitter     159.3641   2360148  30373583   162096241                 47       5.336751

Histograms of words per line:

hist(WordsBlogs, main="Histogram of words in blogs", xlab="No. of words per blog post", col="lightblue", breaks=100, xlim=c(0, 500))

hist(WordsNews, main="Histogram of words in news", xlab="No. of words per news article", col="lightblue", breaks=100, xlim=c(0, 500))

hist(WordsTwitter, main="Histogram of words in twitter", xlab="No. of words per tweet", col="lightblue", breaks=20, xlim=c(0, 50))
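
Blog line lengths are heavily right-skewed (the maximum is 6726 words while the axis above is cut at 500), so a log-scale variant can reveal the tail; a possible sketch:

# Log-scaled histogram; the +1 guards against empty lines with zero words
hist(log10(WordsBlogs + 1), main="Histogram of words in blogs (log10 scale)", xlab="log10(No. of words per blog post + 1)", col="lightblue", breaks=50)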

Histograms of characters per line:

# Characters per line; nchar() counts the characters in each line of text
charsLineBlogs <- nchar(blogs)
charsLineTwitter <- nchar(twitter)
charsLineNews <- nchar(news)

hist(charsLineBlogs, main="Histogram of characters in blogs", xlab="No. of characters per blog post", col="lightgreen", breaks=20)

hist(charsLineNews, main="Histogram of characters in news", xlab="No. of characters per news article", col="lightgreen", breaks=20)

hist(charsLineTwitter, main="Histogram of characters in twitter", xlab="No. of characters per tweet", col="lightgreen", breaks=20)

Words/Lines Ratio

countsData <- tibble(counts = c(nwordsBlogs, nwordsTwitter, nwordsNews, nlinesBlogs, nlinesTwitter, nlinesNews),
    class = as.factor(c(rep("words", 3), rep("lines", 3))),
    medium = as.factor(c(rep(c("blogs", "twitter", "news"), 2))))

countsData$names <- paste(countsData$class, countsData$medium)
countsData
## # A tibble: 6 x 4
##     counts class medium  names        
##      <int> <fct> <fct>   <chr>        
## 1 37334131 words blogs   words blogs  
## 2 30373583 words twitter words twitter
## 3 34372530 words news    words news   
## 4   899288 lines blogs   lines blogs  
## 5  2360148 lines twitter lines twitter
## 6  1010242 lines news    lines news
WordsLinesRatio <- tibble(WordsLinesRatio = c(nwordsBlogs/nlinesBlogs, nwordsNews/nlinesNews, nwordsTwitter/nlinesTwitter),
                          MediaType = as.factor(c("Blogs", "News", "Twitter")))
ggplot(data = WordsLinesRatio, aes(x = MediaType, y = WordsLinesRatio, fill = MediaType)) +
  geom_bar(stat = "identity") +
  ggtitle("Words/Lines Ratio") +
  ylab("Words/Lines")

Conclusion

The initial exploratory analysis showed that the blogs file has the smallest number of lines but the largest number of words, total characters, and maximum words per line.

The news articles have the highest average number of characters per word, suggesting that the language used in news text is more complex than that of blogs and Twitter.