Exploratory Analysis of HC Corpora

Introduction

The goal of this project is to demonstrate that we have properly downloaded the data from HC Corpora and to create a basic report of summary statistics about the datasets. We will also report any interesting initial findings.

Labelling files

For ease of manipulation, we assign the file names to variables.

# Names of the three English-language corpus files. They are listed once and
# then bound to the individual variables used throughout the report.
corpus_files <- c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt")
file1 <- corpus_files[1]  # blogs
file2 <- corpus_files[2]  # news
file3 <- corpus_files[3]  # twitter

We would like to create a summary table which will include some basic statistics, such as the size of the file, number of lines, number of characters, etc.

Finding Size of all files

# Switch to the corpus directory; the relative names in file1, file2 and file3
# are resolved against it from here on.
# NOTE(review): this is a hard-coded, machine-specific absolute path, so the
# report only runs on a machine where this directory exists.
setwd("C:/Users/Frantz/CloudStation/Documents/Coursera/CapstoneProject/Data/final/en_US/")

# On-disk size of each file, converted from bytes to megabytes (1024^2 bytes).
bytes_per_mb <- 1024^2

Infofile1 <- file.info(file1)
MBites_file1 <- Infofile1$size / bytes_per_mb

Infofile1 <- file.info(file2)
MBites_file2 <- Infofile1$size / bytes_per_mb

Infofile1 <- file.info(file3)
MBites_file3 <- Infofile1$size / bytes_per_mb


# Read a whole text file into a character vector, one element per line.
#
# The connection is opened in binary mode ("rb") and is always closed on exit,
# even if readLines() fails. A text-mode read of en_US.news.txt raised
# "incomplete final line found".
# NOTE(review): on Windows a text-mode connection can stop at an embedded
# end-of-file control character, which would silently truncate the corpus.
# That is presumably the cause of the warning; verify the line count of the
# news file after this change.
#
# path: file name, resolved against the current working directory.
# Returns a character vector marked as UTF-8, with embedded NULs skipped.
read_corpus_lines <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

TextinFile1 <- read_corpus_lines(file1)
TextinFile2 <- read_corpus_lines(file2)
TextinFile3 <- read_corpus_lines(file3)

Finding the length of all files in number of lines

# Number of lines read from each corpus file.
TotalLine_File1 <- length(TextinFile1)
# The news count must come from the news text (TextinFile2); it previously
# reused the twitter vector and therefore reported the twitter line count.
TotalLine_File2 <- length(TextinFile2)
TotalLine_File3 <- length(TextinFile3)

Maximum Number of Characters in a Line

# Length, in characters, of the longest line in each file.
# Despite the "TotalChar" name, these are per-line maxima, not totals.
TotalChar_File1 <- max(nchar(TextinFile1, type = "chars"))
TotalChar_File2 <- max(nchar(TextinFile2, type = "chars"))
TotalChar_File3 <- max(nchar(TextinFile3, type = "chars"))

Converting Data into Dataframe

# Wrap each corpus in a one-column tibble (column `text`, one row per line).
# tibble() replaces the deprecated data_frame() constructor.
USBlogText <- tibble(text = TextinFile1)
USNewsText <- tibble(text = TextinFile2)
USTwitterText <- tibble(text = TextinFile3)

We will split the text into individual words and store them one word per row, using unnest_tokens().

We reduce the dataset, for demonstration purposes only, by selecting just the first 10000 rows of each set.

# Keep only the leading rows of each corpus so the demonstration stays fast.
demo_rows <- 10000
USBlogText <- head(USBlogText, demo_rows)
USNewsText <- head(USNewsText, demo_rows)
USTwitterText <- head(USTwitterText, demo_rows)

# Tokenise: read the `text` column and emit one token per row in `word`.
USBlogText_V1 <- unnest_tokens(USBlogText, output = word, input = text)
USNewsText_V1 <- unnest_tokens(USNewsText, output = word, input = text)
USTwitterText_V1 <- unnest_tokens(USTwitterText, output = word, input = text)

We remove stopwords using anti_join()

Stop words are very common words that carry little meaning on their own and are therefore usually excluded from search queries and text analysis.

# Drop every token that appears in the `stop_words` lexicon.
# The join key is given explicitly: this removes the "Joining, by" message and
# keeps the join on `word` even if the tables later share other column names.
USBlogText_V2 <- USBlogText_V1 %>% anti_join(stop_words, by = "word")
USNewsText_V2 <- USNewsText_V1 %>% anti_join(stop_words, by = "word")
USTwitterText_V2 <- USTwitterText_V1 %>% anti_join(stop_words, by = "word")

We count the occurrences of each word using the count() function of the dplyr package.

# Frequency table per corpus: one row per distinct word with its count in `n`,
# sorted so the most frequent words come first.
Word_count_USBlogText_V2 <- count(USBlogText_V2, word, sort = TRUE)

## Warning: package 'bindrcpp' was built under R version 3.3.3

Word_count_USNewsText_V2 <- count(USNewsText_V2, word, sort = TRUE)
Word_count_USTwitterText_V2 <- count(USTwitterText_V2, word, sort = TRUE)

We count the total number of words (after stop-word removal)

# Total number of tokens left in each corpus after stop-word removal
# (one row per token, so the row count is the word count).
Total_Word_count_USBlogText_V2 <- dim(USBlogText_V2)[1L]
Total_Word_count_USNewsText_V2 <- dim(USNewsText_V2)[1L]
Total_Word_count_USTwitterText_V2 <- dim(USTwitterText_V2)[1L]

We count the unique words

# Number of distinct rows (i.e. distinct tokens) in each corpus: count the
# rows that are not a repeat of an earlier row.
Unique_Word_count_USBlogText_V2 <- sum(!duplicated(USBlogText_V2))
Unique_Word_count_USNewsText_V2 <- sum(!duplicated(USNewsText_V2))
Unique_Word_count_USTwitterText_V2 <- sum(!duplicated(USTwitterText_V2))

Creating our first Summary

# Summary table: one row per corpus file.
# It is built directly as a data frame so every statistic keeps its numeric
# type; cbind() on mixed character/numeric vectors coerced every column to
# text. check.names = FALSE keeps the human-readable headers that contain
# spaces.
# Two headers are corrected to match the values they hold: the size column is
# in megabytes (bytes / 1024 / 1024), and the character column is the length
# of the longest line, not a total.
Report <- data.frame(
  "File" = c(file1, file2, file3),
  "Size In MB" = c(MBites_file1, MBites_file2, MBites_file3),
  "Number Of Lines" = c(TotalLine_File1, TotalLine_File2, TotalLine_File3),
  "Max Characters Per Line" = c(TotalChar_File1, TotalChar_File2,
                                TotalChar_File3),
  "Unique Words" = c(Unique_Word_count_USBlogText_V2,
                     Unique_Word_count_USNewsText_V2,
                     Unique_Word_count_USTwitterText_V2),
  "Total Words" = c(Total_Word_count_USBlogText_V2,
                    Total_Word_count_USNewsText_V2,
                    Total_Word_count_USTwitterText_V2),
  check.names = FALSE,
  stringsAsFactors = FALSE
)

print(Report)

##                File    Size In Bites Number Of Lines Number of Character
## 1   en_US.blogs.txt 200.424207687378          899288               40833
## 2    en_US.news.txt 196.277512550354         2360148                5760
## 3 en_US.twitter.txt 159.364068984985         2360148                 140
##   Unique Words Total Words
## 1        30313      160451
## 2        30429      164167
## 3        14898       52468

Creating some frequency plots

We arbitrarily choose, for each dataset, the threshold x in the filter n > x, in order to see which words are used most frequently.

# Horizontal bar chart of the words whose count exceeds a threshold.
#
# word_counts: frequency table with columns `word` and `n`.
# min_count:   only words with n strictly greater than this are plotted.
# source_name: label appended to the plot title.
# Returns the ggplot object; it is drawn when printed at top level.
plot_frequent_words <- function(word_counts, min_count, source_name) {
  word_counts %>%
    filter(n > min_count) %>%
    # Order the bars by frequency instead of alphabetically.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n)) +
    geom_col() +
    coord_flip() +
    # paste() already inserts a space, so the prefix carries no trailing one
    # (the previous title contained a double space).
    labs(x = "Word \n", y = "\n Count ",
         title = paste("Frequent Words In", source_name)) +
    geom_text(aes(label = n), hjust = 1.2, colour = "white",
              fontface = "bold") +
    theme(plot.title = element_text(hjust = 0.5),
          axis.title.x = element_text(face = "bold", colour = "darkblue",
                                      size = 12),
          axis.title.y = element_text(face = "bold", colour = "darkblue",
                                      size = 12))
}

# Thresholds are chosen per corpus so each chart shows a readable number of bars.
plot_frequent_words(Word_count_USBlogText_V2, 500, file1)

plot_frequent_words(Word_count_USNewsText_V2, 350, file2)

plot_frequent_words(Word_count_USTwitterText_V2, 250, file3)

Doing a word cloud

The below charts help us see in a user friendly way which words are often used by the authors

# One word cloud per corpus, drawn in the order blogs, news, twitter.
# The seed is reset before every cloud so each layout is reproducible.
cloud_palette <- brewer.pal(8, "Dark2")
cloud_inputs <- list(Word_count_USBlogText_V2,
                     Word_count_USNewsText_V2,
                     Word_count_USTwitterText_V2)

for (word_counts in cloud_inputs) {
  set.seed(1234)
  wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 10,
            max.words = 50, random.order = FALSE, rot.per = 0.35,
            colors = cloud_palette)
}