The goal of this project is to demonstrate that we have properly downloaded the data from HC Corpora, and to create a basic report of summary statistics about the datasets. We will also report any interesting initial findings.
For ease of manipulation, we assign the file names to variables.
# Names of the three en_US corpus files: blogs, news and twitter.
# They are relative names, resolved against the working directory.
file1 <- "en_US.blogs.txt"     # blog posts
file2 <- "en_US.news.txt"      # news articles
file3 <- "en_US.twitter.txt"   # tweets
We would like to create a summary table which will include some basic statistics, such as the size of the file, number of lines, number of characters, etc.
# Work inside the directory that holds the en_US corpus files, so the
# relative file names in file1..file3 resolve.
setwd("C:/Users/Frantz/CloudStation/Documents/Coursera/CapstoneProject/Data/final/en_US/")

# On-disk size of each file, converted from bytes to megabytes
# (1024^2 bytes per megabyte).
# Infofile1 is reused for every file, so after this block it describes file3.
Infofile1 <- file.info(file1)
MBites_file1 <- Infofile1$size / 1024^2

Infofile1 <- file.info(file2)
MBites_file2 <- Infofile1$size / 1024^2

Infofile1 <- file.info(file3)
MBites_file3 <- Infofile1$size / 1024^2
# Read every line of each corpus file into a character vector
# (one element per line), marking the strings as UTF-8.
#
# The connections are opened in binary mode ("rb"). In text mode on Windows
# a stray SUB (0x1A) byte can be treated as end-of-file, which silently
# truncates the read. The original text-mode run reported
#   "incomplete final line found on 'en_US.news.txt'"
# which is consistent with such a truncation.
# NOTE(review): confirm by comparing the news line count before and after.
#
# skipNul = TRUE skips embedded NUL bytes instead of ending the line there.
con <- file(file1, open = "rb")
TextinFile1 <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file(file2, open = "rb")
TextinFile2 <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)

con <- file(file3, open = "rb")
TextinFile3 <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)
# Number of lines read from each file (each vector holds one element per line).
TotalLine_File1 <- length(TextinFile1)
# Fixed: this previously measured TextinFile3, so the news file was reported
# with the twitter line count.
TotalLine_File2 <- length(TextinFile2)
TotalLine_File3 <- length(TextinFile3)
# Total number of characters in each file, summed over all of its lines.
# Fixed: max(nchar(...)) returned the length of the single longest line,
# not the total that the variable name and the report column
# ("Number of Character") describe.
TotalChar_File1 <- sum(nchar(TextinFile1))
TotalChar_File2 <- sum(nchar(TextinFile2))
TotalChar_File3 <- sum(nchar(TextinFile3))
# Wrap each vector of lines in a one-column table; the column is named
# `text` and holds one line of the source file per row.
USBlogText    <- data_frame(text = TextinFile1)
USNewsText    <- data_frame(text = TextinFile2)
USTwitterText <- data_frame(text = TextinFile3)
For demonstration purposes, we reduce each dataset by keeping only its first 10,000 rows.
# Keep only the first 10000 rows of each table; the originals are overwritten.
USBlogText    <- head(USBlogText, n = 10000)
USNewsText    <- head(USNewsText, n = 10000)
USTwitterText <- head(USTwitterText, n = 10000)
# Tokenise the `text` column of each table into a `word` column.
USBlogText_V1    <- unnest_tokens(USBlogText, output = word, input = text)
USNewsText_V1    <- unnest_tokens(USNewsText, output = word, input = text)
USTwitterText_V1 <- unnest_tokens(USTwitterText, output = word, input = text)
Stop words are words that carry too little meaning to be useful in search queries, so we remove them from each dataset.
# Drop every token that appears in the `stop_words` table.
# The join key is given explicitly so the result does not depend on which
# column names the two tables happen to share, and so dplyr no longer emits
# a 'Joining, by = "word"' message for each call.
USBlogText_V2    <- USBlogText_V1 %>% anti_join(stop_words, by = "word")
USNewsText_V2    <- USNewsText_V1 %>% anti_join(stop_words, by = "word")
USTwitterText_V2 <- USTwitterText_V1 %>% anti_join(stop_words, by = "word")
# Frequency table per dataset: one row per distinct word with its count in
# column `n`, sorted so the most frequent words come first.
Word_count_USBlogText_V2 <- count(USBlogText_V2, word, sort = TRUE)
## Warning: package 'bindrcpp' was built under R version 3.3.3
Word_count_USNewsText_V2 <- count(USNewsText_V2, word, sort = TRUE)
Word_count_USTwitterText_V2 <- count(USTwitterText_V2, word, sort = TRUE)
# Total number of tokens left in each dataset after stop-word removal
# (one row per token).
Total_Word_count_USBlogText_V2    <- nrow(USBlogText_V2)
Total_Word_count_USNewsText_V2    <- nrow(USNewsText_V2)
Total_Word_count_USTwitterText_V2 <- nrow(USTwitterText_V2)

# Number of distinct rows in each dataset, reported as the number of
# unique words.
Unique_Word_count_USBlogText_V2    <- nrow(distinct(USBlogText_V2))
Unique_Word_count_USNewsText_V2    <- nrow(distinct(USNewsText_V2))
Unique_Word_count_USTwitterText_V2 <- nrow(distinct(USTwitterText_V2))
# Summary table with one row per corpus file.
# Built with data.frame() rather than cbind(): cbind() on a mix of character
# and numeric vectors produces a character matrix, which turned every
# statistic into text (e.g. the size printed as "200.424207687378").
# check.names = FALSE keeps the human-readable column names with spaces.
# The size column is labelled in MB because the MBites_* values are bytes
# divided by 1024 twice; the previous label ("Size In Bites") was wrong.
Report <- data.frame(
  "File" = c(file1, file2, file3),
  "Size In MB" = c(MBites_file1, MBites_file2, MBites_file3),
  "Number Of Lines" = c(TotalLine_File1, TotalLine_File2, TotalLine_File3),
  "Number of Character" = c(TotalChar_File1, TotalChar_File2, TotalChar_File3),
  "Unique Words" = c(
    Unique_Word_count_USBlogText_V2,
    Unique_Word_count_USNewsText_V2,
    Unique_Word_count_USTwitterText_V2
  ),
  "Total Words" = c(
    Total_Word_count_USBlogText_V2,
    Total_Word_count_USNewsText_V2,
    Total_Word_count_USTwitterText_V2
  ),
  check.names = FALSE,
  stringsAsFactors = FALSE
)
print(Report)
## File Size In Bites Number Of Lines Number of Character
## 1 en_US.blogs.txt 200.424207687378 899288 40833
## 2 en_US.news.txt 196.277512550354 2360148 5760
## 3 en_US.twitter.txt 159.364068984985 2360148 140
## Unique Words Total Words
## 1 30313 160451
## 2 30429 164167
## 3 14898 52468
Here we arbitrarily choose a threshold x for the filter n > x, so that each chart shows only the words that are used most often.
# Horizontal bar chart of the most frequent words in one dataset.
#
# word_counts: frequency table with columns `word` and `n`.
# min_count:   exclusive threshold; only words with n > min_count are drawn.
# file_name:   name of the source file, shown in the plot title.
#
# Returns the ggplot object; calling it at top level prints the chart.
# The three charts previously repeated this pipeline verbatim and differed
# only in the table, the threshold and the file name.
plot_frequent_words <- function(word_counts, min_count, file_name) {
  axis_title_style <- element_text(face = "bold", colour = "darkblue", size = 12)

  word_counts %>%
    filter(n > min_count) %>%
    # Order the bars by frequency rather than alphabetically.
    mutate(word = reorder(word, n)) %>%
    ggplot(aes(word, n)) +
    geom_col() +
    coord_flip() +
    labs(x = "Word \n", y = "\n Count ", title = paste("Frequent Words In ", file_name)) +
    # Print each count in white just inside the end of its bar.
    geom_text(aes(label = n), hjust = 1.2, colour = "white", fontface = "bold") +
    theme(plot.title = element_text(hjust = 0.5),
          axis.title.x = axis_title_style,
          axis.title.y = axis_title_style)
}

plot_frequent_words(Word_count_USBlogText_V2, 500, file1)
plot_frequent_words(Word_count_USNewsText_V2, 350, file2)
plot_frequent_words(Word_count_USTwitterText_V2, 250, file3)
The word clouds below show, in a user-friendly way, which words the authors use most often.
# Draw a word cloud from a frequency table with columns `word` and `n`.
# The random seed is reset before every cloud so each one is reproducible.
draw_word_cloud <- function(word_counts) {
  set.seed(1234)
  wordcloud(
    words = word_counts$word,
    freq = word_counts$n,
    min.freq = 10,
    max.words = 50,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2")
  )
}

draw_word_cloud(Word_count_USBlogText_V2)
draw_word_cloud(Word_count_USNewsText_V2)
draw_word_cloud(Word_count_USTwitterText_V2)