# Load libraries ----
# ggplot2 is used for the plots in this script; dplyr makes the %>% pipe
# available. Each library() call must be its own statement.
library(ggplot2)
library(dplyr)

# Read the three en_US corpora (blogs, twitter, news) ----

# Directory holding the raw text files; kept in one place so it only has to
# be changed once when the data moves.
data_dir <- "C:/Users/Yashika singh/OneDrive/Desktop/final"

# Read every line of one corpus file into a character vector.
#
# path: path to a plain-text file, one document per line.
# Returns a character vector with one element per line.
#
# NOTE(review): the files are assumed to be UTF-8 and may contain embedded
# NUL / Ctrl-Z bytes -- confirm against the data. Opening the connection in
# binary mode and passing skipNul = TRUE keeps such bytes from truncating a
# line or ending the read early.
read_corpus <- function(path) {
  con <- file(path, open = "rb")
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
}

blogs <- read_corpus(file.path(data_dir, "en_US.blogs.txt"))
twitter <- read_corpus(file.path(data_dir, "en_US.twitter.txt"))
news <- read_corpus(file.path(data_dir, "en_US.news.txt"))

# Basic summaries ----

# Number of lines in the blogs corpus.
length(blogs)

# Line-length summary (characters per line) for each corpus. lapply() over a
# named list keeps one summary per corpus; the three vectors have different
# lengths, so they cannot be simplified into a single matrix.
lapply(
  list(blogs = blogs, twitter = twitter, news = news),
  function(corpus) summary(nchar(corpus))
)

# Count how many lines of `text` match each pattern.
#
# patterns: character vector of regular expressions (matched case-sensitively,
#           anywhere in the line, so "She" also matches "Shed").
# text:     character vector of lines.
# Returns a named integer vector: one line count per pattern.
count_matching_lines <- function(patterns, text) {
  vapply(patterns, function(p) sum(grepl(p, text)), integer(1))
}

# Twitter: lines mentioning each term.
count_matching_lines(
  c("love", "hate", "Future", "Ai", "Complete", "knowledge", "friends"),
  twitter
)

# Blogs: longest line, then lines mentioning each term.
max(nchar(blogs))
count_matching_lines(
  c("months", "tuesday", "world", "benefits", "fairy"),
  blogs
)

# News: longest line, then lines mentioning each term.
max(nchar(news))
count_matching_lines(
  c("Alone", "Said", "She", "Working", "toward"),
  news
)

# Histogram of the number of characters in each line of the blogs corpus.
hist(
  nchar(blogs),
  main = "Line Lengths in Blogs",
  xlab = "Characters per line"
)

# Number of lines in each corpus, one row per dataset.
line_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Line_Count = c(length(blogs), length(twitter), length(news))
)

library(ggplot2)

# Line counts per corpus, in the shape the pie chart needs
# (column `Lines` is mapped to the slice size).
data_size <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Lines = c(length(blogs), length(twitter), length(news))
)

# Pie chart of each corpus's share of the total line count: a single stacked
# bar (constant x = "") wrapped into polar coordinates along y.
ggplot(data_size, aes(x = "", y = Lines, fill = Dataset)) +
  geom_bar(stat = "identity", width = 1) +
  coord_polar("y") +
  labs(
    title = "Proportion of Lines in Each Dataset",
    x = "",
    y = ""
  ) +
  theme_void()

# Number of lines in each corpus, one row per dataset.
# NOTE(review): this rebuilds the same `line_counts` table that is already
# created earlier in the script; one of the two definitions can likely go.
line_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Line_Count = c(length(blogs), length(twitter), length(news))
)

# Count whitespace-separated words across all lines of a character vector.
#
# text: character vector, one element per line.
# Returns a single integer: the total number of words over every element.
# Empty, whitespace-only and NA elements contribute zero words.
word_count <- function(text) {
  # Strip leading whitespace first: strsplit() would otherwise emit an empty
  # first token for such lines and inflate the count by one.
  lines <- sub("^\\s+", "", text)
  lines <- lines[!is.na(lines) & nzchar(lines)]
  # The backslash must be doubled inside an R string literal ("\\s+").
  sum(lengths(strsplit(lines, "\\s+")))
}

# Total word count of each corpus, one row per dataset.
word_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Word_Count = c(
    word_count(blogs),
    word_count(twitter),
    word_count(news)
  )
)

# Install the report-rendering packages only when they are not already
# available, so re-running the script does not reinstall them every time.
report_pkgs <- c("rmarkdown", "knitr")
missing_pkgs <- report_pkgs[
  !vapply(report_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}