# Packages used by this script: ggplot2 for the charts, dplyr for data
# manipulation helpers. Each call must be its own statement to parse.
library(ggplot2)
library(dplyr)
# Read the three en_US text files into character vectors (one element per
# line). The directory is defined once so the script can be pointed at a
# different location without editing every path.
data_dir <- "C:/Users/Yashika singh/OneDrive/Desktop/final"

blogs <- readLines(file.path(data_dir, "en_US.blogs.txt"))
twitter <- readLines(file.path(data_dir, "en_US.twitter.txt"))
news <- readLines(file.path(data_dir, "en_US.news.txt"))
# Number of lines in the blogs corpus.
length(blogs)

# Line-length statistics (characters per line) for each corpus.
# nchar() yields vectors of different lengths for the three corpora, so the
# summary is computed per corpus; calling summary() on the combined list
# would only report Length/Class/Mode for each element.
lapply(
  list(Blogs = blogs, Twitter = twitter, News = news),
  function(lines) summary(nchar(lines))
)
# Count, for each term, how many lines contain it.
#
# terms: character vector of patterns (interpreted by grepl() as regular
#        expressions; matching is case-sensitive).
# lines: character vector to search, one element per line.
# Returns a named integer vector (names are the terms). A line containing a
# term several times is counted once.
count_lines_matching <- function(terms, lines) {
  vapply(terms, function(term) sum(grepl(term, lines)), integer(1))
}

# Twitter: lines mentioning each term.
twitter_terms <- c(
  "love", "hate", "Future", "Ai", "Complete", "knowledge", "friends"
)
count_lines_matching(twitter_terms, twitter)

# Blogs: longest line (in characters) and lines mentioning each term.
max(nchar(blogs))
blogs_terms <- c("months", "tuesday", "world", "benefits", "fairy")
count_lines_matching(blogs_terms, blogs)

# News: longest line (in characters) and lines mentioning each term.
max(nchar(news))
news_terms <- c("Alone", "Said", "She", "Working", "toward")
count_lines_matching(news_terms, news)
# Histogram of the number of characters per line in the blogs corpus.
hist(
  nchar(blogs),
  main = "Line Lengths in Blogs",
  xlab = "Characters per line"
)
# Number of lines in each corpus, one row per dataset.
line_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Line_Count = c(length(blogs), length(twitter), length(news))
)
library(ggplot2)
# Line count per corpus, used as the input for the pie chart below.
data_size <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Lines = c(length(blogs), length(twitter), length(news))
)

# Pie chart of each corpus's share of the total line count: a single stacked
# bar (constant x) wrapped into polar coordinates along the y axis.
ggplot(data_size, aes(x = "", y = Lines, fill = Dataset)) +
  geom_col(width = 1) +
  coord_polar(theta = "y") +
  labs(
    title = "Proportion of Lines in Each Dataset",
    x = "",
    y = ""
  ) +
  theme_void()
# Number of lines in each corpus, one row per dataset.
# NOTE(review): this repeats an identical `line_counts` assignment made
# earlier in the script; one of the two can be dropped.
line_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Line_Count = c(length(blogs), length(twitter), length(news))
)
# Count whitespace-separated words across a character vector.
#
# text: character vector (e.g. one element per line).
# Returns the total number of words as a single integer. Empty strings,
# whitespace-only strings and NA elements contribute zero.
word_count <- function(text) {
  text <- text[!is.na(text)]
  # strsplit() emits an empty first token when a string starts with the
  # separator, so leading whitespace is removed before splitting. Trailing
  # whitespace does not produce an extra token.
  text <- sub("^\\s+", "", text)
  sum(lengths(strsplit(text, "\\s+")))
}
# Total word count of each corpus, one row per dataset.
word_counts <- data.frame(
  Dataset = c("Blogs", "Twitter", "News"),
  Word_Count = c(
    word_count(blogs),
    word_count(twitter),
    word_count(news)
  )
)
# Install rmarkdown and knitr only when they are not already available, so
# re-running the script does not trigger a reinstall each time.
report_pkgs <- c("rmarkdown", "knitr")
is_installed <- vapply(
  report_pkgs, requireNamespace, logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(report_pkgs[!is_installed])
}