# Setting the working directory
setwd("/Users/MAC/Desktop/STUFF/RICHMOND/MSc Data Science/Sem 4/Capstone Project/Capstone Project/dataset/en_US")
# Loading the necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Storing the file paths in variables
twitter_data <- "en_US.twitter.txt"
blog_data <- "en_US.blogs.txt"
news_data <- "en_US.news.txt"
# Loading data
twitter_db <- readLines(twitter_data, warn = FALSE, encoding = "UTF-8")
blogs_db <- readLines(blog_data, warn = FALSE, encoding = "UTF-8")
news_db <- readLines(news_data, warn = FALSE, encoding = "UTF-8")
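# Note: readr::read_lines() (attached with tidyverse above) is typically faster
# than base readLines() on files of this size; a minimal sanity-check sketch:
read_lines(twitter_data, n_max = 3)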
# Displaying summary statistics
summary_db <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Lines = c(length(twitter_db), length(blogs_db), length(news_db)),
  Longest_Line = c(max(nchar(twitter_db)), max(nchar(blogs_db)), max(nchar(news_db))),
  File_Size_MB = c(file.info(twitter_data)$size / (1024^2),
                   file.info(blog_data)$size / (1024^2),
                   file.info(news_data)$size / (1024^2))
)
summary_db
##   Dataset   Lines Longest_Line File_Size_MB
## 1 Twitter 2360148          140     159.3641
## 2   Blogs  899288        40833     200.4242
## 3    News 1010242        11384     196.2775
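# The summary above omits word counts; a minimal base-R sketch (whitespace
# tokenization, so the counts are approximate):
word_counts <- sapply(list(Twitter = twitter_db, Blogs = blogs_db, News = news_db),
                      function(db) sum(lengths(strsplit(db, "\\s+"))))
word_counts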
# Sampling 2,000 lines from each dataset for the analysis
set.seed(123)  # fix the RNG seed so the samples (and the plots below) are reproducible
twitter_db2 <- sample(twitter_db, 2000)
blog_db2 <- sample(blogs_db, 2000)
news_db2 <- sample(news_db, 2000)
# Function to calculate word frequencies
freq_w <- function(db) {
  # Lowercase, split on runs of non-word characters, and drop empty tokens
  freq_words <- unlist(strsplit(tolower(db), "\\W+"))
  freq_words <- freq_words[freq_words != ""]
  table(freq_words)
}
# Calculate word frequencies for each dataset
twitter_frequency <- freq_w(twitter_db2)
blogs_frequency <- freq_w(blog_db2)
news_frequency <- freq_w(news_db2)
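# tidytext is loaded but unused above; an equivalent, more idiomatic tokenization
# is sketched here for the Twitter sample:
tibble(text = twitter_db2) %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  head(10)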
# Plot the 30 most common words in the Twitter sample
twitter_top <- head(sort(twitter_frequency, decreasing = TRUE), 30)
twitter_top2 <- data.frame(Word = names(twitter_top), Frequency = as.numeric(twitter_top))
ggplot(twitter_top2, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 30 Words in Twitter Data", x = "Word", y = "Frequency")

# Calculate line lengths
twitter_length <- nchar(twitter_db)
blogs_length <- nchar(blogs_db)
news_length <- nchar(news_db)
# Plotting histograms of line lengths
# Twitter
ggplot() +
  geom_histogram(aes(x = twitter_length), binwidth = 10, fill = "blue", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Twitter Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# Blogs
ggplot() +
  geom_histogram(aes(x = blogs_length), binwidth = 100, fill = "green", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Blogs Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# News
ggplot() +
  geom_histogram(aes(x = news_length), binwidth = 50, fill = "orange", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in News Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# Finer-grained histograms of the same character counts per line (smaller binwidths)
# Twitter
ggplot() +
  geom_histogram(aes(x = twitter_length), binwidth = 1, fill = "blue", alpha = 0.5) +
  labs(title = "Character Count Distribution in Twitter Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()

# Blogs
ggplot() +
  geom_histogram(aes(x = blogs_length), binwidth = 50, fill = "green", alpha = 0.5) +
  labs(title = "Character Count Distribution in Blogs Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()

# News
ggplot() +
  geom_histogram(aes(x = news_length), binwidth = 20, fill = "orange", alpha = 0.5) +
  labs(title = "Character Count Distribution in News Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()
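# Optionally, the six single-panel histograms above can be consolidated into one
# faceted figure; a sketch (free x scales, since line lengths differ by orders of
# magnitude across sources):
lengths_df <- bind_rows(
  tibble(Source = "Twitter", Length = twitter_length),
  tibble(Source = "Blogs", Length = blogs_length),
  tibble(Source = "News", Length = news_length)
)
ggplot(lengths_df, aes(x = Length)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Source, scales = "free") +
  labs(title = "Line Length Distributions by Source",
       x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()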

# Ratio of lines containing 'love' to lines containing 'hate' in the Twitter data
love_count <- sum(grepl("\\blove\\b", tolower(twitter_db)))
hate_count <- sum(grepl("\\bhate\\b", tolower(twitter_db)))
love_hate_ratio <- love_count / hate_count
cat("The ratio of 'love' to 'hate' in Twitter data is:", love_hate_ratio, "\n")
## The ratio of 'love' to 'hate' in Twitter data is: 5.675705
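# grepl() flags lines, not occurrences; a sketch counting every occurrence
# instead, using stringr (attached with tidyverse):
love_total <- sum(str_count(tolower(twitter_db), "\\blove\\b"))
hate_total <- sum(str_count(tolower(twitter_db), "\\bhate\\b"))
cat("Occurrence-level 'love'/'hate' ratio:", love_total / hate_total, "\n")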
# Some Interesting Findings
# Word frequencies: common English function words such as "the", "to", and "and" dominate the frequency charts across all three datasets.
# Line lengths: Twitter data, constrained by the platform's 140-character limit, shows a distribution that peaks at the cap; the blogs data contains the longest individual lines, exceeding 40,000 characters.
# Longest lines: the longest line in the blogs dataset is far longer than those in the Twitter and news datasets, indicating that blog entries can run much longer than tweets or news articles.
# Word ratios: in the Twitter dataset, the ratio of lines containing "love" to lines containing "hate" is approximately 5.7, suggesting a higher occurrence of positive sentiment.