# Setting the working directory
setwd("/Users/MAC/Desktop/STUFF/RICHMOND/MSc Data Science/Sem 4/Capstone Project/Capstone Project/dataset/en_US")
# Loading the necessary libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidytext)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Storing the file paths in variables
twitter_data <- "en_US.twitter.txt"
blog_data <- "en_US.blogs.txt"
news_data <- "en_US.news.txt"
# Loading data
twitter_db <- readLines(twitter_data, warn = FALSE, encoding = "UTF-8")
blogs_db <- readLines(blog_data, warn = FALSE, encoding = "UTF-8")
news_db <- readLines(news_data, warn = FALSE, encoding = "UTF-8")
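# Note: readr::read_lines() (attached with tidyverse above) is typically faster
# than base readLines() on files of this size; a minimal sanity-check sketch:
read_lines(twitter_data, n_max = 3)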
# Displaying summary statistics
summary_db <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Lines = c(length(twitter_db), length(blogs_db), length(news_db)),
  Longest_Line = c(max(nchar(twitter_db)), max(nchar(blogs_db)), max(nchar(news_db))),
  File_Size_MB = c(file.info(twitter_data)$size / (1024^2),
                   file.info(blog_data)$size / (1024^2),
                   file.info(news_data)$size / (1024^2))
)
summary_db
##   Dataset   Lines Longest_Line File_Size_MB
## 1 Twitter 2360148          140     159.3641
## 2   Blogs  899288        40833     200.4242
## 3    News 1010242        11384     196.2775
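# The summary above omits word counts; a minimal base-R sketch (whitespace
# tokenization, so the counts are approximate):
word_counts <- sapply(list(Twitter = twitter_db, Blogs = blogs_db, News = news_db),
                      function(db) sum(lengths(strsplit(db, "\\s+"))))
word_counts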
# Sampling 2,000 lines from each dataset for the analysis
set.seed(123)  # fix the RNG seed so the samples (and the plots below) are reproducible
twitter_db2 <- sample(twitter_db, 2000)
blog_db2 <- sample(blogs_db, 2000)
news_db2 <- sample(news_db, 2000)
# Function to calculate word frequencies
freq_w <- function(db) {
  # Lowercase, split on runs of non-word characters, and drop empty tokens
  freq_words <- unlist(strsplit(tolower(db), "\\W+"))
  freq_words <- freq_words[freq_words != ""]
  table(freq_words)
}
# Calculate word frequencies for each dataset
twitter_frequency <- freq_w(twitter_db2)
blogs_frequency <- freq_w(blog_db2)
news_frequency <- freq_w(news_db2)
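# tidytext is loaded but unused above; an equivalent, more idiomatic tokenization
# is sketched here for the Twitter sample:
tibble(text = twitter_db2) %>%
  unnest_tokens(word, text) %>%
  count(word, sort = TRUE) %>%
  head(10)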
# Plot the 30 most common words in the Twitter sample
twitter_top <- head(sort(twitter_frequency, decreasing = TRUE), 30)
twitter_top2 <- data.frame(Word = names(twitter_top), Frequency = as.numeric(twitter_top))
ggplot(twitter_top2, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 30 Words in Twitter Data", x = "Word", y = "Frequency")

# Calculate line lengths
twitter_length <- nchar(twitter_db)
blogs_length <- nchar(blogs_db)
news_length <- nchar(news_db)
# Plotting histograms of line lengths
# Twitter
ggplot() +
  geom_histogram(aes(x = twitter_length), binwidth = 10, fill = "blue", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Twitter Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# Blogs
ggplot() +
  geom_histogram(aes(x = blogs_length), binwidth = 100, fill = "green", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Blogs Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# News
ggplot() +
  geom_histogram(aes(x = news_length), binwidth = 50, fill = "orange", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in News Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

# Finer-grained histograms of the same character counts per line (smaller binwidths)
# Twitter
ggplot() +
  geom_histogram(aes(x = twitter_length), binwidth = 1, fill = "blue", alpha = 0.5) +
  labs(title = "Character Count Distribution in Twitter Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()

# Blogs
ggplot() +
  geom_histogram(aes(x = blogs_length), binwidth = 50, fill = "green", alpha = 0.5) +
  labs(title = "Character Count Distribution in Blogs Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()

# News
ggplot() +
  geom_histogram(aes(x = news_length), binwidth = 20, fill = "orange", alpha = 0.5) +
  labs(title = "Character Count Distribution in News Data", x = "Character Count per Line", y = "Frequency") +
  theme_minimal()
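# Optionally, the six single-panel histograms above can be consolidated into one
# faceted figure; a sketch (free x scales, since line lengths differ by orders of
# magnitude across sources):
lengths_df <- bind_rows(
  tibble(Source = "Twitter", Length = twitter_length),
  tibble(Source = "Blogs", Length = blogs_length),
  tibble(Source = "News", Length = news_length)
)
ggplot(lengths_df, aes(x = Length)) +
  geom_histogram(bins = 50) +
  facet_wrap(~ Source, scales = "free") +
  labs(title = "Line Length Distributions by Source",
       x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()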

# Ratio of lines containing 'love' to lines containing 'hate' in the Twitter data
love_count <- sum(grepl("\\blove\\b", tolower(twitter_db)))
hate_count <- sum(grepl("\\bhate\\b", tolower(twitter_db)))
love_hate_ratio <- love_count / hate_count
cat("The ratio of 'love' to 'hate' in Twitter data is:", love_hate_ratio, "\n")
## The ratio of 'love' to 'hate' in Twitter data is: 5.675705
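# grepl() flags lines, not occurrences; a sketch counting every occurrence
# instead, using stringr (attached with tidyverse):
love_total <- sum(str_count(tolower(twitter_db), "\\blove\\b"))
hate_total <- sum(str_count(tolower(twitter_db), "\\bhate\\b"))
cat("Occurrence-level 'love'/'hate' ratio:", love_total / hate_total, "\n")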
# Some Interesting Findings
# Word frequencies: common English function words such as "the", "to", and "and" dominate the frequency charts across all three datasets.
# Line lengths: Twitter data, constrained by the platform's 140-character limit, shows a distribution that peaks at the cap; the blogs data contains the longest individual lines, exceeding 40,000 characters.
# Longest lines: the longest line in the blogs dataset is far longer than those in the Twitter and news datasets, indicating that blog entries can run much longer than tweets or news articles.
# Word ratios: in the Twitter dataset, the ratio of lines containing "love" to lines containing "hate" is approximately 5.7, suggesting a higher occurrence of positive sentiment.