# Project: Exploratory Data Analysis and Predictive Text Model Development

# Introduction

The goal of this project is to demonstrate proficiency in handling large text datasets and to lay the groundwork for developing a predictive text model. In this project, we use the en_US datasets provided by SwiftKey, which contain text from three sources: Twitter, blogs, and news articles. Our objectives are to perform an exploratory data analysis (EDA) to understand the structure and key features of these datasets, and to outline our plans for a prediction algorithm and a Shiny app that uses it.

# Loading and Summary of Data

# Load necessary libraries (tidyverse already includes ggplot2 and dplyr)
library(tidyverse)
library(tidytext)

# Define file paths
twitter_file <- "C:/Users/DELL/Documents/en_US.twitter.txt"
blogs_file <- "C:/Users/DELL/Documents/en_US.blogs.txt"
news_file <- "C:/Users/DELL/Documents/en_US.news.txt"

# Load data
twitter_data <- readLines(twitter_file, warn = FALSE, encoding = "UTF-8")
blogs_data <- readLines(blogs_file, warn = FALSE, encoding = "UTF-8")
news_data <- readLines(news_file, warn = FALSE, encoding = "UTF-8")
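One caveat worth noting: readLines() stops early if it hits an embedded nul character, and the en_US news file is known to trigger this on some platforms, so the news line count below may understate the file's true size. A defensive alternative (a sketch; news_data_fixed is a hypothetical name, and this variant is not used for the results reported below) reads the file in binary mode:

# Binary-mode read that tolerates embedded nul characters
con <- file(news_file, open = "rb")
news_data_fixed <- readLines(con, warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)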

# Display summary statistics
summary_stats <- data.frame(
  Dataset = c("Twitter", "Blogs", "News"),
  Lines = c(length(twitter_data), length(blogs_data), length(news_data)),
  Longest_Line = c(max(nchar(twitter_data)), max(nchar(blogs_data)), max(nchar(news_data))),
  File_Size_MB = c(file.info(twitter_file)$size / (1024^2), 
                   file.info(blogs_file)$size / (1024^2), 
                   file.info(news_file)$size / (1024^2))
)

summary_stats
##   Dataset   Lines Longest_Line File_Size_MB
## 1 Twitter 2360148          140     159.3641
## 2   Blogs  899288        40833     200.4242
## 3    News   77259         5760     196.2775

# Basic Exploratory Analysis

# Sample data for analysis (seed fixed so the sample is reproducible)
set.seed(123)
sample_twitter <- sample(twitter_data, 1000)
sample_blogs <- sample(blogs_data, 1000)
sample_news <- sample(news_data, 1000)

# Function to calculate word frequencies (split on runs of non-word characters)
word_freq <- function(data) {
  words <- unlist(strsplit(tolower(data), "\\W+"))
  words <- words[words != ""]  # drop empty strings left at line starts
  table(words)
}

# Calculate word frequencies for each dataset
twitter_freq <- word_freq(sample_twitter)
blogs_freq <- word_freq(sample_blogs)
news_freq <- word_freq(sample_news)

# Create a plot of the most common words in Twitter data
twitter_top_words <- head(sort(twitter_freq, decreasing = TRUE), 20)
twitter_top_words_df <- data.frame(Word = names(twitter_top_words), Frequency = as.numeric(twitter_top_words))

ggplot(twitter_top_words_df, aes(x = reorder(Word, Frequency), y = Frequency)) +
  geom_col() +
  coord_flip() +
  labs(title = "Top 20 Words in Twitter Data", x = "Word", y = "Frequency")

# Calculate line lengths
twitter_lengths <- nchar(twitter_data)
blogs_lengths <- nchar(blogs_data)
news_lengths <- nchar(news_data)

# Plot histograms
ggplot() +
  geom_histogram(aes(x = twitter_lengths), binwidth = 10, fill = "blue", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Twitter Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

ggplot() +
  geom_histogram(aes(x = blogs_lengths), binwidth = 100, fill = "green", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in Blogs Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()

ggplot() +
  geom_histogram(aes(x = news_lengths), binwidth = 50, fill = "red", alpha = 0.5) +
  labs(title = "Distribution of Line Lengths in News Data", x = "Line Length (characters)", y = "Frequency") +
  theme_minimal()
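A compact numeric summary makes the three distributions easier to compare side by side (a small sketch built from the lengths computed above):

# Numeric summary of line lengths per source
line_length_summary <- tibble(
  Dataset = c("Twitter", "Blogs", "News"),
  Mean   = c(mean(twitter_lengths), mean(blogs_lengths), mean(news_lengths)),
  Median = c(median(twitter_lengths), median(blogs_lengths), median(news_lengths)),
  Max    = c(max(twitter_lengths), max(blogs_lengths), max(news_lengths))
)
line_length_summary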


# Create a function to tokenize and count word frequencies with tidytext
# (this tidy version supersedes the earlier base-R word_freq())
word_freq <- function(data) {
  tibble(text = data) %>%
    unnest_tokens(word, text) %>%
    count(word, sort = TRUE)
}

# Calculate word frequencies for each dataset
twitter_word_freq <- word_freq(twitter_data)
blogs_word_freq <- word_freq(blogs_data)
news_word_freq <- word_freq(news_data)
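Because these frequency tables are sorted, it is straightforward to ask how many unique words are needed to cover a given share of all word instances, which will later inform dictionary sizing for the prediction model (a sketch; coverage() is a helper introduced here):

# How many unique words cover a given fraction of all word instances?
coverage <- function(freq_df, target) {
  cum_share <- cumsum(freq_df$n) / sum(freq_df$n)
  which(cum_share >= target)[1]
}
coverage(twitter_word_freq, 0.5)  # unique words needed for 50% coverage
coverage(twitter_word_freq, 0.9)  # unique words needed for 90% coverage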

# Plot histograms of word frequencies
ggplot(twitter_word_freq, aes(x = n)) +
  geom_histogram(binwidth = 1, fill = "blue", alpha = 0.5) +
  labs(title = "Word Frequency Distribution in Twitter Data", x = "Frequency", y = "Count") +
  theme_minimal()

ggplot(blogs_word_freq, aes(x = n)) +
  geom_histogram(binwidth = 5, fill = "green", alpha = 0.5) +
  labs(title = "Word Frequency Distribution in Blogs Data", x = "Frequency", y = "Count") +
  theme_minimal()

ggplot(news_word_freq, aes(x = n)) +
  geom_histogram(binwidth = 2, fill = "red", alpha = 0.5) +
  labs(title = "Word Frequency Distribution in News Data", x = "Frequency", y = "Count") +
  theme_minimal()
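Since common function words dominate these counts, dropping standard stop words surfaces more informative terms; a minimal sketch using the stop_words lexicon that ships with tidytext:

# Word frequencies with standard English stop words removed
twitter_content_freq <- twitter_word_freq %>%
  anti_join(stop_words, by = "word")
head(twitter_content_freq, 10)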


# Interesting Findings

- **Word Frequencies:** Common English words such as “the”, “to”, and “and” dominate the frequency charts across all three datasets.
- **Line Lengths:** Twitter data, constrained by the platform's 140-character limit, shows a distribution that peaks near 140 characters. The blogs data exhibits the longest individual lines, reaching over 40,000 characters.
- **Longest Lines:** The longest line in the blogs dataset is far longer than those in the Twitter and news datasets, indicating that blog entries can be much lengthier than tweets or news articles.
- **Word Ratios:** In the Twitter dataset, lines containing the word “love” outnumber lines containing “hate” by roughly 5.7 to 1 (computed below), indicating a higher occurrence of positive sentiment.

# Calculate the ratio of 'love' to 'hate'
love_count <- sum(grepl("\\blove\\b", tolower(twitter_data)))
hate_count <- sum(grepl("\\bhate\\b", tolower(twitter_data)))
love_hate_ratio <- love_count / hate_count
cat("The ratio of 'love' to 'hate' in Twitter data is:", love_hate_ratio, "\n")
## The ratio of 'love' to 'hate' in Twitter data is: 5.675705

# Plans for Prediction Algorithm and Shiny App

- **Prediction Algorithm:** Develop a model that predicts the next word in a sentence from the preceding words, starting with n-gram models and potentially exploring neural networks; a minimal sketch follows this list.
- **Shiny App:** Create an interactive Shiny app that lets users input text and receive predictive suggestions in real time.
- **Further Analysis:** Delve deeper into the context and semantics of the data to improve prediction accuracy.
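As a first illustration of the n-gram idea, a bigram lookup table can be built from the Twitter sample drawn earlier; this is a minimal sketch (bigrams and predict_next_word() are illustrative names introduced here, not the final model):

# Build a bigram frequency table from the Twitter sample
bigrams <- tibble(text = sample_twitter) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  count(word1, word2, sort = TRUE)

# Suggest the most frequent followers of the previous word
predict_next_word <- function(prev_word, top_n = 3) {
  bigrams %>%
    filter(word1 == tolower(prev_word)) %>%
    slice_head(n = top_n) %>%
    pull(word2)
}

predict_next_word("love")

The Shiny app would wrap such a lookup in a text box plus a suggestion display; a minimal sketch, assuming predict_next_word() from above:

library(shiny)

ui <- fluidPage(
  textInput("phrase", "Type a phrase:"),
  textOutput("suggestions")
)

server <- function(input, output) {
  output$suggestions <- renderText({
    words <- strsplit(trimws(input$phrase), "\\s+")[[1]]
    if (length(words) == 0 || words[1] == "") return("")
    paste(predict_next_word(tail(words, 1)), collapse = ", ")
  })
}

shinyApp(ui, server)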