knitr::opts_chunk$set(echo = TRUE)
# Generate synthetic text data for demonstration
file1 <- c("This is the first line of file one.", 
           "Another line with some words.", 
           "File one contains text data for analysis.")

file2 <- c("Second file starts here.", 
           "It contains more text data than the first.", 
           "Text data is useful for machine learning projects.")

file3 <- c("Here we have the third file.", 
           "This file has fewer lines but still some text.", 
           "Exploratory data analysis is interesting.")
# Load required libraries
library(stringr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Function to summarize the datasets
summarize_data <- function(data, filename) {
  words_per_line <- str_count(data, "\\S+")  # a word = a run of non-whitespace
  data.frame(
    File = filename,
    Total_Lines = length(data),
    Total_Words = sum(words_per_line),
    Avg_Words_Per_Line = mean(words_per_line)
  )
}
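
# Sanity check (illustrative): "\\S+" matches maximal runs of non-whitespace,
# so a token like "first." counts as one word, punctuation included.
str_count("It contains more text data than the first.", "\\S+")
## [1] 8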

# Generate summaries for each file
summary_stats <- rbind(
  summarize_data(file1, "File 1"),
  summarize_data(file2, "File 2"),
  summarize_data(file3, "File 3")
)

# Display the summary statistics
summary_stats
##     File Total_Lines Total_Words Avg_Words_Per_Line
## 1 File 1           3          20           6.666667
## 2 File 2           3          20           6.666667
## 3 File 3           3          20           6.666667
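
# The same table can be built without three manual calls, e.g. with base R's
# Map() (a sketch; equivalent result):
# summary_stats <- do.call(rbind, Map(summarize_data,
#                                     list(file1, file2, file3),
#                                     c("File 1", "File 2", "File 3")))
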
# Load additional libraries for visualization
library(ggplot2)
library(purrr)

# Combine all data for visualization
all_data <- list(file1 = file1, file2 = file2, file3 = file3) %>% 
  map_df(~ data.frame(Line  = seq_along(.),
                      Text  = .,
                      Words = str_count(., "\\S+")),
         .id = "File")

# Check combined data to ensure it is populated
head(all_data)
##    File Line                                               Text Words
## 1 file1    1                This is the first line of file one.     8
## 2 file1    2                      Another line with some words.     5
## 3 file1    3          File one contains text data for analysis.     7
## 4 file2    1                           Second file starts here.     4
## 5 file2    2         It contains more text data than the first.     8
## 6 file2    3 Text data is useful for machine learning projects.     8
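
# Cross-check: the long format should reproduce the earlier summary
# (expected: 3 lines and 20 words for each file)
all_data %>%
  group_by(File) %>%
  summarise(Total_Lines = n(), Total_Words = sum(Words))
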
# Plot histogram of word counts
ggplot(all_data, aes(x = Words, fill = File)) +
  geom_histogram(binwidth = 1, alpha = 0.7, position = "dodge") +
  labs(
    title = "Word Count Distribution Across Files",
    x = "Number of Words per Line",
    y = "Frequency"
  ) +
  theme_minimal()
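
# Optional alternative view (sketch, not evaluated here): with only three
# lines per file, plotting the raw points can be easier to read than a
# dodged histogram.
# ggplot(all_data, aes(x = File, y = Words, colour = File)) +
#   geom_jitter(width = 0.1, height = 0, size = 3, show.legend = FALSE) +
#   theme_minimal()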

# Load library for text tokenization
library(tidytext)

# Tokenize data and calculate word frequencies
word_frequency <- all_data %>%
  unnest_tokens(word, Text) %>%
  count(word, sort = TRUE)

# Check word frequency to ensure it is populated
head(word_frequency)
##       word n
## 1     file 5
## 2     data 4
## 3     text 4
## 4       is 3
## 5      the 3
## 6 analysis 2
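
# Optional refinement: the ranking is dominated by function words ("is",
# "the"). tidytext ships a stop_words table; an anti_join would drop them:
# word_frequency_clean <- word_frequency %>%
#   anti_join(stop_words, by = "word")
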
# Plot the top 10 frequent words
ggplot(head(word_frequency, 10), aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 10 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  ) +
  coord_flip() +
  theme_minimal()
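
# Possible extension (sketch): per-file counts plus tidytext's reorder_within()
# and scale_x_reordered() would give a faceted top-terms chart per file.
# all_data %>%
#   unnest_tokens(word, Text) %>%
#   count(File, word, sort = TRUE) %>%
#   group_by(File) %>%
#   slice_max(n, n = 5, with_ties = FALSE) %>%
#   ggplot(aes(x = reorder_within(word, n, File), y = n, fill = File)) +
#   geom_col(show.legend = FALSE) +
#   scale_x_reordered() +
#   coord_flip() +
#   facet_wrap(~ File, scales = "free") +
#   theme_minimal()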