knitr::opts_chunk$set(echo = TRUE)
# Generate synthetic text data for demonstration
file1 <- c("This is the first line of file one.",
           "Another line with some words.",
           "File one contains text data for analysis.")
file2 <- c("Second file starts here.",
           "It contains more text data than the first.",
           "Text data is useful for machine learning projects.")
file3 <- c("Here we have the third file.",
           "This file has fewer lines but still some text.",
           "Exploratory data analysis is interesting.")
# Load required libraries
library(stringr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Summarize one file: line count, total words (runs of non-whitespace),
# and average words per line
summarize_data <- function(data, filename) {
  data.frame(
    File = filename,
    Total_Lines = length(data),
    Total_Words = sum(str_count(data, "\\S+")),
    Avg_Words_Per_Line = mean(str_count(data, "\\S+"))
  )
}
# Generate summaries for each file
summary_stats <- rbind(
  summarize_data(file1, "File 1"),
  summarize_data(file2, "File 2"),
  summarize_data(file3, "File 3")
)
# Display the summary statistics
summary_stats
## File Total_Lines Total_Words Avg_Words_Per_Line
## 1 File 1 3 20 6.666667
## 2 File 2 3 20 6.666667
## 3 File 3 3 20 6.666667
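# Aside (not in the original analysis): with many files, the rbind() call
# above gets repetitive. A minimal sketch of one alternative, keeping the
# vectors in a named list and letting Map() pair each vector with its label:
files <- list("File 1" = file1, "File 2" = file2, "File 3" = file3)
summary_stats_alt <- do.call(rbind, Map(summarize_data, files, names(files)))
# summary_stats_alt matches summary_stats row for row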
# Load additional libraries for visualization
library(ggplot2)
library(purrr)
# Combine all files into one long data frame for visualization:
# one row per line of text, tagged with source file and word count
all_data <- list(file1 = file1, file2 = file2, file3 = file3) %>%
  map_df(~ data.frame(Line = seq_along(.x),
                      Text = .x,
                      Words = str_count(.x, "\\S+")),
         .id = "File")
# Check combined data to ensure it is populated
head(all_data)
## File Line Text Words
## 1 file1 1 This is the first line of file one. 8
## 2 file1 2 Another line with some words. 5
## 3 file1 3 File one contains text data for analysis. 7
## 4 file2 1 Second file starts here. 4
## 5 file2 2 It contains more text data than the first. 8
## 6 file2 3 Text data is useful for machine learning projects. 8
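# Optional sanity check (not in the original): per-file totals derived
# from all_data should reproduce the Total_Words column computed earlier.
all_data %>%
  group_by(File) %>%
  summarise(Total_Words = sum(Words))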
# Plot a dodged histogram of per-line word counts, one fill per file
ggplot(all_data, aes(x = Words, fill = File)) +
  geom_histogram(binwidth = 1, alpha = 0.7, position = "dodge") +
  labs(
    title = "Word Count Distribution Across Files",
    x = "Number of Words per Line",
    y = "Frequency"
  ) +
  theme_minimal()
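# Alternative view (a sketch, not in the original): with more files the
# dodged bars get cramped; faceting draws one small panel per file instead.
ggplot(all_data, aes(x = Words)) +
  geom_histogram(binwidth = 1, fill = "steelblue") +
  facet_wrap(~ File) +
  theme_minimal()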

# Load library for text tokenization
library(tidytext)
# Tokenize lines into words and calculate overall word frequencies
word_frequency <- all_data %>%
  unnest_tokens(word, Text) %>%
  count(word, sort = TRUE)
# Check word frequency to ensure it is populated
head(word_frequency)
## word n
## 1 file 5
## 2 data 4
## 3 text 4
## 4 is 3
## 5 the 3
## 6 analysis 2
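# Refinement worth considering (not in the original): the top words include
# function words such as "is" and "the". tidytext ships a stop_words table;
# an anti_join() drops those tokens before counting.
word_frequency_clean <- all_data %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)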
# Plot the top 10 most frequent words as horizontal bars
ggplot(head(word_frequency, 10), aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  labs(
    title = "Top 10 Most Frequent Words",
    x = "Words",
    y = "Frequency"
  ) +
  coord_flip() +
  theme_minimal()
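# Possible next step (a sketch, not part of the original analysis): tf-idf
# down-weights words shared by every file, surfacing file-specific terms.
# bind_tf_idf() comes with tidytext.
file_words <- all_data %>%
  unnest_tokens(word, Text) %>%
  count(File, word, sort = TRUE)
file_words %>%
  bind_tf_idf(word, File, n) %>%
  arrange(desc(tf_idf)) %>%
  head()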
