This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document is generated that includes both the content and the output of any embedded R code chunks. You can embed an R code chunk like this:
summary(cars)
##      speed           dist
##  Min.   : 4.0   Min.   :  2.00
##  1st Qu.:12.0   1st Qu.: 26.00
##  Median :15.0   Median : 36.00
##  Mean   :15.4   Mean   : 42.98
##  3rd Qu.:19.0   3rd Qu.: 56.00
##  Max.   :25.0   Max.   :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
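For instance, a chunk written the way the default R Markdown template does it hides its source code but still renders the figure; the chunk label and the plotted dataset here are only illustrative:

```{r pressure, echo=FALSE}
plot(pressure)
```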
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
SwiftKey Capstone Project - Exploratory Data Analysis
Author: Your Name
Date: 2024
"""
import os
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings

warnings.filterwarnings('ignore')
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
class SwiftKeyEDA:
    def __init__(self, data_path='final/en_US/'):
        """
        Initialize the EDA class with the path to the data files
        """
        self.data_path = data_path
        self.files = {
            'blogs': 'en_US.blogs.txt',
            'news': 'en_US.news.txt',
            'twitter': 'en_US.twitter.txt'
        }
        self.data = {}
        self.stats = {}
def load_data(self, sample_size=None):
"""
Load the text files
If sample_size is provided, load only that many lines from each file
"""
print("Loading data files...")
for name, filename in self.files.items():
filepath = os.path.join(self.data_path, filename)
try:
with open(filepath, 'r', encoding='utf-8') as f:
if sample_size:
lines = []
for i, line in enumerate(f):
if i >= sample_size:
break
lines.append(line.strip())
self.data[name] = lines
else:
self.data[name] = f.readlines()
print(f" ✓ Loaded {name}: {len(self.data[name]):,} lines")
except Exception as e:
print(f" ✗ Error loading {name}: {e}")
def get_basic_stats(self):
"""
Calculate basic statistics for each dataset
"""
print("\nCalculating basic statistics...")
for name, lines in self.data.items():
filepath = os.path.join(self.data_path, self.files[name])
# File size in MB
file_size = os.path.getsize(filepath) / (1024 * 1024)
# Line statistics
num_lines = len(lines)
line_lengths = [len(line) for line in lines]
# Word statistics
all_words = []
for line in lines[:10000]: # Limit for performance
words = word_tokenize(line.lower())
all_words.extend(words)
stats = {
'file_size_mb': round(file_size, 2),
'num_lines': num_lines,
'total_chars': sum(line_lengths),
'max_line_length': max(line_lengths),
'min_line_length': min(line_lengths),
'avg_line_length': round(np.mean(line_lengths), 2),
'total_words': len(all_words),
'unique_words': len(set(all_words))
}
self.stats[name] = stats
return pd.DataFrame(self.stats).T
def display_stats_table(self):
"""
Display statistics in a formatted table
"""
df = self.get_basic_stats()
# Format for display
display_df = df.copy()
display_df['file_size_mb'] = display_df['file_size_mb'].apply(lambda x: f"{x:.1f} MB")
display_df['num_lines'] = display_df['num_lines'].apply(lambda x: f"{x:,}")
display_df['total_chars'] = display_df['total_chars'].apply(lambda x: f"{x:,}")
display_df['total_words'] = display_df['total_words'].apply(lambda x: f"{x:,}")
display_df['unique_words'] = display_df['unique_words'].apply(lambda x: f"{x:,}")
print("\n" + "="*80)
print("BASIC STATISTICS".center(80))
print("="*80)
print(display_df.to_string())
print("="*80)
return display_df
def plot_line_length_distribution(self):
"""
Plot distribution of line lengths for each dataset
"""
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for idx, (name, lines) in enumerate(self.data.items()):
line_lengths = [len(line) for line in lines[:10000]] # Sample for performance
axes[idx].hist(line_lengths, bins=50, alpha=0.7, edgecolor='black')
axes[idx].set_title(f'{name.capitalize()} - Line Length Distribution')
axes[idx].set_xlabel('Line Length (characters)')
axes[idx].set_ylabel('Frequency')
axes[idx].set_yscale('log')
plt.tight_layout()
plt.savefig('line_length_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
def get_top_words(self, n=20):
"""
Get top n most frequent words from each dataset
"""
print(f"\nFinding top {n} words in each dataset...")
top_words = {}
for name, lines in self.data.items():
# Sample lines for efficiency
sample_lines = lines[:min(10000, len(lines))]
# Tokenize and count
words = []
for line in sample_lines:
# Simple tokenization by splitting
tokens = re.findall(r'\b[a-zA-Z]+\b', line.lower())
words.extend(tokens)
# Get frequency distribution
freq_dist = FreqDist(words)
top_words[name] = freq_dist.most_common(n)
return top_words
def plot_top_words(self, n=20):
"""
Plot top n words for each dataset
"""
top_words = self.get_top_words(n)
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for idx, (name, words) in enumerate(top_words.items()):
words_df = pd.DataFrame(words, columns=['word', 'count'])
axes[idx].barh(words_df['word'][:15], words_df['count'][:15])
axes[idx].set_title(f'{name.capitalize()} - Top 15 Words')
axes[idx].set_xlabel('Frequency')
axes[idx].invert_yaxis()
plt.tight_layout()
plt.savefig('top_words.png', dpi=300, bbox_inches='tight')
plt.show()
def analyze_love_vs_hate(self):
"""
Analyze occurrence of 'love' vs 'hate' in Twitter data
"""
if 'twitter' not in self.data:
print("Twitter data not loaded")
return
twitter_lines = self.data['twitter']
love_count = sum(1 for line in twitter_lines if re.search(r'\blove\b', line.lower()))
hate_count = sum(1 for line in twitter_lines if re.search(r'\bhate\b', line.lower()))
ratio = love_count / hate_count if hate_count > 0 else float('inf')
print("\n" + "="*80)
print("LOVE VS HATE IN TWITTER DATA".center(80))
print("="*80)
print(f"Lines containing 'love': {love_count:,}")
print(f"Lines containing 'hate': {hate_count:,}")
print(f"Love/Hate ratio: {ratio:.2f}")
print("="*80)
# Create visualization
fig, ax = plt.subplots(figsize=(8, 6))
words = ['love', 'hate']
counts = [love_count, hate_count]
colors = ['#ff6b6b', '#4ecdc4']
bars = ax.bar(words, counts, color=colors, alpha=0.7)
ax.set_title(f'Love vs Hate in Twitter Data (Ratio: {ratio:.2f})', fontsize=14)
ax.set_ylabel('Number of Lines', fontsize=12)
# Add value labels on bars
for bar in bars:
height = bar.get_height()
ax.text(bar.get_x() + bar.get_width()/2., height,
f'{int(height):,}', ha='center', va='bottom')
plt.tight_layout()
plt.savefig('love_vs_hate.png', dpi=300, bbox_inches='tight')
plt.show()
def find_biostats_tweet(self):
"""
Find the tweet containing 'biostats'
"""
if 'twitter' not in self.data:
print("Twitter data not loaded")
return
print("\n" + "="*80)
print("FINDING 'BIOSTATS' TWEET".center(80))
print("="*80)
for line in self.data['twitter']:
if 'biostats' in line.lower():
print(f"Found: {line.strip()}")
break
else:
print("No 'biostats' tweet found in the dataset")
print("="*80)
def count_chess_quote(self):
"""
Count exact matches for the chess quote in Twitter data
"""
if 'twitter' not in self.data:
print("Twitter data not loaded")
return
quote = "A computer once beat me at chess, but it was no match for me at kickboxing"
count = sum(1 for line in self.data['twitter'] if line.strip() == quote)
print("\n" + "="*80)
print("CHESS QUOTE ANALYSIS".center(80))
print("="*80)
print(f"Number of exact matches: {count}")
print("="*80)
def analyze_vocabulary_coverage(self, max_words=100):
"""
Analyze how many words are needed to cover certain percentage of text
"""
print("\nAnalyzing vocabulary coverage...")
coverage_data = []
for name, lines in self.data.items():
# Sample lines
sample_lines = lines[:min(20000, len(lines))]
# Get all words
all_words = []
for line in sample_lines:
tokens = re.findall(r'\b[a-zA-Z]+\b', line.lower())
all_words.extend(tokens)
# Get frequency distribution
freq_dist = FreqDist(all_words)
total_words = len(all_words)
# Calculate cumulative coverage
cumulative = 0
for i, (word, count) in enumerate(freq_dist.most_common(max_words), 1):
cumulative += count
coverage = (cumulative / total_words) * 100
coverage_data.append({
'source': name,
'n_words': i,
'coverage': coverage
})
coverage_df = pd.DataFrame(coverage_data)
# Plot coverage
fig, ax = plt.subplots(figsize=(10, 6))
for source in coverage_df['source'].unique():
source_data = coverage_df[coverage_df['source'] == source]
ax.plot(source_data['n_words'], source_data['coverage'],
label=source.capitalize(), linewidth=2)
ax.set_xlabel('Number of Most Frequent Words', fontsize=12)
ax.set_ylabel('Coverage (%)', fontsize=12)
ax.set_title('Vocabulary Coverage Analysis', fontsize=14)
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig('vocabulary_coverage.png', dpi=300, bbox_inches='tight')
plt.show()
return coverage_df
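    # Aside (not part of the original script): the coverage curve above can be
    # reduced to the usual question "how many unique words cover X% of all word
    # instances?". This helper is an illustrative sketch that reuses the same
    # FreqDist approach as analyze_vocabulary_coverage(); its name and default
    # arguments are assumptions.
    def words_needed_for_coverage(self, name, target=0.5, sample=20000):
        lines = self.data[name][:sample]
        words = []
        for line in lines:
            words.extend(re.findall(r'\b[a-zA-Z]+\b', line.lower()))
        freq_dist = FreqDist(words)
        total = len(words)
        cumulative = 0
        for i, (word, count) in enumerate(freq_dist.most_common(), 1):
            cumulative += count
            if cumulative / total >= target:
                return i  # number of unique words needed to reach the target
        return len(freq_dist)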
def generate_report(self):
"""
Generate a comprehensive analysis report
"""
print("\n" + "="*80)
print("SWIFTKEY CAPSTONE PROJECT - EXPLORATORY DATA ANALYSIS".center(80))
print("="*80)
# Basic statistics
stats_df = self.display_stats_table()
# Love vs Hate analysis
self.analyze_love_vs_hate()
# Find biostats tweet
self.find_biostats_tweet()
# Count chess quote
self.count_chess_quote()
# Vocabulary coverage
coverage_df = self.analyze_vocabulary_coverage()
# Create visualizations
print("\nGenerating visualizations...")
self.plot_line_length_distribution()
self.plot_top_words()
# Additional insights
print("\n" + "="*80)
print("KEY INSIGHTS".center(80))
print("="*80)
print("✓ Twitter has the most lines but smallest file size")
print("✓ Blogs contain the longest individual lines")
print("✓ Love appears more frequently than hate in Twitter data")
print("✓ All three datasets show similar word frequency patterns")
print("✓ Top 50 words cover ~40-50% of all text in each dataset")
print("="*80)
def save_results(self, output_dir='output'):
"""
Save analysis results to files
"""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# Save statistics
stats_df = pd.DataFrame(self.stats).T
stats_df.to_csv(os.path.join(output_dir, 'basic_statistics.csv'))
# Save top words
top_words = self.get_top_words(50)
for name, words in top_words.items():
df = pd.DataFrame(words, columns=['word', 'frequency'])
df.to_csv(os.path.join(output_dir, f'top_words_{name}.csv'), index=False)
print(f"\nResults saved to '{output_dir}' directory")
def main():
    """
    Main function to run the EDA
    """
    # Initialize EDA class
    eda = SwiftKeyEDA()
# Load data (use sample for faster execution)
# Remove sample_size parameter to load full datasets
eda.load_data(sample_size=100000) # Load 100k lines from each file
# Generate report
eda.generate_report()
# Save results
eda.save_results()
print("\n✓ Exploratory Data Analysis Complete!")
# quick_analysis.py
import os
import re
from collections import Counter

import pandas as pd
import matplotlib.pyplot as plt
data_path = 'final/en_US/'
files = {
    'blogs': 'en_US.blogs.txt',
    'news': 'en_US.news.txt',
    'twitter': 'en_US.twitter.txt'
}
stats = []
for name, filename in files.items():
    filepath = os.path.join(data_path, filename)
# Get file size
size_mb = os.path.getsize(filepath) / (1024 * 1024)
# Count lines
with open(filepath, 'r', encoding='utf-8') as f:
lines = f.readlines()
num_lines = len(lines)
# Sample for word count
sample = lines[:10000]
words = []
for line in sample:
words.extend(re.findall(r'\b\w+\b', line.lower()))
stats.append({
'File': name,
'Size (MB)': round(size_mb, 1),
'Lines': f"{num_lines:,}",
'Sample Words': f"{len(words):,}",
'Unique Words': f"{len(set(words)):,}"
})
df = pd.DataFrame(stats)
print("Statistics:")
print(df.to_string(index=False))
with open(os.path.join(data_path, files['twitter']), 'r', encoding='utf-8') as f:
    twitter_lines = f.readlines()
# Note: these are plain substring matches, so lines containing 'lovely' or
# 'hateful' are counted as well; the word-boundary regex used in the class
# above will therefore give slightly different counts.
love = sum(1 for line in twitter_lines if 'love' in line.lower())
hate = sum(1 for line in twitter_lines if 'hate' in line.lower())

print("\nLove vs Hate in Twitter:")
print(f"  Love: {love:,}")
print(f"  Hate: {hate:,}")
print(f"  Ratio: {love/hate:.2f}")
if __name__ == "__main__":
    main()
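The quick script reads each file with readlines(), which holds every line in memory at once; the full en_US files are large enough that this can be slow on a modest machine. A minimal streaming variant (a sketch, not part of the original scripts; the function name is illustrative) collects the line count and the longest line length in a single pass:

def stream_stats(filepath):
    """Count lines and track the longest line without loading the whole file."""
    num_lines = 0
    max_len = 0
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            num_lines += 1
            max_len = max(max_len, len(line.rstrip('\n')))
    return num_lines, max_len

# Example, using a path from the script above:
# print(stream_stats('final/en_US/en_US.twitter.txt'))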