import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import requests
from io import StringIO
import re
import string
import io
import base64
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display
def save_plot_as_base64(plt, dpi=300, *, width=750, alt="Plot"):
    """Render the current figure to an inline HTML ``<img>`` tag.

    The figure is saved as PNG into an in-memory buffer, base64-encoded and
    embedded as a ``data:`` URI, so the returned HTML needs no image file.

    Parameters
    ----------
    plt : module or object
        Anything exposing ``savefig(buffer, ...)`` and ``close()``; every
        caller in this file passes ``matplotlib.pyplot``.
    dpi : int, default 300
        Resolution forwarded to ``savefig``.
    width : int or str, default 750
        Value of the ``width`` attribute on the generated tag.
    alt : str, default "Plot"
        Alternative text; HTML-escaped before being embedded.

    Returns
    -------
    str
        An ``<img>`` element whose ``src`` holds the base64-encoded PNG.
    """
    from html import escape  # local import: only this helper needs it

    buf = io.BytesIO()
    try:
        plt.savefig(buf, format='png', bbox_inches='tight', dpi=dpi)
    finally:
        # Close the figure even when rendering fails so that a failed cell
        # does not leave an open figure behind for the next plot.
        plt.close()

    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
    alt_attr = escape(str(alt), quote=True)
    width_attr = escape(str(width), quote=True)
    return (
        f'<img src="data:image/png;base64,{img_base64}" '
        f'alt="{alt_attr}" width="{width_attr}" />'
    )
# URLs to the dataset files: the data file (read below as header-less CSV) and
# the companion .names file that is parsed for column names
data_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.data"
names_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.names"
Week 10 - Spam
1 Introduction
This document explores the spambase dataset, a collection of email data for spam classification. We’ll load the data, analyze its characteristics, train a machine learning model, and evaluate its performance.
2 Setup and Data Loading
First, we’ll import the necessary libraries and set up our environment.
Now, let’s define functions to parse the feature names from the names file and load the data.
# Matches an attribute declaration line of the .names file, e.g.
# "word_freq_make:   continuous." or "char_freq_;:   continuous.".
# Matching only at the start of the line skips documentation lines that merely
# mention word_freq_* / char_freq_* / capital_run_length_* in running text, and
# "[^\s:]+" accepts the punctuation characters used in the char_freq_ names.
# NOTE(review): assumes the standard UCI spambase.names layout - confirm
# against the hosted copy.
_FEATURE_DECL_RE = re.compile(
    r'^((?:word_freq_|char_freq_|capital_run_length_)[^\s:]+)\s*:'
)


# Function to parse the names file and extract column names
def parse_names_file(url, text=None, timeout=30):
    """Extract the spambase column names from the ``.names`` description file.

    Parameters
    ----------
    url : str
        Location of the ``.names`` file. Ignored when ``text`` is given.
    text : str, optional
        Already-downloaded content of the ``.names`` file. When provided no
        HTTP request is made.
    timeout : float, default 30
        Timeout in seconds for the HTTP request.

    Returns
    -------
    tuple
        ``(all_features, word_features, char_features, other_features)``
        where ``all_features`` is the three groups concatenated in that order
        followed by the target column ``'is_spam'``.

    Raises
    ------
    requests.HTTPError
        If the download returns an error status.

    Notes
    -----
    Each group falls back to generic names when the expected number of
    declarations (48 word, 6 char, 3 capital-run-length) is not found.
    """
    if text is None:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        text = response.text

    word_features = []
    char_features = []
    other_features = []
    for line in text.splitlines():
        match = _FEATURE_DECL_RE.match(line.strip())
        if not match:
            continue
        name = match.group(1)
        if name.startswith('word_freq_'):
            word_features.append(name)
        elif name.startswith('char_freq_'):
            char_features.append(name)
        else:
            other_features.append(name)

    # If we couldn't extract the names correctly, use generic ones
    if len(word_features) != 48:
        word_features = [f'word_freq_{i}' for i in range(48)]
    if len(char_features) != 6:
        char_features = [f'char_freq_{i}' for i in range(6)]
    if len(other_features) != 3:
        other_features = ['capital_run_length_average',
                          'capital_run_length_longest',
                          'capital_run_length_total']

    # Combine all feature names; the label column comes last in the data file
    all_features = word_features + char_features + other_features + ['is_spam']
    return all_features, word_features, char_features, other_features
# Load the data
def load_spambase_data(data_url, names_url, timeout=30):
    """Download the spambase data file and load it into a DataFrame.

    Parameters
    ----------
    data_url : str
        URL of the header-less CSV data file.
    names_url : str
        URL of the ``.names`` file; passed to ``parse_names_file`` to obtain
        the column names.
    timeout : float, default 30
        Timeout in seconds for the data download, so a stalled connection
        cannot hang the notebook indefinitely.

    Returns
    -------
    tuple
        ``(df, column_names, feature_groups)`` where ``feature_groups`` maps
        ``'word_features'``, ``'char_features'`` and ``'other_features'`` to
        their lists of column names.

    Raises
    ------
    requests.HTTPError
        If the data download returns an error status.
    """
    # Get column names from the names file
    column_names, word_features, char_features, other_features = parse_names_file(names_url)

    # Download the data
    response = requests.get(data_url, timeout=timeout)
    response.raise_for_status()

    # The file has no header row, so the parsed names are supplied explicitly
    df = pd.read_csv(StringIO(response.text), header=None, names=column_names)

    feature_groups = {
        'word_features': word_features,
        'char_features': char_features,
        'other_features': other_features,
    }
    return df, column_names, feature_groups


# 3 Data Exploration
Let’s load the data and explore its structure.
# Load the data with proper column names
df, feature_names, feature_groups = load_spambase_data(data_url, names_url)

# Display basic information
print("Dataset shape:", df.shape)
print("\nFeature groups:")
for group_name, features in feature_groups.items():
    print(f"{group_name}: {len(features)} features")
    # Long groups are abbreviated to their first five names
    if len(features) > 5:
        print(f" First 5: {features[:5]}")
    else:
        print(f" All: {features}")
print("\nFirst few rows:")
print(df.head())

# Distribution of spam vs non-spam
spam_count = df['is_spam'].value_counts()
print("\nClass distribution:")
print(spam_count)
# .get() keeps the ratio at 0% instead of raising if no row is labelled 1
print(f"Spam ratio: {spam_count.get(1, 0) / len(df):.2%}")

# Create a bar chart for spam distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='is_spam', data=df, palette=['#3498db', '#e74c3c'])
plt.title('Distribution of Spam vs. Ham Emails')
plt.xlabel('Email Type (0 = Ham, 1 = Spam)')
plt.ylabel('Count')
plt.xticks([0, 1], ['Ham', 'Spam'])
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Dataset shape: (4601, 58)
Feature groups:
word_features: 48 features
First 5: ['word_freq_0', 'word_freq_1', 'word_freq_2', 'word_freq_3', 'word_freq_4']
char_features: 6 features
First 5: ['char_freq_0', 'char_freq_1', 'char_freq_2', 'char_freq_3', 'char_freq_4']
other_features: 3 features
All: ['capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']
First few rows:
word_freq_0 word_freq_1 word_freq_2 word_freq_3 word_freq_4 \
0 0.00 0.64 0.64 0.0 0.32
1 0.21 0.28 0.50 0.0 0.14
2 0.06 0.00 0.71 0.0 1.23
3 0.00 0.00 0.00 0.0 0.63
4 0.00 0.00 0.00 0.0 0.63
word_freq_5 word_freq_6 word_freq_7 word_freq_8 word_freq_9 ... \
0 0.00 0.00 0.00 0.00 0.00 ...
1 0.28 0.21 0.07 0.00 0.94 ...
2 0.19 0.19 0.12 0.64 0.25 ...
3 0.00 0.31 0.63 0.31 0.63 ...
4 0.00 0.31 0.63 0.31 0.63 ...
char_freq_0 char_freq_1 char_freq_2 char_freq_3 char_freq_4 \
0 0.00 0.000 0.0 0.778 0.000
1 0.00 0.132 0.0 0.372 0.180
2 0.01 0.143 0.0 0.276 0.184
3 0.00 0.137 0.0 0.137 0.000
4 0.00 0.135 0.0 0.135 0.000
char_freq_5 capital_run_length_average capital_run_length_longest \
0 0.000 3.756 61
1 0.048 5.114 101
2 0.010 9.821 485
3 0.000 3.537 40
4 0.000 3.537 40
capital_run_length_total is_spam
0 278 1
1 1028 1
2 2259 1
3 191 1
4 191 1
[5 rows x 58 columns]
Class distribution:
is_spam
0 2788
1 1813
Name: count, dtype: int64
Spam ratio: 39.40%
4 Feature Analysis
Let’s analyze the most important features for distinguishing spam from non-spam emails.
# Calculate mean values for features in spam vs non-spam.
# The target column is excluded up front: its spam-minus-ham difference is 1
# by construction, so it would otherwise occupy a slot in the top-10 ranking.
feature_cols = df.columns.drop('is_spam')
spam_means = df.loc[df['is_spam'] == 1, feature_cols].mean()
ham_means = df.loc[df['is_spam'] == 0, feature_cols].mean()

# Find features with the largest differences.
# NOTE: these are raw, unscaled differences, so features measured on a larger
# scale (the capital_run_length_* columns) rank first.
diff = spam_means - ham_means
top_diff_features = diff.abs().sort_values(ascending=False).head(10)
print("Top 10 features with the largest difference between spam and ham:")
print(top_diff_features)

# Create a bar chart for top differentiating features
plt.figure(figsize=(10, 6))
top_features = top_diff_features.index
feature_names_clean = [f.replace('word_freq_', '').replace('char_freq_', '') for f in top_features]

# Get values for ham and spam, in ranking order
ham_values = ham_means.loc[top_features].tolist()
spam_values = spam_means.loc[top_features].tolist()

# Create a long-format dataframe (one row per feature/class pair) for plotting
plot_df = pd.DataFrame({
    'Feature': feature_names_clean,
    'Ham': ham_values,
    'Spam': spam_values,
}).melt(id_vars='Feature', var_name='Class', value_name='Frequency')

# Plot
sns.barplot(x='Feature', y='Frequency', hue='Class', data=plot_df, palette=['#3498db', '#e74c3c'])
plt.title('Top Differentiating Features Between Spam and Ham')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.legend(title='Email Type')
# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 features with the largest difference between spam and ham:
capital_run_length_total 309.148468
capital_run_length_longest 86.178780
capital_run_length_average 7.141864
word_freq_26 1.263716
is_spam 1.000000
word_freq_18 0.994199
word_freq_20 0.941668
word_freq_24 0.877994
word_freq_15 0.444775
word_freq_25 0.422822
dtype: float64
5 Model Training and Evaluation
Now, let’s train a Random Forest classifier to predict spam and evaluate its performance.
# Separate the target label from the predictor columns
y = df['is_spam']
X = df.drop(columns='is_spam')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardize using statistics learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Predict labels for the held-out rows
y_pred = clf.predict(X_test_scaled)

# Report how well the predictions match the true labels
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.9554831704668838
Classification Report:
precision recall f1-score support
0 0.94 0.98 0.96 531
1 0.98 0.92 0.95 390
accuracy 0.96 921
macro avg 0.96 0.95 0.95 921
weighted avg 0.96 0.96 0.96 921
6 Feature Importance
Let’s examine which features are most important for the Random Forest model’s predictions.
# Pair each column with the fitted forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_
})
feature_importances = importance_table.sort_values('importance', ascending=False)
print("Top 10 most important features for spam classification:")
print(feature_importances.head(10))

# Horizontal bar chart of the leading features
plt.figure(figsize=(10, 6))
top_n = 15
top_features = feature_importances.head(top_n)
# Drop the word_freq_/char_freq_ prefixes to keep the axis labels short
feature_names_clean = [
    name.replace('word_freq_', '').replace('char_freq_', '')
    for name in top_features['feature']
]
sns.barplot(
    x='importance',
    y=feature_names_clean,
    data={'importance': top_features['importance'], 'feature': feature_names_clean},
    palette='viridis',
)
plt.title(f'Top {top_n} Most Important Features for Spam Classification')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()
# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 most important features for spam classification:
feature importance
51 char_freq_3 0.113786
52 char_freq_4 0.096676
6 word_freq_6 0.081848
15 word_freq_15 0.067141
55 capital_run_length_longest 0.058623
54 capital_run_length_average 0.057715
56 capital_run_length_total 0.052335
20 word_freq_20 0.046336
24 word_freq_24 0.042364
18 word_freq_18 0.032968
7 Making Predictions
Finally, let’s create a function to make predictions on new data.
# Function to make predictions on new data
def predict_spam(new_data, model, scaler):
    """
    Scale new data and return the model's class predictions and probabilities.

    Parameters:
    -----------
    new_data : pandas DataFrame
        New data to make predictions on, should have the same features as the training data
    model : trained model
        The trained model to use for predictions; must provide ``predict`` and
        ``predict_proba``
    scaler : fitted scaler
        The scaler used to scale the training data; must provide ``transform``

    Returns:
    --------
    predictions : array-like
        Output of ``model.predict`` (1 for spam, 0 for not spam)
    probabilities : array-like
        Output of ``model.predict_proba``; the example in this file reads
        column 1 of each row as the probability of spam
    """
    # Apply the same scaling the model was trained with
    new_data_scaled = scaler.transform(new_data)

    # Hard labels and per-class probabilities for the same scaled rows
    predictions = model.predict(new_data_scaled)
    probabilities = model.predict_proba(new_data_scaled)
    return predictions, probabilities
# Try the prediction helper on the first five rows of the held-out test set
example_emails = X_test.head(5)
predictions, probabilities = predict_spam(example_emails, clf, scaler)

# Put the true labels next to the predicted label and spam probability
results_df = pd.DataFrame(
    {
        'True Label': y_test.head(5),
        'Predicted': predictions,
        'Probability of Spam': [class_probs[1] for class_probs in probabilities],
    }
)
print("Example predictions:")
print(results_df)
Example predictions:
True Label Predicted Probability of Spam
3683 0 0 0.00
4412 0 0 0.26
2584 0 0 0.06
69 1 1 0.57
1844 0 0 0.04
8 Conclusion
In this document, we’ve explored the spambase dataset, trained a Random Forest classifier for spam detection, and analyzed the most important features for distinguishing spam from legitimate emails. The model achieves good performance, demonstrating the effectiveness of the selected features in identifying spam emails.