import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import requests
from io import StringIO
import re
import string
import io
import base64
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display
def save_plot_as_base64(plt, dpi=300):
    """Save a matplotlib plot as a base64-encoded HTML ``<img>`` tag.

    The current figure is written as PNG into an in-memory buffer, the
    figure is closed, and the PNG bytes are embedded in a ``data:`` URI.

    Parameters:
    -----------
    plt : module or object
        ``matplotlib.pyplot``, or any object exposing compatible
        ``savefig(buf, format=..., bbox_inches=..., dpi=...)`` and
        ``close()`` methods.
    dpi : int, default 300
        Resolution forwarded to ``savefig``.

    Returns:
    --------
    img_html : str
        ``<img>`` element (fixed ``width="750"``) whose ``src`` is the
        base64-encoded PNG.
    """
    buf = io.BytesIO()
    plt.savefig(buf, format='png', bbox_inches='tight', dpi=dpi)
    # The image now lives in ``buf``; the figure itself is no longer needed.
    plt.close()
    img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
    img_html = f'<img src="data:image/png;base64,{img_base64}" alt="Plot" width="750" />'
    return img_html
# URLs to the dataset files
# data_url  -> header-less CSV rows, read with pd.read_csv in load_spambase_data
# names_url -> attribute description file, parsed by parse_names_file
data_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.data"
names_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.names"
Week 10 - Spam
1 Introduction
This document explores the spambase dataset, a collection of email data for spam classification. We’ll load the data, analyze its characteristics, train a machine learning model, and evaluate its performance.
2 Setup and Data Loading
First, we’ll import the necessary libraries and set up our environment.
Now, let’s define functions to parse the feature names from the names file and load the data.
# Function to parse the names file and extract column names
def parse_names_file(url):
    """Download a spambase ``.names`` file and extract the column names.

    Only attribute-definition lines are used: lines that *start* with
    ``word_freq_``, ``char_freq_`` or ``capital_run_length_`` and continue
    with a name followed by a colon (e.g. ``word_freq_make: continuous.``).
    Anchoring at the start of the line matters for two reasons:

    * an unanchored search also matches any descriptive/comment line that
      merely mentions one of the prefixes, which inflates the counts and
      forces the generic fallback below;
    * the name part is matched with ``\\S+`` rather than ``\\w+`` so that
      punctuation names such as ``char_freq_;`` are captured.

    NOTE(review): this assumes the hosted file follows the UCI ``.names``
    layout (``|`` comment lines, one ``name: type.`` line per attribute);
    confirm against the actual file.

    Parameters:
    -----------
    url : str
        Location of the ``.names`` file.

    Returns:
    --------
    all_features : list of str
        Word, char and capital-run-length names, in that order, followed
        by ``'is_spam'``.
    word_features, char_features, other_features : list of str
        The three feature groups individually.

    Raises:
    -------
    requests.HTTPError
        If the server answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    attribute_line = re.compile(
        r'(word_freq_|char_freq_|capital_run_length_)(\S+?)\s*:(?:\s|$)'
    )
    features_by_prefix = {
        'word_freq_': [],
        'char_freq_': [],
        'capital_run_length_': [],
    }
    for raw_line in response.text.splitlines():
        # ``match`` (not ``search``) anchors the pattern at the line start.
        match = attribute_line.match(raw_line.strip())
        if match:
            prefix, suffix = match.groups()
            features_by_prefix[prefix].append(prefix + suffix)

    word_features = features_by_prefix['word_freq_']
    char_features = features_by_prefix['char_freq_']
    other_features = features_by_prefix['capital_run_length_']

    # If we couldn't extract the names correctly, use generic ones
    if len(word_features) != 48:
        word_features = [f'word_freq_{i}' for i in range(48)]

    if len(char_features) != 6:
        char_features = [f'char_freq_{i}' for i in range(6)]

    if len(other_features) != 3:
        other_features = ['capital_run_length_average',
                          'capital_run_length_longest',
                          'capital_run_length_total']

    # Combine all feature names; the target column comes last.
    all_features = word_features + char_features + other_features + ['is_spam']

    return all_features, word_features, char_features, other_features
# Load the data
def load_spambase_data(data_url, names_url):
    """Download the spambase data and return it as a labelled DataFrame.

    Parameters:
    -----------
    data_url : str
        Location of the header-less CSV data file.
    names_url : str
        Location of the ``.names`` file, passed to ``parse_names_file`` to
        obtain the column names.

    Returns:
    --------
    df : pandas DataFrame
        The data, with columns named after ``column_names``.
    column_names : list of str
        All column names as returned by ``parse_names_file``.
    feature_groups : dict
        Maps ``'word_features'``, ``'char_features'`` and
        ``'other_features'`` to the corresponding lists of column names.

    Raises:
    -------
    requests.HTTPError
        If either download answers with an error status.
    """
    # Get column names from the names file
    column_names, word_features, char_features, other_features = parse_names_file(names_url)

    # Download the data; the timeout avoids hanging on a stalled connection.
    response = requests.get(data_url, timeout=30)
    response.raise_for_status()

    # pandas reads from file-like objects, so wrap the downloaded text.
    data_string = StringIO(response.text)

    # The file has no header row; supply the parsed names instead.
    df = pd.read_csv(data_string, header=None, names=column_names)

    feature_groups = {
        'word_features': word_features,
        'char_features': char_features,
        'other_features': other_features
    }
    return df, column_names, feature_groups
3 Data Exploration
Let’s load the data and explore its structure.
# Load the data with proper column names
df, feature_names, feature_groups = load_spambase_data(data_url, names_url)

# Display basic information
print("Dataset shape:", df.shape)
print("\nFeature groups:")
for group_name, features in feature_groups.items():
    print(f"{group_name}: {len(features)} features")
    if len(features) > 5:
        print(f" First 5: {features[:5]}")
    else:
        print(f" All: {features}")

print("\nFirst few rows:")
print(df.head())

# Distribution of spam vs non-spam
spam_count = df['is_spam'].value_counts()
print("\nClass distribution:")
print(spam_count)
# .get() keeps this line from raising if the frame happens to contain no spam.
print(f"Spam ratio: {spam_count.get(1, 0) / len(df):.2%}")

# Create a bar chart for spam distribution
plt.figure(figsize=(8, 5))
sns.countplot(x='is_spam', data=df, palette=['#3498db', '#e74c3c'])
plt.title('Distribution of Spam vs. Ham Emails')
plt.xlabel('Email Type (0 = Ham, 1 = Spam)')
plt.ylabel('Count')
plt.xticks([0, 1], ['Ham', 'Spam'])
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Dataset shape: (4601, 58)
Feature groups:
word_features: 48 features
First 5: ['word_freq_0', 'word_freq_1', 'word_freq_2', 'word_freq_3', 'word_freq_4']
char_features: 6 features
First 5: ['char_freq_0', 'char_freq_1', 'char_freq_2', 'char_freq_3', 'char_freq_4']
other_features: 3 features
All: ['capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']
First few rows:
word_freq_0 word_freq_1 word_freq_2 word_freq_3 word_freq_4 \
0 0.00 0.64 0.64 0.0 0.32
1 0.21 0.28 0.50 0.0 0.14
2 0.06 0.00 0.71 0.0 1.23
3 0.00 0.00 0.00 0.0 0.63
4 0.00 0.00 0.00 0.0 0.63
word_freq_5 word_freq_6 word_freq_7 word_freq_8 word_freq_9 ... \
0 0.00 0.00 0.00 0.00 0.00 ...
1 0.28 0.21 0.07 0.00 0.94 ...
2 0.19 0.19 0.12 0.64 0.25 ...
3 0.00 0.31 0.63 0.31 0.63 ...
4 0.00 0.31 0.63 0.31 0.63 ...
char_freq_0 char_freq_1 char_freq_2 char_freq_3 char_freq_4 \
0 0.00 0.000 0.0 0.778 0.000
1 0.00 0.132 0.0 0.372 0.180
2 0.01 0.143 0.0 0.276 0.184
3 0.00 0.137 0.0 0.137 0.000
4 0.00 0.135 0.0 0.135 0.000
char_freq_5 capital_run_length_average capital_run_length_longest \
0 0.000 3.756 61
1 0.048 5.114 101
2 0.010 9.821 485
3 0.000 3.537 40
4 0.000 3.537 40
capital_run_length_total is_spam
0 278 1
1 1028 1
2 2259 1
3 191 1
4 191 1
[5 rows x 58 columns]
Class distribution:
is_spam
0 2788
1 1813
Name: count, dtype: int64
Spam ratio: 39.40%
4 Feature Analysis
Let’s analyze the most important features for distinguishing spam from non-spam emails.
# Calculate mean values for features in spam vs non-spam.
# The target column is dropped first: its per-class means are 1 and 0 by
# construction, so leaving it in would list 'is_spam' among the top
# "features" with a difference of exactly 1.
features_only = df.drop('is_spam', axis=1)
spam_means = features_only[df['is_spam'] == 1].mean()
ham_means = features_only[df['is_spam'] == 0].mean()

# Find features with the largest differences
diff = spam_means - ham_means
top_diff_features = diff.abs().sort_values(ascending=False).head(10)

print("Top 10 features with the largest difference between spam and ham:")
print(top_diff_features)

# Create a bar chart for top differentiating features
plt.figure(figsize=(10, 6))
top_features = top_diff_features.index
# Strip the group prefixes so the axis labels stay short.
feature_names_clean = [f.replace('word_freq_', '').replace('char_freq_', '') for f in top_features]

# Get values for ham and spam
ham_values = [ham_means[f] for f in top_features]
spam_values = [spam_means[f] for f in top_features]

# Create a dataframe for plotting (long format: one row per feature/class)
plot_df = pd.DataFrame({
    'Feature': feature_names_clean,
    'Ham': ham_values,
    'Spam': spam_values
}).melt(id_vars='Feature', var_name='Class', value_name='Frequency')

# Plot
sns.barplot(x='Feature', y='Frequency', hue='Class', data=plot_df, palette=['#3498db', '#e74c3c'])
plt.title('Top Differentiating Features Between Spam and Ham')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Email Type')
# Run the layout pass last so it accounts for the legend and rotated labels.
plt.tight_layout()

# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 features with the largest difference between spam and ham:
capital_run_length_total 309.148468
capital_run_length_longest 86.178780
capital_run_length_average 7.141864
word_freq_26 1.263716
is_spam 1.000000
word_freq_18 0.994199
word_freq_20 0.941668
word_freq_24 0.877994
word_freq_15 0.444775
word_freq_25 0.422822
dtype: float64
5 Model Training and Evaluation
Now, let’s train a Random Forest classifier to predict spam and evaluate its performance.
# Split features and target
X = df.drop('is_spam', axis=1)
y = df['is_spam']

# Split into training and testing sets (80/20, fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features.
# The scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a random forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train_scaled, y_train)

# Make predictions
y_pred = clf.predict(X_test_scaled)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
Accuracy: 0.9554831704668838
Classification Report:
precision recall f1-score support
0 0.94 0.98 0.96 531
1 0.98 0.92 0.95 390
accuracy 0.96 921
macro avg 0.96 0.95 0.95 921
weighted avg 0.96 0.96 0.96 921
6 Feature Importance
Let’s examine which features are most important for the Random Forest model’s predictions.
# Feature importance: pair each training column with the importance the
# fitted forest assigned to it, most important first.
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 most important features for spam classification:")
print(feature_importances.head(10))

# Create a bar chart for feature importance
plt.figure(figsize=(10, 6))
top_n = 15
top_features = feature_importances.head(top_n)
# Strip the group prefixes so the axis labels stay short.
feature_names_clean = [f.replace('word_freq_', '').replace('char_freq_', '') for f in top_features['feature']]

# One tidy frame for seaborn; .values drops the original row labels so the
# importances line up positionally with the cleaned names.
importance_plot_df = pd.DataFrame({
    'importance': top_features['importance'].values,
    'feature': feature_names_clean
})
sns.barplot(x='importance', y='feature', data=importance_plot_df, palette='viridis')
plt.title(f'Top {top_n} Most Important Features for Spam Classification')
plt.xlabel('Importance')
plt.ylabel('Feature')

plt.tight_layout()
# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 most important features for spam classification:
feature importance
51 char_freq_3 0.113786
52 char_freq_4 0.096676
6 word_freq_6 0.081848
15 word_freq_15 0.067141
55 capital_run_length_longest 0.058623
54 capital_run_length_average 0.057715
56 capital_run_length_total 0.052335
20 word_freq_20 0.046336
24 word_freq_24 0.042364
18 word_freq_18 0.032968
7 Making Predictions
Finally, let’s create a function to make predictions on new data.
# Function to make predictions on new data
def predict_spam(new_data, model, scaler):
    """
    Make predictions on new data

    Parameters:
    -----------
    new_data : pandas DataFrame
        New data to make predictions on, should have the same features as the training data
    model : trained model
        The trained model to use for predictions; must provide ``predict``
        and ``predict_proba``
    scaler : fitted scaler
        The scaler used to scale the training data; must provide ``transform``

    Returns:
    --------
    predictions : numpy array
        Array of predictions (1 for spam, 0 for not spam)
    probabilities : numpy array
        Output of ``model.predict_proba`` for the scaled data; callers in
        this file read index 1 of each row as the probability of spam
    """
    # Scale the data with the already-fitted scaler (no re-fitting here)
    new_data_scaled = scaler.transform(new_data)

    # Make predictions
    predictions = model.predict(new_data_scaled)
    probabilities = model.predict_proba(new_data_scaled)

    return predictions, probabilities
# Example of using the prediction function (with test data)
example_emails = X_test.iloc[:5]  # Take 5 examples from test set
predictions, probabilities = predict_spam(example_emails, clf, scaler)

# Create a results table; the true labels keep their original row index,
# which becomes the index of the table.
results_df = pd.DataFrame({
    'True Label': y_test.iloc[:5],
    'Predicted': predictions,
    # Index 1 of each probability row is read as the spam-class probability.
    'Probability of Spam': [p[1] for p in probabilities]
})
print("Example predictions:")
print(results_df)
Example predictions:
True Label Predicted Probability of Spam
3683 0 0 0.00
4412 0 0 0.26
2584 0 0 0.06
69 1 1 0.57
1844 0 0 0.04
8 Conclusion
In this document, we’ve explored the spambase dataset, trained a Random Forest classifier for spam detection, and analyzed the most important features for distinguishing spam from legitimate emails. The model achieves good performance, demonstrating the effectiveness of the selected features in identifying spam emails.