Week 10 - Spam

Author

Tony Fraser and Mark Gonsalves

Published

April 18, 2025

1 Introduction

This document explores the spambase dataset, a collection of email data for spam classification. We’ll load the data, analyze its characteristics, train a machine learning model, and evaluate its performance.

2 Setup and Data Loading

First, we’ll import the necessary libraries and set up our environment.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import requests
from io import StringIO
import re
import string
import io
import base64
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import HTML, display

def save_plot_as_base64(plt, dpi=300):
    """Render the active figure to PNG and wrap it in an inline HTML ``<img>`` tag.

    Parameters
    ----------
    plt : object
        Anything exposing ``savefig(buffer, format=..., bbox_inches=..., dpi=...)``
        and ``close()``; the callers in this document pass ``matplotlib.pyplot``.
    dpi : int, optional
        Resolution forwarded to ``savefig`` (default 300).

    Returns
    -------
    str
        An ``<img>`` element whose ``src`` is a base64 ``data:image/png`` URI.
    """
    with io.BytesIO() as png_buffer:
        plt.savefig(png_buffer, format='png', bbox_inches='tight', dpi=dpi)
        # Close the figure once it has been written so figures do not accumulate.
        plt.close()
        encoded_png = base64.b64encode(png_buffer.getvalue()).decode('utf-8')
    return f'<img src="data:image/png;base64,{encoded_png}" alt="Plot"  width="750" />'
    
# URLs to the dataset files
# data_url  -> the observations file; load_spambase_data downloads it and reads it with pandas.read_csv
# names_url -> the companion .names file; parse_names_file scans it for the column names
data_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.data"
names_url = "https://tonyfraser-data.s3.us-east-1.amazonaws.com/spambase/spambase.names"

Now, let’s define functions to parse the feature names from the names file and load the data.

# Function to parse the names file and extract column names
# An attribute declaration is expected to start the line and end its name with a
# colon, e.g. "word_freq_make:   continuous." or "char_freq_;:   continuous.".
# Anchoring at the start of the line keeps descriptive text that merely mentions
# "word_freq_WORD" or "capital_run_length_average" from being counted as a column,
# and "\S+?" (instead of "\w+") lets punctuation names such as ";" or "!" through.
_ATTRIBUTE_DECLARATION = re.compile(
    r'^((?:word_freq|char_freq|capital_run_length)_\S+?):(?:\s|$)'
)


def _extract_attribute_names(names_text):
    """Split the attribute declarations found in *names_text* into the three feature groups."""
    word_features = []
    char_features = []
    other_features = []

    for raw_line in names_text.splitlines():
        declaration = _ATTRIBUTE_DECLARATION.match(raw_line.strip())
        if declaration is None:
            continue

        name = declaration.group(1)
        if name.startswith('word_freq_'):
            word_features.append(name)
        elif name.startswith('char_freq_'):
            char_features.append(name)
        else:
            other_features.append(name)

    return word_features, char_features, other_features


def parse_names_file(url):
    """Download the spambase ``.names`` file and derive the dataset's column names.

    Only lines that *declare* an attribute (``<name>: <type>``) are used, so
    mentions of attribute names in the file's descriptive text are ignored.

    Parameters
    ----------
    url : str
        Location of the ``.names`` file.

    Returns
    -------
    tuple
        ``(all_features, word_features, char_features, other_features)`` where
        ``all_features`` is the three groups concatenated in that order followed
        by the target column ``'is_spam'``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    requests.exceptions.RequestException
        For other request failures, including the 30 second timeout.

    Notes
    -----
    If a group does not contain the expected number of names (48 word, 6 char,
    3 capital-run-length), that group falls back to generic names and a warning
    is emitted so the substitution is visible rather than silent.
    """
    # Function-scope import so the block does not depend on a new file-level import.
    import warnings

    # A timeout prevents the call from hanging forever on an unresponsive host.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    word_features, char_features, other_features = _extract_attribute_names(response.text)

    # If we couldn't extract the names correctly, use generic ones
    if len(word_features) != 48:
        warnings.warn(
            f"Expected 48 word_freq_* attributes but found {len(word_features)}; "
            "using generic word feature names."
        )
        word_features = [f'word_freq_{i}' for i in range(48)]

    if len(char_features) != 6:
        warnings.warn(
            f"Expected 6 char_freq_* attributes but found {len(char_features)}; "
            "using generic char feature names."
        )
        char_features = [f'char_freq_{i}' for i in range(6)]

    if len(other_features) != 3:
        warnings.warn(
            f"Expected 3 capital_run_length_* attributes but found {len(other_features)}; "
            "using the default capital run length feature names."
        )
        other_features = ['capital_run_length_average',
                          'capital_run_length_longest',
                          'capital_run_length_total']

    # Combine all feature names; the last column of the data file is the label.
    all_features = word_features + char_features + other_features + ['is_spam']

    return all_features, word_features, char_features, other_features

# Load the data
def load_spambase_data(data_url, names_url):
    """Fetch the spambase data file and return it as a labelled DataFrame.

    Parameters
    ----------
    data_url : str
        Location of the header-less CSV data file.
    names_url : str
        Location of the ``.names`` file handed to ``parse_names_file``.

    Returns
    -------
    tuple
        ``(df, column_names, feature_groups)``: the loaded DataFrame, the list of
        column names applied to it, and a dict mapping ``'word_features'``,
        ``'char_features'`` and ``'other_features'`` to their column-name lists.

    Raises
    ------
    requests.HTTPError
        If the data download returns an error status.
    """
    # parse_names_file returns (all_names, word, char, other)
    parsed_names = parse_names_file(names_url)
    column_names = parsed_names[0]

    data_response = requests.get(data_url)
    data_response.raise_for_status()

    # pandas needs a file-like object, so wrap the downloaded text in StringIO.
    df = pd.read_csv(StringIO(data_response.text), header=None, names=column_names)

    group_labels = ('word_features', 'char_features', 'other_features')
    feature_groups = dict(zip(group_labels, parsed_names[1:]))

    return df, column_names, feature_groups

3 Data Exploration

Let’s load the data and explore its structure.

# Fetch the dataset together with its column names and feature grouping
df, feature_names, feature_groups = load_spambase_data(data_url, names_url)

# Summarise the shape and each feature group (long groups are truncated to five names)
print("Dataset shape:", df.shape)
print("\nFeature groups:")
for group_label, group_columns in feature_groups.items():
    print(f"{group_label}: {len(group_columns)} features")
    if len(group_columns) <= 5:
        print(f"  All: {group_columns}")
    else:
        print(f"  First 5: {group_columns[:5]}")

print("\nFirst few rows:")
print(df.head())

# How many emails fall into each class (0 = ham, 1 = spam)
spam_count = df['is_spam'].value_counts()
spam_ratio = spam_count[1] / len(df)
print("\nClass distribution:")
print(spam_count)
print(f"Spam ratio: {spam_ratio:.2%}")

# Bar chart of the class balance
plt.figure(figsize=(8, 5))
class_axes = sns.countplot(data=df, x='is_spam', palette=['#3498db', '#e74c3c'])
class_axes.set_title('Distribution of Spam vs. Ham Emails')
class_axes.set_xlabel('Email Type (0 = Ham, 1 = Spam)')
class_axes.set_ylabel('Count')
plt.xticks([0, 1], ['Ham', 'Spam'])
class_axes.grid(axis='y', linestyle='--', alpha=0.7)

# Embed the rendered figure in the document as a base64 image
class_plot_html = save_plot_as_base64(plt)
display(HTML(class_plot_html))
Dataset shape: (4601, 58)

Feature groups:
word_features: 48 features
  First 5: ['word_freq_0', 'word_freq_1', 'word_freq_2', 'word_freq_3', 'word_freq_4']
char_features: 6 features
  First 5: ['char_freq_0', 'char_freq_1', 'char_freq_2', 'char_freq_3', 'char_freq_4']
other_features: 3 features
  All: ['capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total']

First few rows:
   word_freq_0  word_freq_1  word_freq_2  word_freq_3  word_freq_4  \
0         0.00         0.64         0.64          0.0         0.32   
1         0.21         0.28         0.50          0.0         0.14   
2         0.06         0.00         0.71          0.0         1.23   
3         0.00         0.00         0.00          0.0         0.63   
4         0.00         0.00         0.00          0.0         0.63   

   word_freq_5  word_freq_6  word_freq_7  word_freq_8  word_freq_9  ...  \
0         0.00         0.00         0.00         0.00         0.00  ...   
1         0.28         0.21         0.07         0.00         0.94  ...   
2         0.19         0.19         0.12         0.64         0.25  ...   
3         0.00         0.31         0.63         0.31         0.63  ...   
4         0.00         0.31         0.63         0.31         0.63  ...   

   char_freq_0  char_freq_1  char_freq_2  char_freq_3  char_freq_4  \
0         0.00        0.000          0.0        0.778        0.000   
1         0.00        0.132          0.0        0.372        0.180   
2         0.01        0.143          0.0        0.276        0.184   
3         0.00        0.137          0.0        0.137        0.000   
4         0.00        0.135          0.0        0.135        0.000   

   char_freq_5  capital_run_length_average  capital_run_length_longest  \
0        0.000                       3.756                          61   
1        0.048                       5.114                         101   
2        0.010                       9.821                         485   
3        0.000                       3.537                          40   
4        0.000                       3.537                          40   

   capital_run_length_total  is_spam  
0                       278        1  
1                      1028        1  
2                      2259        1  
3                       191        1  
4                       191        1  

[5 rows x 58 columns]

Class distribution:
is_spam
0    2788
1    1813
Name: count, dtype: int64
Spam ratio: 39.40%
Plot

4 Feature Analysis

Let’s analyze the most important features for distinguishing spam from non-spam emails.

# Calculate mean values for features in spam vs non-spam.
# The target column is excluded: its class means are 1 and 0 by construction,
# so leaving it in would rank 'is_spam' itself among the "top features".
feature_columns = df.columns.drop('is_spam')
spam_means = df.loc[df['is_spam'] == 1, feature_columns].mean()
ham_means = df.loc[df['is_spam'] == 0, feature_columns].mean()

# Find features with the largest differences.
# NOTE: these are raw mean differences, so features measured on a larger scale
# (e.g. the capital_run_length_* counts) naturally rank above the frequency features.
diff = spam_means - ham_means
top_diff_features = diff.abs().sort_values(ascending=False).head(10)

print("Top 10 features with the largest difference between spam and ham:")
print(top_diff_features)

# Create a bar chart for top differentiating features
plt.figure(figsize=(10, 6))
top_features = top_diff_features.index
# Shorten the labels but keep the feature family visible, so that e.g.
# 'word_freq_3' and 'char_freq_3' do not collapse into the same label '3'.
feature_names_clean = [
    f.replace('word_freq_', 'word: ').replace('char_freq_', 'char: ')
    for f in top_features
]

# Create a long-format dataframe (one row per feature/class pair) for plotting
plot_df = pd.DataFrame({
    'Feature': feature_names_clean,
    'Ham': ham_means[top_features].to_numpy(),
    'Spam': spam_means[top_features].to_numpy()
}).melt(id_vars='Feature', var_name='Class', value_name='Frequency')

# Plot
sns.barplot(x='Feature', y='Frequency', hue='Class', data=plot_df, palette=['#3498db', '#e74c3c'])
plt.title('Top Differentiating Features Between Spam and Ham')
plt.xticks(rotation=45, ha='right')
plt.legend(title='Email Type')
plt.tight_layout()

# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 features with the largest difference between spam and ham:
capital_run_length_total      309.148468
capital_run_length_longest     86.178780
capital_run_length_average      7.141864
word_freq_26                    1.263716
is_spam                         1.000000
word_freq_18                    0.994199
word_freq_20                    0.941668
word_freq_24                    0.877994
word_freq_15                    0.444775
word_freq_25                    0.422822
dtype: float64
Plot

5 Model Training and Evaluation

Now, let’s train a Random Forest classifier to predict spam and evaluate its performance.

# Separate the label from the predictors
y = df['is_spam']
X = df.drop(columns=['is_spam'])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardise using statistics learned from the training split only
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data
clf = RandomForestClassifier(random_state=42, n_estimators=100)
clf.fit(X_train_scaled, y_train)

# Predict the held-out rows
y_pred = clf.predict(X_test_scaled)

# Report accuracy and the per-class precision/recall/F1 table
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print("\nClassification Report:")
print(test_report)
Accuracy: 0.9554831704668838

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       531
           1       0.98      0.92      0.95       390

    accuracy                           0.96       921
   macro avg       0.96      0.95      0.95       921
weighted avg       0.96      0.96      0.96       921

6 Feature Importance

Let’s examine which features are most important for the Random Forest model’s predictions.

# Feature importance: pair every training column with the forest's importance
# score and rank from most to least important.
feature_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': clf.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 10 most important features for spam classification:")
print(feature_importances.head(10))

# Create a bar chart for feature importance
plt.figure(figsize=(10, 6))
top_n = 15
top_features = feature_importances.head(top_n)
# Shorten the labels but keep the feature family visible: stripping the prefix
# entirely would turn 'word_freq_3' and 'char_freq_3' into the same label '3',
# and seaborn would then merge them into a single bar.
feature_names_clean = [
    f.replace('word_freq_', 'word: ').replace('char_freq_', 'char: ')
    for f in top_features['feature']
]

# One tidy frame so both axes are referenced by column name.
importance_plot_df = pd.DataFrame({
    'feature': feature_names_clean,
    'importance': top_features['importance'].to_numpy()
})

sns.barplot(x='importance', y='feature', data=importance_plot_df, palette='viridis')
plt.title(f'Top {top_n} Most Important Features for Spam Classification')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.tight_layout()

# Display the plot using base64 encoding
display(HTML(save_plot_as_base64(plt)))
Top 10 most important features for spam classification:
                       feature  importance
51                 char_freq_3    0.113786
52                 char_freq_4    0.096676
6                  word_freq_6    0.081848
15                word_freq_15    0.067141
55  capital_run_length_longest    0.058623
54  capital_run_length_average    0.057715
56    capital_run_length_total    0.052335
20                word_freq_20    0.046336
24                word_freq_24    0.042364
18                word_freq_18    0.032968
Plot

7 Making Predictions

Finally, let’s create a function to make predictions on new data.

# Function to make predictions on new data
def predict_spam(new_data, model, scaler):
    """
    Scale new observations and classify them with a trained model.

    Parameters:
    -----------
    new_data : pandas DataFrame
        Observations to classify; should have the same features as the training data
    model : trained model
        Object providing ``predict`` and ``predict_proba``
    scaler : fitted scaler
        The scaler that was fitted on the training data; only ``transform`` is called

    Returns:
    --------
    predictions :
        Result of ``model.predict`` on the scaled data (1 for spam, 0 for not spam)
    probabilities :
        Result of ``model.predict_proba`` on the scaled data; the example in this
        document reads index 1 of each row as the probability of spam
    """
    # Apply the same scaling the model saw during training
    scaled_rows = scaler.transform(new_data)

    # Hard labels first, then the per-class probabilities
    return model.predict(scaled_rows), model.predict_proba(scaled_rows)

# Example of using the prediction function (with test data)
example_emails = X_test.iloc[:5]  # Take 5 examples from test set
predictions, probabilities = predict_spam(example_emails, clf, scaler)

# Create a results table
results_df = pd.DataFrame({
    'True Label': y_test.iloc[:5],
    'Predicted': predictions,
    'Probability of Spam': [p[1] for p in probabilities]
})

print("Example predictions:")
print(results_df)
Example predictions:
      True Label  Predicted  Probability of Spam
3683           0          0                 0.00
4412           0          0                 0.26
2584           0          0                 0.06
69             1          1                 0.57
1844           0          0                 0.04

8 Conclusion

In this document, we’ve explored the spambase dataset, trained a Random Forest classifier for spam detection, and analyzed the most important features for distinguishing spam from legitimate emails. The model reaches about 95.5% accuracy on the held-out test set (921 emails), with 0.98 precision and 0.92 recall on the spam class, demonstrating the effectiveness of the selected features in identifying spam emails.