knitr::opts_chunk$set(echo = TRUE)
# reticulate allows for running Python code in the R environment
library(reticulate)
# set the environment and options specific to reticulate
use_condaenv("r-reticulate", required=TRUE)
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import datetime as dt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle
import time
import re
nltk.download('punkt_tab')
## True
nltk.download('wordnet')
## True
Import data:
df = pd.read_csv('data/reddit_data.csv')
df['Text'].isnull().value_counts()
## Text
## True 10426
## False 2428
## Name: count, dtype: int64
df['Title'].isnull().value_counts()
## Title
## False 12854
## Name: count, dtype: int64
Only a small minority of posts (2,428 of 12,854) have body text, but every post has a title, so we will work with the title text alone as a starting point.
df['datetime'] = df['Date Created'].apply(dt.datetime.fromtimestamp)
df['datetime'].apply(lambda x: x.year).value_counts()
## datetime
## 2022 7039
## 2021 4360
## 2020 521
## 2018 143
## 2019 130
## 2013 103
## 2016 94
## 2011 92
## 2014 83
## 2015 80
## 2012 72
## 2017 55
## 2010 46
## 2009 36
## Name: count, dtype: int64
Our data stretches back all the way to 2009, but the majority is from 2021 and 2022. For the sake of recency, and in recognition that political discourse can change rapidly over the course of a decade, we will only retain data from those most recent two years.
df = df[df['datetime'].apply(lambda x: x.year) >= 2021]
Now we will keep only the fields we care about: Title, which will provide our features, and Political Lean, our target variable. There is also a 'score' variable measuring the engagement a post received; that could be interesting for predicting post performance, but it's out of scope for this analysis.
df = df[['Title', 'Political Lean']].rename(
    columns={'Title': 'title', 'Political Lean': 'political_lean'}
)
Since the target is binary, I can convert it into 0 and 1. Going alphabetically, I’ll make Conservative 0 and Liberal 1.
df['political_lean'] = df['political_lean'].map({'Conservative': 0,
                                                 'Liberal': 1})
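As a quick sanity check, we can confirm the mapping produced only 0s and 1s and look at the class balance (based on the test-set supports reported later, Liberal posts outnumber Conservative roughly 2:1):
df['political_lean'].value_counts()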
Now I'll write a function to clean the text: lowercasing, stripping URLs, digits, and punctuation, collapsing whitespace, and then tokenizing and lemmatizing each title.
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    # Lowercase and strip URLs, digits, and punctuation
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse any whitespace left behind by the removals
    text = re.sub(r"\s+", " ", text).strip()
    # Tokenize, then lemmatize each token (default noun lemmatization)
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)
df['title_clean'] = df['title'].apply(clean_and_lemmatize)
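To sanity-check the cleaning pipeline, we can run it on a made-up example (this input string is hypothetical, not a post from the dataset):
# Should yield something like 'breaking new policy announced':
# URL, digits, and punctuation stripped, 'policies' lemmatized to 'policy'
clean_and_lemmatize("Breaking: 3 new policies announced!! https://example.com")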
Now I can vectorize the data. I'll use TF-IDF vectorization, which weights each term's frequency by its inverse document frequency, down-weighting words that appear across many posts rather than relying on raw counts alone.
# Create the vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000
)
# Fit and transform the text column
X = vectorizer.fit_transform(df['title_clean'])
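It can be worth a quick look at what the vectorizer produced, for instance the matrix dimensions and a sample of the learned vocabulary (the exact terms shown depend on the fitted vocabulary):
# Sparse document-term matrix: one row per post, one column per term (capped at 10,000)
X.shape
# First few vocabulary terms, alphabetically
vectorizer.get_feature_names_out()[:5]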
And set y:
y = df['political_lean']
Now we can split the data:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=905
)
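One alternative worth noting: since the classes are imbalanced (roughly two Liberal posts for every Conservative one), a stratified split would preserve those proportions in both partitions. A sketch, not what was run above:
# Same split, but keeping class proportions equal across train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=905, stratify=y
)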
Run the SVM, trying different kernels:
# Linear kernel
clf_linear = SVC(kernel='linear')
clf_linear.fit(X_train, y_train)
## SVC(kernel='linear')
pred_linear = clf_linear.predict(X_test)
# RBF kernel
clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train, y_train)
## SVC()
pred_rbf = clf_rbf.predict(X_test)
# Polynomial kernel (degree 3 is default)
clf_poly = SVC(kernel='poly', degree=3)
clf_poly.fit(X_train, y_train)
## SVC(kernel='poly')
pred_poly = clf_poly.predict(X_test)
print("Linear Kernel")
## Linear Kernel
print(classification_report(y_test, pred_linear))
## precision recall f1-score support
##
## 0 0.72 0.51 0.60 795
## 1 0.77 0.89 0.83 1485
##
## accuracy 0.76 2280
## macro avg 0.75 0.70 0.71 2280
## weighted avg 0.76 0.76 0.75 2280
print("RBF Kernel")
## RBF Kernel
print(classification_report(y_test, pred_rbf))
## precision recall f1-score support
##
## 0 0.81 0.45 0.58 795
## 1 0.76 0.94 0.84 1485
##
## accuracy 0.77 2280
## macro avg 0.79 0.70 0.71 2280
## weighted avg 0.78 0.77 0.75 2280
print("Polynomial Kernel")
## Polynomial Kernel
print(classification_report(y_test, pred_poly))
## precision recall f1-score support
##
## 0 0.92 0.22 0.35 795
## 1 0.70 0.99 0.82 1485
##
## accuracy 0.72 2280
## macro avg 0.81 0.60 0.59 2280
## weighted avg 0.78 0.72 0.66 2280
The three kernels produced comparable overall accuracy, but the polynomial kernel's recall for the Conservative class was far lower (0.22, versus 0.51 for linear and 0.45 for RBF), dragging down its F1 score.
For the sake of comparison, let's fit a simple random forest:
# Initialize and train
rf = RandomForestClassifier(n_estimators=100, random_state=905)
rf.fit(X_train, y_train)
## RandomForestClassifier(random_state=905)
# Predict and evaluate
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
## precision recall f1-score support
##
## 0 0.73 0.56 0.63 795
## 1 0.79 0.89 0.84 1485
##
## accuracy 0.77 2280
## macro avg 0.76 0.72 0.73 2280
## weighted avg 0.77 0.77 0.76 2280
If we dig deeper into the linear-kernel SVM, we can inspect its coefficients to see which terms were most predictive of each ideological lean.
# Get feature names and coefficients
feature_names = vectorizer.get_feature_names_out()
coefs = clf_linear.coef_.toarray().flatten()
# Combine into a DataFrame
coef_df = pd.DataFrame({
    'feature': feature_names,
    'weight': coefs
})
Top 20 terms predicting Conservative:
coef_df.sort_values(by='weight', ascending=True).iloc[:20, :]
## feature weight
## 4539 libertarian -3.819915
## 5168 nov -3.571341
## 2124 desantis -3.501517
## 1613 cpac -2.960295
## 902 capitalism -2.893035
## 5915 property -2.657235
## 7798 sowell -2.615943
## 4061 inflation -2.559516
## 5212 oct -2.537729
## 8878 trudeau -2.523635
## 2577 durham -2.446805
## 6047 putin -2.385296
## 3833 hillary -2.318959
## 315 antiwork -2.315644
## 5037 narrative -2.263012
## 6977 russian -2.231567
## 663 bitcoin -2.217985
## 9007 ukraine -2.143356
## 88 added -2.114426
## 3202 fed -2.097963
Top 20 terms predicting Liberal:
coef_df.sort_values(by='weight', ascending=False).iloc[:20, :]
## feature weight
## 4167 invades 2.128395
## 7974 statement 1.746002
## 5270 opinion 1.728149
## 9808 worker 1.711885
## 1579 coup 1.676956
## 8493 tension 1.625141
## 7751 solidarity 1.568630
## 2033 democracy 1.555745
## 4946 modern 1.515167
## 4237 jan 1.496771
## 6113 racist 1.469324
## 1859 deal 1.469110
## 5549 plan 1.457777
## 8526 text 1.448866
## 3778 healthcare 1.446009
## 5503 perspective 1.409805
## 3138 farright 1.403697
## 3228 female 1.383830
## 4621 losing 1.382278
## 5484 perfect 1.378698
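Since matplotlib is already imported, these weights lend themselves to a quick chart. A sketch of one way to plot the 20 strongest terms in each direction (the figure size and labels are my choices, not from the original analysis):
# Strongest Conservative-leaning (most negative) and Liberal-leaning (most positive) terms
top_con = coef_df.sort_values(by='weight').head(20)
top_lib = coef_df.sort_values(by='weight', ascending=False).head(20)
plot_df = pd.concat([top_con, top_lib]).sort_values(by='weight')

plt.figure(figsize=(8, 10))
plt.barh(plot_df['feature'], plot_df['weight'])
plt.axvline(0, color='black', linewidth=0.8)
plt.xlabel('Linear SVM coefficient (negative = Conservative, positive = Liberal)')
plt.title('Most predictive title terms by political lean')
plt.tight_layout()
plt.show()
The fitted vectorizer and clf_linear could likewise be persisted with pickle (imported above) if the model were to be reused later.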