knitr::opts_chunk$set(echo = TRUE)
# reticulate allows for running Python code in the R environment
library(reticulate)
# set the environment and options specific to reticulate
use_condaenv("r-reticulate", required=TRUE)
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import datetime as dt
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pickle
import time
import re
nltk.download('punkt_tab')
## True
nltk.download('wordnet')
## True
Import data:
df = pd.read_csv('data/reddit_data.csv')
df['Text'].isnull().value_counts()
## Text
## True 10426
## False 2428
## Name: count, dtype: int64
df['Title'].isnull().value_counts()
## Title
## False 12854
## Name: count, dtype: int64
Only a small minority of posts (2,428 of 12,854) have body text, but every post has a title, so we will work with the title text alone as a starting point.
df['datetime'] = df['Date Created'].apply(dt.datetime.fromtimestamp)
df['datetime'].apply(lambda x: x.year).value_counts()
## datetime
## 2022 7039
## 2021 4360
## 2020 521
## 2018 143
## 2019 130
## 2013 103
## 2016 94
## 2011 92
## 2014 83
## 2015 80
## 2012 72
## 2017 55
## 2010 46
## 2009 36
## Name: count, dtype: int64
Our data stretches back all the way to 2009, but the majority is from 2021 and 2022. For the sake of recency, and in recognition that political discourse can change rapidly over the course of a decade, we will only retain data from those most recent two years.
df = df[df['datetime'].apply(lambda x: x.year) >= 2021]
Now we will keep only the fields we care about: Title, which will provide our features, and Political Lean, our target variable. There is also a 'score' variable measuring the engagement a post received; that could be interesting for predicting post performance, but it's out of scope for this analysis.
df = df[['Title', 'Political Lean']].rename(
    columns={'Title': 'title', 'Political Lean': 'political_lean'}
)
Since the target is binary, I can convert it into 0 and 1. Going alphabetically, I’ll make Conservative 0 and Liberal 1.
df['political_lean'] = df['political_lean'].map({'Conservative': 0,
                                                 'Liberal': 1})
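As a quick sanity check, we can confirm the mapping produced only 0s and 1s and look at the class balance (based on the test-set supports reported later, Liberal posts outnumber Conservative roughly 2:1):
df['political_lean'].value_counts()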
Now I'll write a function to clean the text: lowercasing, stripping URLs, digits, and punctuation, collapsing whitespace, and then tokenizing and lemmatizing each title.
lemmatizer = WordNetLemmatizer()

def clean_and_lemmatize(text):
    # Lowercase and strip URLs, digits, and punctuation
    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Collapse any whitespace left behind by the removals
    text = re.sub(r"\s+", " ", text).strip()
    # Tokenize, then lemmatize each token (default noun lemmatization)
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)
df['title_clean'] = df['title'].apply(clean_and_lemmatize)
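To sanity-check the cleaning pipeline, we can run it on a made-up example (this input string is hypothetical, not a post from the dataset):
# Should yield something like 'breaking new policy announced':
# URL, digits, and punctuation stripped, 'policies' lemmatized to 'policy'
clean_and_lemmatize("Breaking: 3 new policies announced!! https://example.com")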
Now I can vectorize the data. I'll use TF-IDF vectorization, which weights each term's frequency by its inverse document frequency, down-weighting words that appear across many posts rather than relying on raw counts alone.
# Create the vectorizer
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=10000
)
# Fit and transform the text column
X = vectorizer.fit_transform(df['title_clean'])
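It can be worth a quick look at what the vectorizer produced, for instance the matrix dimensions and a sample of the learned vocabulary (the exact terms shown depend on the fitted vocabulary):
# Sparse document-term matrix: one row per post, one column per term (capped at 10,000)
X.shape
# First few vocabulary terms, alphabetically
vectorizer.get_feature_names_out()[:5]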
And set y:
y = df['political_lean']
Now we can split the data:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=905
)
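One alternative worth noting: since the classes are imbalanced (roughly two Liberal posts for every Conservative one), a stratified split would preserve those proportions in both partitions. A sketch, not what was run above:
# Same split, but keeping class proportions equal across train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=905, stratify=y
)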
Run the SVM, trying different kernels:
# Linear kernel
clf_linear = SVC(kernel='linear')
clf_linear.fit(X_train, y_train)
## SVC(kernel='linear')
pred_linear = clf_linear.predict(X_test)
# RBF kernel
clf_rbf = SVC(kernel='rbf')
clf_rbf.fit(X_train, y_train)
## SVC()
pred_rbf = clf_rbf.predict(X_test)
# Polynomial kernel (degree 3 is default)
clf_poly = SVC(kernel='poly', degree=3)
clf_poly.fit(X_train, y_train)
## SVC(kernel='poly')
pred_poly = clf_poly.predict(X_test)
print("Linear Kernel")
## Linear Kernel
print(classification_report(y_test, pred_linear))
## precision recall f1-score support
##
## 0 0.72 0.51 0.60 795
## 1 0.77 0.89 0.83 1485
##
## accuracy 0.76 2280
## macro avg 0.75 0.70 0.71 2280
## weighted avg 0.76 0.76 0.75 2280
print("RBF Kernel")
## RBF Kernel
print(classification_report(y_test, pred_rbf))
## precision recall f1-score support
##
## 0 0.81 0.45 0.58 795
## 1 0.76 0.94 0.84 1485
##
## accuracy 0.77 2280
## macro avg 0.79 0.70 0.71 2280
## weighted avg 0.78 0.77 0.75 2280
print("Polynomial Kernel")
## Polynomial Kernel
print(classification_report(y_test, pred_poly))
## precision recall f1-score support
##
## 0 0.92 0.22 0.35 795
## 1 0.70 0.99 0.82 1485
##
## accuracy 0.72 2280
## macro avg 0.81 0.60 0.59 2280
## weighted avg 0.78 0.72 0.66 2280
The three kernels produced comparable overall accuracy, but the polynomial kernel's recall for the Conservative class was far lower (0.22, versus 0.51 for linear and 0.45 for RBF), dragging down its F1 score.
For the sake of comparison, let's fit a simple random forest:
# Initialize and train
rf = RandomForestClassifier(n_estimators=100, random_state=905)
rf.fit(X_train, y_train)
## RandomForestClassifier(random_state=905)
# Predict and evaluate
y_pred_rf = rf.predict(X_test)
print(classification_report(y_test, y_pred_rf))
## precision recall f1-score support
##
## 0 0.73 0.56 0.63 795
## 1 0.79 0.89 0.84 1485
##
## accuracy 0.77 2280
## macro avg 0.76 0.72 0.73 2280
## weighted avg 0.77 0.77 0.76 2280
If we dig deeper into the linear-kernel SVM, we can inspect its coefficients to see which terms were most predictive of each ideological lean.
# Get feature names and coefficients
feature_names = vectorizer.get_feature_names_out()
coefs = clf_linear.coef_.toarray().flatten()
# Combine into a DataFrame
coef_df = pd.DataFrame({
    'feature': feature_names,
    'weight': coefs
})
Top 20 terms predicting Conservative:
coef_df.sort_values(by='weight', ascending=True).iloc[:20, :]
## feature weight
## 4539 libertarian -3.819915
## 5168 nov -3.571341
## 2124 desantis -3.501517
## 1613 cpac -2.960295
## 902 capitalism -2.893035
## 5915 property -2.657235
## 7798 sowell -2.615943
## 4061 inflation -2.559516
## 5212 oct -2.537729
## 8878 trudeau -2.523635
## 2577 durham -2.446805
## 6047 putin -2.385296
## 3833 hillary -2.318959
## 315 antiwork -2.315644
## 5037 narrative -2.263012
## 6977 russian -2.231567
## 663 bitcoin -2.217985
## 9007 ukraine -2.143356
## 88 added -2.114426
## 3202 fed -2.097963
Top 20 terms predicting Liberal:
coef_df.sort_values(by='weight', ascending=False).iloc[:20, :]
## feature weight
## 4167 invades 2.128395
## 7974 statement 1.746002
## 5270 opinion 1.728149
## 9808 worker 1.711885
## 1579 coup 1.676956
## 8493 tension 1.625141
## 7751 solidarity 1.568630
## 2033 democracy 1.555745
## 4946 modern 1.515167
## 4237 jan 1.496771
## 6113 racist 1.469324
## 1859 deal 1.469110
## 5549 plan 1.457777
## 8526 text 1.448866
## 3778 healthcare 1.446009
## 5503 perspective 1.409805
## 3138 farright 1.403697
## 3228 female 1.383830
## 4621 losing 1.382278
## 5484 perfect 1.378698
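Since matplotlib is already imported, these weights lend themselves to a quick chart. A sketch of one way to plot the 20 strongest terms in each direction (the figure size and labels are my choices, not from the original analysis):
# Strongest Conservative-leaning (most negative) and Liberal-leaning (most positive) terms
top_con = coef_df.sort_values(by='weight').head(20)
top_lib = coef_df.sort_values(by='weight', ascending=False).head(20)
plot_df = pd.concat([top_con, top_lib]).sort_values(by='weight')

plt.figure(figsize=(8, 10))
plt.barh(plot_df['feature'], plot_df['weight'])
plt.axvline(0, color='black', linewidth=0.8)
plt.xlabel('Linear SVM coefficient (negative = Conservative, positive = Liberal)')
plt.title('Most predictive title terms by political lean')
plt.tight_layout()
plt.show()
The fitted vectorizer and clf_linear could likewise be persisted with pickle (imported above) if the model were to be reused later.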