# Point the session at the project folder so the relative CSV paths used
# below resolve, then echo the working directory to confirm the change.
project_dir <- "/Users/isaiahmireles/Desktop/Trump folder"
setwd(project_dir)
getwd()
## [1] "/Users/isaiahmireles/Desktop/Trump folder"
# Load the tweet-level dataset and list the columns it provides.
df <- read.csv(file = "tweet_themes_with_sentiment.csv")
names(df)
##  [1] "date"           "platform"       "handle"         "text"          
##  [5] "favorite_count" "repost_count"   "deleted_flag"   "word_count"    
##  [9] "hashtags"       "urls"           "user_mentions"  "media_count"   
## [13] "media_urls"     "post_url"       "text_clean"     "theme_label"   
## [17] "confidence"     "word_scores"    "sentiment"      "pos"           
## [21] "neu"            "neg"            "but_count"
# Parse the raw "YYYY-MM-DD HH:MM:SS" strings and keep only the calendar day.
# The time zone is pinned to UTC in both steps: without it, as.POSIXct()
# parses in the session's local zone while as.Date() on a POSIXct converts in
# UTC by default, which can push evening timestamps onto the following day
# (and turns local DST-gap times into NA).
df$date <- as.POSIXct(df$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
df$date <- as.Date(df$date, tz = "UTC")
# List the distinct theme labels present in the data.
unique(df$theme_label)
## [1] "economics"    "immigration"  "homelessness" "religion"     "education"

The topic labels are:

"economics"    "immigration"  "homelessness" "religion"     "education"   

We will use government data for each variable.

Date Range

# Earliest and latest tweet dates in the dataset.
c(min(df$date), max(df$date))
## [1] "2009-05-12" "2026-01-06"
# Raw timestamps are YYYY-MM-DD HH:MM:SS; after as.Date() only YYYY-MM-DD remains.

Economics

Gross Domestic Product

Real Gross Domestic Product

Real GDP Per Capita

Median Consumer Price Index

Exports of goods and services

Imports of goods and services

# Earliest tweet date; each series below is trimmed to start no earlier
# than this.
start_date <- as.Date(min(df$date))

# GDP.csv: parse the observation dates, then drop rows before start_date.
GDP <- transform(
  read.csv("GDP.csv"),
  observation_date = as.Date(observation_date)
)
GDP <- GDP[GDP[["observation_date"]] >= start_date, ]

# REALGDP.csv: same treatment.
REALGDP <- transform(
  read.csv("REALGDP.csv"),
  observation_date = as.Date(observation_date)
)
REALGDP <- REALGDP[REALGDP[["observation_date"]] >= start_date, ]

# GDPPERCAPITA.csv: same treatment.
GDPPERCAPITA <- transform(
  read.csv("GDPPERCAPITA.csv"),
  observation_date = as.Date(observation_date)
)
GDPPERCAPITA <- GDPPERCAPITA[
  GDPPERCAPITA[["observation_date"]] >= start_date,
]

# MEDIANCPI.csv: same treatment.
MEDIANCPI <- transform(
  read.csv("MEDIANCPI.csv"),
  observation_date = as.Date(observation_date)
)
MEDIANCPI <- MEDIANCPI[MEDIANCPI[["observation_date"]] >= start_date, ]

# Exports.csv: parse the observation dates, then keep rows on or after
# start_date. The date mask is built from Exports, so it must index Exports
# itself; indexing a different data frame would return that frame's rows.
Exports <- read.csv("Exports.csv")
Exports$observation_date <- as.Date(Exports$observation_date)
Exports <- Exports[Exports$observation_date >= start_date, ]

# IMPORTS.csv: parse the observation dates, then keep rows on or after
# start_date. The date mask is built from IMPORTS, so it must index IMPORTS
# itself; indexing a different data frame would return that frame's rows.
IMPORTS <- read.csv("IMPORTS.csv")
IMPORTS$observation_date <- as.Date(IMPORTS$observation_date)
IMPORTS <- IMPORTS[IMPORTS$observation_date >= start_date, ]

Homelessness

Unemployment Rate

Not in Labor Force - Want a Job Now, Marginally Attached (Searched for Work in Previous Year, Available to Work Now) (LNU05026642)

# UNRATE.csv: parse the observation dates, then drop rows before start_date.
UNRATE <- transform(
  read.csv("UNRATE.csv"),
  observation_date = as.Date(observation_date)
)
UNRATE <- UNRATE[UNRATE[["observation_date"]] >= start_date, ]

# SearchingWork.csv: same treatment.
SearchingWork <- transform(
  read.csv("SearchingWork.csv"),
  observation_date = as.Date(observation_date)
)
SearchingWork <- SearchingWork[
  SearchingWork[["observation_date"]] >= start_date,
]

Joining Data

Education

Government current expenditures: Education

# GovExpendituresEdu.csv: parse the observation dates, then keep rows on or
# after start_date. The date mask is built from GovExpendituresEdu, so it must
# index GovExpendituresEdu itself; indexing a different data frame would
# return that frame's rows.
GovExpendituresEdu <- read.csv("GovExpendituresEdu.csv")
GovExpendituresEdu$observation_date <- as.Date(
  GovExpendituresEdu$observation_date
)
GovExpendituresEdu <- GovExpendituresEdu[
  GovExpendituresEdu$observation_date >= start_date,
]

Immigration

Nationwide Encounters

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Attach every indicator series to the tweets, one left join at a time and in
# the order listed, matching tweet `date` against each series'
# `observation_date`.
# NOTE(review): this matches on exact calendar dates; if a series is reported
# monthly or quarterly, most tweet dates will find no match and come back NA.
# Confirm that is intended.
df_joined <- Reduce(
  function(acc, series) {
    left_join(acc, series, by = c("date" = "observation_date"))
  },
  list(
    GDP,
    REALGDP,
    GDPPERCAPITA,
    MEDIANCPI,
    Exports,
    IMPORTS,
    UNRATE,
    SearchingWork,
    GovExpendituresEdu
  ),
  init = df
)

Visualizing Missing Values (NAs)

library(naniar)
library(dplyr)
library(tidyr)
library(ggplot2)

# Append naniar's shadow columns (suffix "_NA") to the joined data.
df_shadow <- bind_shadow(df_joined)

# Reshape to long form: one row per (date, shadow column) pair.
df_long <- pivot_longer(
  select(df_shadow, date, ends_with("_NA")),
  cols = -date,
  names_to = "variable",
  values_to = "missing"
)

# Tile map of the missingness indicator for each variable over time.
ggplot(df_long) +
  geom_tile(aes(x = date, y = variable, fill = missing)) +
  scale_fill_manual(values = c("grey80", "red")) +
  ggtitle("Missing Data Across Time") +
  xlab("Date") +
  ylab("Variable")