# Make the project folder the working directory so the relative CSV file names
# passed to read.csv() later in this script resolve against it.
# NOTE(review): the absolute path is specific to one machine; the script will
# fail at this line anywhere that folder does not exist.
setwd("/Users/isaiahmireles/Desktop/Trump folder")
# Report the working directory now in effect.
getwd()
## [1] "/Users/isaiahmireles/Desktop/Trump folder"
# Read the tweet table from the working directory, then list its column names.
df <- read.csv(file = "tweet_themes_with_sentiment.csv")
names(df)
## [1] "date" "platform" "handle" "text"
## [5] "favorite_count" "repost_count" "deleted_flag" "word_count"
## [9] "hashtags" "urls" "user_mentions" "media_count"
## [13] "media_urls" "post_url" "text_clean" "theme_label"
## [17] "confidence" "word_scores" "sentiment" "pos"
## [21] "neu" "neg" "but_count"
# Parse the timestamp strings and reduce them to calendar dates.
# Both steps are pinned to UTC. Without an explicit tz, as.POSIXct() parses in
# the session's local time zone while as.Date() on a POSIXct converts in UTC by
# default, so a timestamp late in the local day could land on the following
# date, and times falling in a local DST gap could become NA.
# NOTE(review): this keeps the calendar date exactly as written in the file;
# confirm which time zone the source timestamps were recorded in.
df$date <- as.POSIXct(df$date, format = "%Y-%m-%d %H:%M:%S", tz = "UTC")
df$date <- as.Date(df$date, tz = "UTC")
# Show the distinct values of the theme_label column.
unique(df$theme_label)
## [1] "economics" "immigration" "homelessness" "religion" "education"
The topic labels are:
"economics" "immigration" "homelessness" "religion" "education"
We will use government data for each variable.
range(df$date)
## [1] "2009-05-12" "2026-01-06"
# Earliest date in df (already of class Date at this point); used below as the
# lower cutoff when trimming each external series.
start_date <- min(df$date)
# Each series below follows the same three steps: read the CSV, convert its
# observation_date column to Date, and keep only rows dated on or after
# start_date.

# GDP.csv
GDP <- read.csv(file = "GDP.csv")
GDP[["observation_date"]] <- as.Date(GDP[["observation_date"]])
GDP <- GDP[GDP[["observation_date"]] >= start_date, ]

# REALGDP.csv
REALGDP <- read.csv(file = "REALGDP.csv")
REALGDP[["observation_date"]] <- as.Date(REALGDP[["observation_date"]])
REALGDP <- REALGDP[REALGDP[["observation_date"]] >= start_date, ]

# GDPPERCAPITA.csv
GDPPERCAPITA <- read.csv(file = "GDPPERCAPITA.csv")
GDPPERCAPITA[["observation_date"]] <-
  as.Date(GDPPERCAPITA[["observation_date"]])
GDPPERCAPITA <-
  GDPPERCAPITA[GDPPERCAPITA[["observation_date"]] >= start_date, ]

# MEDIANCPI.csv
MEDIANCPI <- read.csv(file = "MEDIANCPI.csv")
MEDIANCPI[["observation_date"]] <- as.Date(MEDIANCPI[["observation_date"]])
MEDIANCPI <- MEDIANCPI[MEDIANCPI[["observation_date"]] >= start_date, ]
# Exports.csv: read, convert observation_date to Date, and keep rows dated on
# or after start_date.
Exports <- read.csv("Exports.csv")
Exports$observation_date <- as.Date(Exports$observation_date)
# Filter Exports itself. The row mask was previously applied to GDPPERCAPITA,
# which silently replaced this table with per-capita GDP rows.
Exports <- Exports[Exports$observation_date >= start_date, ]

# IMPORTS.csv: same steps as above.
IMPORTS <- read.csv("IMPORTS.csv")
IMPORTS$observation_date <- as.Date(IMPORTS$observation_date)
# Filter IMPORTS itself (same copy-paste slip as above: GDPPERCAPITA was being
# indexed instead).
IMPORTS <- IMPORTS[IMPORTS$observation_date >= start_date, ]
# UNRATE.csv: read, convert observation_date to Date, and keep rows dated on or
# after start_date.
UNRATE <- read.csv(file = "UNRATE.csv")
UNRATE[["observation_date"]] <- as.Date(UNRATE[["observation_date"]])
UNRATE <- UNRATE[UNRATE[["observation_date"]] >= start_date, ]

# SearchingWork.csv: same steps as above.
SearchingWork <- read.csv(file = "SearchingWork.csv")
SearchingWork[["observation_date"]] <-
  as.Date(SearchingWork[["observation_date"]])
SearchingWork <-
  SearchingWork[SearchingWork[["observation_date"]] >= start_date, ]
Joining Data
Government current expenditures: Education
# GovExpendituresEdu.csv: read, convert observation_date to Date, and keep rows
# dated on or after start_date.
GovExpendituresEdu <- read.csv("GovExpendituresEdu.csv")
GovExpendituresEdu$observation_date <- as.Date(GovExpendituresEdu$observation_date)
# Filter GovExpendituresEdu itself. The row mask was previously applied to
# SearchingWork, which replaced this table with rows from that series.
GovExpendituresEdu <- GovExpendituresEdu[GovExpendituresEdu$observation_date >= start_date, ]
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Fold the external series onto the tweet table one left join at a time, in
# the order listed, matching df$date against each table's observation_date.
# Every row of df is kept; columns from a series are NA wherever no observation
# carries exactly the same date.
# NOTE(review): matching is by exact date equality, so rows whose date has no
# same-day observation in a series get NA for it -- confirm this is intended.
df_joined <- Reduce(
  function(acc, series) {
    left_join(acc, series, by = c("date" = "observation_date"))
  },
  list(
    GDP, REALGDP, GDPPERCAPITA, MEDIANCPI, Exports,
    IMPORTS, UNRATE, SearchingWork, GovExpendituresEdu
  ),
  df
)
Visualizing missing values (NAs)
library(naniar)
library(dplyr)
library(tidyr)
library(ggplot2)
# Build the shadow version of the joined table with naniar::bind_shadow().
# NOTE(review): presumably this appends one "<column>_NA" indicator per
# original column; the selection below relies on that "_NA" suffix.
df_shadow <- bind_shadow(df_joined)

# Reshape to long form: keep date plus every column ending in "_NA", then
# stack those columns into (variable, missing) pairs.
df_long <- pivot_longer(
  select(df_shadow, date, ends_with("_NA")),
  cols = -date,
  names_to = "variable",
  values_to = "missing"
)

# Tile chart of the indicator values: date on the x axis, one row per
# indicator column, fill colour taken from the "missing" value.
ggplot(data = df_long, mapping = aes(x = date, y = variable, fill = missing)) +
  geom_tile() +
  scale_fill_manual(values = c("grey80", "red")) +
  labs(title = "Missing Data Across Time", x = "Date", y = "Variable")