# Load packages ----
library(lexicon)
library(quanteda)
library(psych)
library(dplyr)
library(Hmisc)
library(ggplot2)
# Read the YouTube data ----
# Read the input CSV; its first line supplies the column names.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned elsewhere.
youtube <- read.csv("tut3a.youtube.csv", header = TRUE)
# Build the topic dictionary and the dictionary-based DFM ----
# Topic word lists. Each name becomes a dictionary key; entries containing "*"
# are wildcard patterns (quanteda dictionary lookups use glob matching by
# default).
topic_terms <- list(
  partisanship = c(
    "trump", "don*", "mitch", "mcconnell", "pence", "bush", "barack",
    "obama", "republican*", "democrat*", "gop", "dems", "nancy",
    "pelosi", "hillary", "clinton", "bernie", "sanders", "biden",
    "liberal*", "sociali*", "conservative*", "potus", "administration"
  ),
  america = c("usa", "america*", "united", "union", "country", "nation"),
  economy = c(
    "econ*", "dollar*", "job*", "tax*", "employ*", "unemploy*", "mone*",
    "pay*", "debt*", "homeless*", "poverty"
  ),
  emotresponse = c(
    "lie*", "liar*", "true", "love", "hate", "shit", "sad", "hope",
    "bless*", "stupid", "idiot",
    "fuc*", "fake", "amaz*", "disgust*", "evil", "heaven", "hell"
  )
)

# Wrap the word lists in a quanteda dictionary object.
lexicon_yt <- dictionary(topic_terms)
# Count dictionary matches per document.
# The text is tokenised first and the dictionary applied with dfm_lookup():
# calling dfm() directly on a character vector, and passing `dictionary =` to
# dfm(), were deprecated in quanteda 3 and are not supported in quanteda 4.
# as.character() guards against the column having been read as a factor.
yt_tokens <- tokens(as.character(youtube$cleaner_text))
ytdfm <- dfm_lookup(dfm(yt_tokens), dictionary = lexicon_yt)
# Convert the DFM to a data frame (one row per document, one count column per
# dictionary key, plus doc_id)
ytdfm2 <- convert(ytdfm, to = "data.frame")
names(ytdfm2)
## [1] "doc_id" "partisanship" "america" "economy" "emotresponse"
# Summary statistics for the dictionary count columns.
# psych:: is written out because Hmisc, attached after psych, also exports a
# describe() function.
psych::describe(ytdfm2)
## vars n mean sd median trimmed mad min max range
## doc_id* 1 9633 4817.00 2780.95 4817 4817.00 3570.1 1 9633 9632
## partisanship 2 9633 0.77 2.19 0 0.45 0.0 0 73 73
## america 3 9633 0.24 0.77 0 0.07 0.0 0 32 32
## economy 4 9633 0.19 0.94 0 0.00 0.0 0 39 39
## emotresponse 5 9633 0.33 0.80 0 0.16 0.0 0 14 14
## skew kurtosis se
## doc_id* 0.00 -1.20 28.33
## partisanship 17.40 450.13 0.02
## america 11.57 336.32 0.01
## economy 14.07 382.12 0.01
## emotresponse 4.64 38.83 0.01
# Correlation matrix of the four count columns (positions 2 to 5; position 1
# is doc_id), drawn as a plot
topic_counts <- ytdfm2[2:5]
topic_cor <- cor(topic_counts)
corPlot(topic_cor)
# Presence/absence indicators for each dictionary: 1 when the document has at
# least one match, 0 otherwise (a missing count also maps to 0).
ytdfm2 <- ytdfm2 %>%
  mutate(
    partisan_dummy = if_else(partisanship > 0, 1, 0, missing = 0),
    america_dummy = if_else(america > 0, 1, 0, missing = 0),
    economy_dummy = if_else(economy > 0, 1, 0, missing = 0),
    emotresp_dummy = if_else(emotresponse > 0, 1, 0, missing = 0)
  )
# Summary statistics again, now including the indicator columns
psych::describe(ytdfm2)
## vars n mean sd median trimmed mad min max range
## doc_id* 1 9633 4817.00 2780.95 4817 4817.00 3570.1 1 9633 9632
## partisanship 2 9633 0.77 2.19 0 0.45 0.0 0 73 73
## america 3 9633 0.24 0.77 0 0.07 0.0 0 32 32
## economy 4 9633 0.19 0.94 0 0.00 0.0 0 39 39
## emotresponse 5 9633 0.33 0.80 0 0.16 0.0 0 14 14
## partisan_dummy 6 9633 0.41 0.49 0 0.39 0.0 0 1 1
## america_dummy 7 9633 0.16 0.36 0 0.07 0.0 0 1 1
## economy_dummy 8 9633 0.09 0.29 0 0.00 0.0 0 1 1
## emotresp_dummy 9 9633 0.23 0.42 0 0.16 0.0 0 1 1
## skew kurtosis se
## doc_id* 0.00 -1.20 28.33
## partisanship 17.40 450.13 0.02
## america 11.57 336.32 0.01
## economy 14.07 382.12 0.01
## emotresponse 4.64 38.83 0.01
## partisan_dummy 0.36 -1.87 0.01
## america_dummy 1.88 1.54 0.00
## economy_dummy 2.85 6.14 0.00
## emotresp_dummy 1.31 -0.28 0.00
# Key each dictionary row by its position. The DFM was built from
# youtube$cleaner_text, so row i of ytdfm2 corresponds to row i of youtube.
ytdfm2 <- ytdfm2 %>%
  mutate(row_num = row_number())
# Give youtube the same positional key when it does not already carry one.
# NOTE(review): if the input CSV already has a row_num column, this assumes it
# is the 1..n row index -- confirm against the script that produced the file.
if (!"row_num" %in% names(youtube)) {
  youtube <- youtube %>%
    mutate(row_num = row_number())
}
# Name the join key explicitly rather than letting left_join() infer it from
# whichever column names the two tables happen to share.
youtube <- youtube %>%
  left_join(ytdfm2, by = "row_num")
# Save the combined data to a CSV file
# (write.csv() writes the row names as a leading column by default)
write.csv(youtube, "tut3b.youtube.csv")