\[\\[0.2in]\]

Building custom dictionaries/lexicons

\[\\[0.1in]\]

Housekeeping

Loading needed libraries

library(lexicon)
library(quanteda)
library(psych)
library(dplyr)
library(Hmisc)
library(ggplot2)

\[\\[0.1in]\]

Read YouTube data from a CSV file

# Load the YouTube comments. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned, whereas `TRUE` is a reserved word.
youtube <- read.csv("tut3a.youtube.csv", header = TRUE)

\[\\[0.001in]\]

\[\\[0.01in]\]

Creating dictionaries

# Custom lexicon for the YouTube comments: four categories, each a vector of
# patterns. A trailing `*` is a wildcard (quanteda matches dictionary values
# as glob patterns by default), so "tax*" covers "tax", "taxes", "taxation".
# NOTE(review): "don*" is very broad -- it would also match words such as
# "done" or "donate"; confirm this is intended for the partisanship category.
lexicon_yt <- dictionary(x = list(
  # Politicians, parties and partisan labels
  partisanship = c(
    "trump", "don*", "mitch", "mcconnell", "pence", "bush", "barack",
    "obama", "republican*", "democrat*", "gop", "dems", "nancy",
    "pelosi", "hillary", "clinton", "bernie", "sanders", "biden",
    "liberal*", "sociali*", "conservative*", "potus", "administration"
  ),
  # References to the country itself
  america = c(
    "usa", "america*", "united", "union", "country", "nation"
  ),
  # Jobs, money and economic hardship
  economy = c(
    "econ*", "dollar*", "job*", "tax*", "employ*", "unemploy*", "mone*",
    "pay*", "debt*", "homeless*", "poverty"
  ),
  # Emotionally loaded reactions, including profanity
  emotresponse = c(
    "lie*", "liar*", "true", "love", "hate", "shit", "sad", "hope",
    "bless*", "stupid", "idiot",
    "fuc*", "fake", "amaz*", "disgust*", "evil", "heaven", "hell"
  )
))

Create document-feature matrix

# Count lexicon hits per comment. The text is tokenised first and the
# dictionary is applied with tokens_lookup(): passing raw text or a
# `dictionary` argument straight to dfm() has been deprecated since
# quanteda 3.0 and is not supported by current releases.
ytdfm <- youtube$cleaner_text %>%
  tokens() %>%
  tokens_lookup(dictionary = lexicon_yt) %>%
  dfm()

# Convert the DFM to a data frame: a `doc_id` column followed by one count
# column per lexicon category
ytdfm2 <- convert(ytdfm, to = "data.frame")
names(ytdfm2)

## [1] "doc_id"       "partisanship" "america"      "economy"      "emotresponse"

# Let's get familiar with the derived data...
# describe() is called with its namespace because Hmisc, which is attached
# after psych, also exports a function named describe() and would mask it.
psych::describe(ytdfm2)

##              vars    n    mean      sd median trimmed    mad min  max range
## doc_id*         1 9633 4817.00 2780.95   4817 4817.00 3570.1   1 9633  9632
## partisanship    2 9633    0.77    2.19      0    0.45    0.0   0   73    73
## america         3 9633    0.24    0.77      0    0.07    0.0   0   32    32
## economy         4 9633    0.19    0.94      0    0.00    0.0   0   39    39
## emotresponse    5 9633    0.33    0.80      0    0.16    0.0   0   14    14
##               skew kurtosis    se
## doc_id*       0.00    -1.20 28.33
## partisanship 17.40   450.13  0.02
## america      11.57   336.32  0.01
## economy      14.07   382.12  0.01
## emotresponse  4.64    38.83  0.01

# The correlation matrix for the lexicon categories. Columns are selected by
# category name rather than by position (2:5), so the plot keeps covering
# exactly the lexicon counts if columns are added or reordered.
corPlot(cor(ytdfm2[names(lexicon_yt)]))

Transforming derived variables into dummies

# Turn each category count into a 0/1 indicator: 1 when the comment has at
# least one hit in that category, 0 otherwise (a missing count also maps to 0).
ytdfm2 <- ytdfm2 %>%
  mutate(
    partisan_dummy = if_else(partisanship > 0, 1, 0, missing = 0),
    america_dummy  = if_else(america > 0, 1, 0, missing = 0),
    economy_dummy  = if_else(economy > 0, 1, 0, missing = 0),
    emotresp_dummy = if_else(emotresponse > 0, 1, 0, missing = 0)
  )

# Let's get familiar with these dummies...
# Re-run the summary now that the four *_dummy columns exist; the mean of a
# 0/1 dummy is the share of comments with at least one hit in that category.

psych::describe(ytdfm2)

##                vars    n    mean      sd median trimmed    mad min  max range
## doc_id*           1 9633 4817.00 2780.95   4817 4817.00 3570.1   1 9633  9632
## partisanship      2 9633    0.77    2.19      0    0.45    0.0   0   73    73
## america           3 9633    0.24    0.77      0    0.07    0.0   0   32    32
## economy           4 9633    0.19    0.94      0    0.00    0.0   0   39    39
## emotresponse      5 9633    0.33    0.80      0    0.16    0.0   0   14    14
## partisan_dummy    6 9633    0.41    0.49      0    0.39    0.0   0    1     1
## america_dummy     7 9633    0.16    0.36      0    0.07    0.0   0    1     1
## economy_dummy     8 9633    0.09    0.29      0    0.00    0.0   0    1     1
## emotresp_dummy    9 9633    0.23    0.42      0    0.16    0.0   0    1     1
##                 skew kurtosis    se
## doc_id*         0.00    -1.20 28.33
## partisanship   17.40   450.13  0.02
## america        11.57   336.32  0.01
## economy        14.07   382.12  0.01
## emotresponse    4.64    38.83  0.01
## partisan_dummy  0.36    -1.87  0.01
## america_dummy   1.88     1.54  0.00
## economy_dummy   2.85     6.14  0.00
## emotresp_dummy  1.31    -0.28  0.00

# Give each row of the derived data its position, so it can be matched back
# to the comment it was computed from.
ytdfm2 <- ytdfm2 %>%
  mutate(row_num = row_number())

# Attach the lexicon counts and dummies to the original comments. The join
# key is named explicitly instead of letting dplyr guess it from whichever
# column names the two tables happen to share.
# NOTE(review): assumes `youtube` already has a `row_num` column numbering
# the comments 1..n in file order -- confirm against tut3a.youtube.csv.
youtube <- youtube %>%
  left_join(ytdfm2, by = "row_num")

# Save the combined data to a CSV file. Row names are omitted so the file
# does not gain an extra unnamed index column each time it is written and
# read back in.
write.csv(youtube, "tut3b.youtube.csv", row.names = FALSE)

DSO2008 2023-2024 Working with Big Data

Eliyahu V. Sapir

23 November 2023

Building custom dictionaries/lexicons

Housekeeping

Loading needed libraries

Read YouTube data from a CSV file

Creating dictionaries

Create document-feature matrix

Transforming derived variables into dummies