This course provides a systematic introduction to the tools and analytical methods used by data analysts, with special attention to sentiment analysis: the process of computationally identifying and categorizing opinions expressed in text. Specifically, you will learn how to collect and organize data at scale, and gain new insight into how Big Data analysis can help address high-impact research questions.
The code in this document guides you through the three stages of the analysis: scraping/harvesting data, pre-processing and wrangling data, and analyzing and reporting results. You can copy snippets and paste them into your own R script in RStudio.
if(!require(academictwitteR)){install.packages("academictwitteR");library(academictwitteR)}
if(!require(data.table)){install.packages("data.table");library(data.table)}
if(!require(dplyr)){install.packages("dplyr");library(dplyr)}
if(!require(gmodels)){install.packages("gmodels");library(gmodels)}
if(!require(ggplot2)){install.packages("ggplot2");library(ggplot2)}
if(!require(ggpubr)){install.packages("ggpubr");library(ggpubr)}
if(!require(Hmisc)){install.packages("Hmisc");library(Hmisc)}
if(!require(lexicon)){install.packages("lexicon");library(lexicon)}
if(!require(lm.beta)) {install.packages("lm.beta"); library(lm.beta)}
if(!require(psych)){install.packages("psych");library(psych)}
if(!require(qdapDictionaries)){install.packages("qdapDictionaries");library(qdapDictionaries)}
if(!require(quanteda)){install.packages("quanteda");library(quanteda)}
if(!require(RColorBrewer)){install.packages("RColorBrewer");library(RColorBrewer)}
if(!require(rlang)){install.packages("rlang");library(rlang)}
if(!require(rtweet)){install.packages("rtweet");library(rtweet)}
if(!require(SentimentAnalysis)){install.packages("SentimentAnalysis");library(SentimentAnalysis)}
if(!require(sentimentr)){install.packages("sentimentr");library(sentimentr)}
if(!require(SnowballC)){install.packages("SnowballC");library(SnowballC)}
if(!require(stringr)){install.packages("stringr");library(stringr)}
if(!require(summarytools)){install.packages("summarytools");library(summarytools)}
if(!require(tidyr)){install.packages("tidyr");library(tidyr)}
if(!require(tidytext)){install.packages("tidytext");library(tidytext)}
if(!require(tidyverse)){install.packages("tidyverse");library(tidyverse)}
if(!require(tm)){install.packages("tm");library(tm)}
if(!require(vader)){install.packages("vader");library(vader)}
if(!require(wordcloud2)){install.packages("wordcloud2");library(wordcloud2)}
if(!require(disk.frame)){install.packages("disk.frame");library(disk.frame)}
# in case the disk.frame package does not download or install properly, run the following line:
# devtools::install_github("xiaodaigh/disk.frame")
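If you prefer a single call, the same loading can be done with the pacman package; this is an optional alternative (pacman is not otherwise used in this document), so it is left commented out:
# Optional one-liner alternative, assuming you are willing to install pacman:
# if(!require(pacman)) install.packages("pacman")
# pacman::p_load(academictwitteR, data.table, dplyr, gmodels, ggplot2, ggpubr,
#                Hmisc, lexicon, lm.beta, psych, qdapDictionaries, quanteda,
#                RColorBrewer, rlang, rtweet, SentimentAnalysis, sentimentr,
#                SnowballC, stringr, summarytools, tidyr, tidytext, tidyverse,
#                tm, vader, wordcloud2, disk.frame)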
df1 <- search_tweets(q= "inflation OR #inflation", # These are our search terms
n= 10, # Number of tweets we want
include_rts= FALSE, # No re-tweets
lang="en", # Tweets in English only
retryonratelimit = TRUE)
names(df1) # These are all the variables in the dataframe.
# In WBD we will not use all variables but only a selection of them.
# Selecting WBD variables
df1_slim<- df1 %>%
dplyr::select(user_id,
status_id,
screen_name,
created_at,
text,
source,
reply_to_status_id,
reply_to_user_id,
reply_to_screen_name,
is_quote,
is_retweet,
favorite_count,
retweet_count,
quote_count,
reply_count,
hashtags,
urls_url,
urls_expanded_url,
mentions_user_id,
mentions_screen_name,
lang,
location,
description,
account_created_at,
followers_count,
friends_count,
statuses_count,
profile_url,
profile_image_url)
# Converting created_at into date-time format
df1_slim <- df1_slim %>%
mutate(created_at = as.POSIXct(created_at, format="%Y-%m-%dT%H:%M:%OS"))
write_as_csv(df1_slim, "inflation1.csv") # Saving the dataset
# This dataset is available on Canvas
# NB
#
# Before you can continue, you will need to receive a token from your tutor. Your tutor will explain how to use this token; once it is recognized by the Twitter API, you can proceed to downloading your data.
#
#
#
# Once you have set the token, you can search Twitter using the API
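As a minimal sketch (assuming your tutor has given you a bearer token), academictwitteR reads the token from the TWITTER_BEARER environment variable:
# Sketch: storing the bearer token for this session (replace with your own token)
# Sys.setenv(TWITTER_BEARER = "YOUR-TOKEN-HERE")
# get_bearer() # should return the token you just set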
datapath = "inflation_method2/" # specify a path name for where you want to store the data
# Make the API-call
df2 <- get_all_tweets(
  query = "inflation OR #inflation", # These are our search terms
  n = 10000, # Number of tweets we want
  is_retweet = FALSE, # No re-tweets
  lang = "en", # Tweets in English only
  start_tweets = "2019-12-01T00:00:00Z", # Start time
  end_tweets = "2021-12-01T00:00:00Z", # End time
  file = "inflation",
  data_path = datapath
)
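Because get_all_tweets also writes every page of results to data_path as it downloads, an interrupted call does not have to start over. A sketch of re-assembling the files already stored on disk, without a new API call:
# Sketch: rebuild the tweet dataframe from the files saved in datapath
# df2 <- bind_tweets(data_path = datapath)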
df <- df2 %>% mutate(tweetid = row_number()) # Create ID column
# Unnesting
df <- unnest(df, cols= c(entities, public_metrics, attachments, geo))
df_hashes <- unnest(df, cols=hashtags)
df_mentions <- unnest(df, cols=mentions, names_repair=tidyr::tidyr_legacy)
df_urls <- unnest(df, cols=urls)
df_reftweets <- unnest(df, cols=referenced_tweets, names_repair=tidyr::tidyr_legacy)
# Renesting and joining back
df_hashes <- df_hashes %>% group_by(tweetid) %>% summarise(hashes=list(tag))
df <- df %>% left_join(df_hashes)
df_mentions <- df_mentions %>% group_by(tweetid) %>% summarise(mentioned_username=list(username), mentioned_userid = list(id1))
df <- df %>% left_join(df_mentions)
df_urls <- df_urls %>% group_by(tweetid) %>%
summarise(urls_url=list(url), urls_expanded_url= list(expanded_url))
df <- df %>% left_join(df_urls)
df_reftweets <- df_reftweets %>%
group_by(tweetid) %>% summarise(type=list(type), rt_tweetid= list(id1))
df <- df %>% left_join(df_reftweets)
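To see what this unnest/re-nest round trip does, here is a toy illustration on made-up data (the tibble and its values are hypothetical):
toy <- tibble(tweetid = 1:2,
  hashtags = list(c("inflation", "economy"), "prices"))
toy_long <- unnest(toy, cols = hashtags) # one row per hashtag
toy_long %>%
  group_by(tweetid) %>%
  summarise(hashes = list(hashtags)) # back to one row per tweet, as a list-column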
# Now looking at the users' data:
df_users <- bind_tweets(data_path = datapath, user=TRUE)
df_users <- df_users %>% unnest(public_metrics)
df_users <- df_users %>%
rename(user_id = id,
profile_url = url,
account_created_at = created_at,
friends_count = following_count,
screen_name = username,
statuses_count = tweet_count)
names(df_users)
# Merging User data with tweet data
df <- df %>%
rename(user_id = author_id) %>%
inner_join(df_users) %>%
distinct(id, .keep_all=TRUE)
# Transforming, adding, and getting rid of redundant columns
df$type<-as.character(df$type)
df$rt_tweetid<-as.character(df$rt_tweetid)
df <- df %>%
mutate(is_quote = case_when(type == "quoted"~TRUE, TRUE~FALSE),
is_reply = case_when(type == "replied_to"~TRUE, TRUE~FALSE),
is_retweet = case_when(type == "retweeted"~TRUE, TRUE~FALSE))
# Restructure and rename
df <- df %>%
dplyr::select(user_id,
status_id=id,
screen_name,
created_at,
text,
source,
reply_to_status_id=rt_tweetid,
reply_to_user_id = in_reply_to_user_id,
#reply_to_screen_name,
is_quote,
is_reply,
is_retweet,
favorite_count=like_count,
retweet_count,
quote_count,
reply_count,
hashtags = hashes,
urls_url,
urls_expanded_url,
mentions_user_id = mentioned_userid,
mentions_screen_name = mentioned_username,
lang,
location,
description,
account_created_at,
followers_count,
friends_count,
statuses_count,
profile_url,
profile_image_url)
# NB: the select() call above already assigned the final variable names, so no
# further renaming is needed. (reply_to_screen_name is not directly available
# from the v2 API payload, which is why it is commented out above.)
# Converting created_at into date-time format
df <- df %>% mutate(created_at = as.POSIXct(created_at, format="%Y-%m-%dT%H:%M:%OS"))
write_as_csv(df, "inflation2.csv") # Saving the dataset
# This dataset is available on Canvas
#
setup_disk.frame(4)
## The number of workers available for disk.frame is 4
# Setting up four workers (cores)
output_path <- file.path(tempdir(), "dframe.df")
# Chunks will temporarily be saved at tempdir
# You can find tempdir (on windows) at: C:\Users\User\AppData\Local\Temp
start <- Sys.time()
dframe <- csv_to_disk.frame("mydata1.csv")
end <- Sys.time()
# Loading chunked data to secondary memory (tempdir)
end-start
## Time difference of 24.529 secs
# reading the csv into a disk.frame
class(dframe)
## [1] "disk.frame" "disk.frame.folder"
dframe
## path: "C:\Users\EVS\AppData\Local\Temp\RtmpsVISKr\file3cffc5b396f6c.df"
## nchunks: 4
## nrow (at source): 1e+06
## ncol (at source): 95
## nrow (post operations): ???
## ncol (post operations): ???
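If even a single chunk strains your RAM, the chunking can be controlled explicitly; a sketch, where in_chunk_size sets the number of rows read per chunk:
# Sketch: finer control over chunk size when reading the csv
# dframe <- csv_to_disk.frame("mydata1.csv", outdir = output_path,
#                             in_chunk_size = 250000)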
names(dframe)
## [1] "tweet_id" "text" "author_id"
## [4] "conversation_id" "created_at" "in_reply_to_user"
## [7] "lang" "possibly_sensitive" "retweets"
## [10] "replies" "likes" "quotes"
## [13] "source" "mentions" "hashtags"
## [16] "urls" "type_retweet" "org_tweet_text"
## [19] "org_tweet_id" "org_author_id" "org_conversation_id"
## [22] "org_created_at" "org_retweets" "org_replies"
## [25] "org_likes" "org_quotes" "org_mentions"
## [28] "org_hashtags" "org_urls" "org_account_username"
## [31] "org_account_followers" "org_account_following" "org_account_volume"
## [34] "org_account_bio" "org_account_created_at" "account_username"
## [37] "account_followers" "account_following" "account_volume"
## [40] "account_bio" "account_created_at" "text_lowercase"
## [43] "astra_count" "moderna_count" "pfizer_count"
## [46] "johnson1_count" "johnson2_count" "johnson3_count"
## [49] "johnson4_count" "johnson5_count" "sputnik_count"
## [52] "johnson_count" "total_vax_mention" "urls_lowercase"
## [55] "urls_astra_count" "urls_moderna_count" "urls_pfizer_count"
## [58] "urls_johnson1_count" "urls_johnson2_count" "urls_johnson3_count"
## [61] "urls_johnson4_count" "urls_johnson5_count" "urls_sputnik_count"
## [64] "urls_johnson_count" "urls_total_vax_mention" "vax_mention_text_url"
## [67] "urls_johnson6_count" "urls_johnson7_count" "johnson6_count"
## [70] "johnson7_count" "prop_astra" "prop_moderna"
## [73] "prop_pfizer" "prop_johnson" "prop_sputnik"
## [76] "astra_dummy" "vax_brand" "moderna_dummy"
## [79] "pfizer_dummy" "johnson_dummy" "sputnik_dummy"
## [82] "as.Date.created_at." "daily_mentions" "daily_mentions_astra"
## [85] "daily_mentions_pfizer" "daily_mentions_moderna" "daily_mentions_johnson"
## [88] "daily_mentions_sputnik" "daily_prop_astra" "daily_prop_pfizer"
## [91] "daily_prop_moderna" "daily_prop_johnson" "daily_prop_sputnik"
## [94] "clean_tweets" "dateonly"
daily_tweets <- dframe %>% # Create object on PRIMARY memory
srckeep(c("created_at")) %>% # Use one disk.frame variable
group_by(created_at) %>% # Group by date-time
summarise(n()) %>% # Summarize n per group
collect
# The last verb -- collect -- is extremely important.
# Disk.frame is lazy, and will not execute until you ask for it
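You can see this laziness for yourself: intermediate steps remain disk.frame objects, and nothing is pulled into RAM until collect is called. A small sketch using the dframe object from above:
lazy_step <- dframe %>% srckeep("created_at") %>% group_by(created_at)
class(lazy_step) # still a disk.frame - nothing has been computed yet
# lazy_step %>% summarise(n = n()) %>% collect # only now does computation happen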
# Creating a line plot (time-series) based on the dataframe created in previous step
ts_plot(daily_tweets, by="day", col="red", lwd = 0.8) +
theme_classic() +
labs(x=NULL, y=NULL,
title="Trend line of daily tweets",
caption= "WBD @ Maastricht University")
setup_disk.frame(4)
## The number of workers available for disk.frame is 4
# Setting up four workers (cores)
output_path <- file.path(tempdir(), "diskf.df")
# Chunks will temporarily be saved at tempdir
# You can find tempdir (on windows) at: C:\Users\User\AppData\Local\Temp
start <- Sys.time()
diskf <- csv_to_disk.frame("mydata2.csv")
end <- Sys.time()
# Loading chunked data to secondary memory (tempdir)
end-start
## Time difference of 8.908 secs
# First we create a dataframe that identifies all unique tweeters
tweeters <- diskf %>%
srckeep("screen_name") %>%
group_by(screen_name) %>%
collect
# NB: Again - since disk.frame is lazy, the code will execute only after we call collect
# Creating a list of top 25 tweeters
tweeters %>%
count(screen_name, sort=TRUE) %>%
top_n(25)
## Selecting by n
## screen_name n
## 1: openletterbot 700
## 2: leftauthbot 320
## 3: eutrophy121 290
## 4: Albert97164279 250
## 5: HRHAdeyemiAdams 210
## 6: Rosenchild 210
## 7: jpg9843 200
## 8: copetrumplost 150
## 9: thejoshuablog 150
## 10: tyrant_free 130
## 11: SpringchatApp 120
## 12: globalissuesweb 110
## 13: KarenChestney 110
## 14: Truth20The 110
## 15: GaryMaryBennet1 100
## 16: JWalkerLoveSTEM 100
## 17: nevermore_007 100
## 18: saraninarymer 100
## 19: TrumpSuperDuper 100
## 20: virenj3 100
## 21: zachsnews 100
## 22: MajicLove 90
## 23: NeilGerardo 90
## 24: NowResign 90
## 25: xiaokaizuishuai 90
## screen_name n
vis_tweeters <- tweeters %>%
count(screen_name, sort=TRUE) %>%
top_n(15) %>%
ggplot(aes(x= reorder(screen_name, n), y= n)) +
geom_bar(stat ="identity", color = "darkorange1", fill = "darkorange1")+
coord_flip()+
theme_classic()+
labs(title="Top 15 tweeters",
x = "Twitter handles",
y = "Number of tweets")
## Selecting by n
# To obtain this graph, run:
# vis_tweeters
# We will create one object with multiple graphs
platforms <- diskf %>%
srckeep("source") %>%
group_by(source) %>%
collect
platforms %>%
count(source, sort=TRUE) %>%
top_n(10)
## Selecting by n
## source n
## 1: Twitter for iPhone 382090
## 2: Twitter Web App 304530
## 3: Twitter for Android 226800
## 4: Twitter for iPad 41890
## 5: TweetDeck 11190
## 6: SocialFlow 3350
## 7: dlvr.it 2730
## 8: SocialNewsDesk 2730
## 9: WordPress.com 1830
## 10: IFTTT 1700
vis_platforms <- platforms %>%
count(source, sort=TRUE) %>%
top_n(10) %>%
ggplot(aes(x= reorder(source, n), y= n)) +
geom_bar(stat ="identity", color = "red", fill = "red")+
coord_flip()+
theme_classic()+
labs(title="Top 10 platforms",
x = "Platform",
y = "Number of tweets")
## Selecting by n
# vis_platforms
mentiones <- diskf %>%
srckeep("mentions_screen_name") %>%
group_by(mentions_screen_name) %>%
collect
mentiones %>%
count(mentions_screen_name, sort=TRUE) %>%
top_n(25) %>%
na.omit()
## Selecting by n
## mentions_screen_name n
## 1: 397100
## 2: realDonaldTrump 13900
## 3: YouTube 5250
## 4: JoeBiden 4120
## 5: tedcruz 2750
## 6: SpeakerPelosi 2610
## 7: CNN 2580
## 8: LindseyGrahamSC 2470
## 9: Mike_Pence 2390
## 10: thehill 2270
## 11: HawleyMO simonschuster 2070
## 12: RepMoBrooks 2040
## 13: FBI 1870
## 14: HawleyMO 1650
## 15: MSNBC 1630
## 16: NBCNews 1600
## 17: brithume 1590
## 18: VP 1580
## 19: SenTedCruz 1550
## 20: nytimes 1480
## 21: Yahoo 1390
## 22: washingtonpost 1340
## 23: AOC 1310
## 24: nypost 1200
## 25: SenSchumer 1200
## mentions_screen_name n
vis_mentiones <- mentiones %>%
count(mentions_screen_name, sort=TRUE) %>%
top_n(15) %>%
na.omit() %>%
ggplot(aes(x= reorder(mentions_screen_name, n), y= n)) +
geom_bar(stat ="identity", color = "dodgerblue1", fill = "dodgerblue1")+
coord_flip()+
theme_classic()+
labs(title="Top 15 mentiones",
x = "Platform",
y = "Number of tweets")
## Selecting by n
# vis_mentiones
quoted <- diskf %>%
srckeep("quoted_screen_name") %>%
group_by(quoted_screen_name) %>%
collect
quoted %>%
count(quoted_screen_name, sort=TRUE) %>%
top_n(25) %>%
na.omit()
## Selecting by n
## quoted_screen_name n
## 1: 910300
## 2: nypost 680
## 3: realDonaldTrump 640
## 4: marcorubio 610
## 5: CNN 590
## 6: atrupar 570
## 7: NBCNews 560
## 8: JoeBiden 550
## 9: thehill 550
## 10: kylegriffin1 510
## 11: FBI 490
## 12: DanScavino 470
## 13: RexChapman 470
## 14: nytimes 440
## 15: HawleyMO 400
## 16: SethAbramson 400
## 17: donwinslow 380
## 18: MSNBC 370
## 19: maggieNYT 360
## 20: tedcruz 360
## 21: LindseyGrahamSC 350
## 22: PhilipinDC 340
## 23: Acosta 330
## 24: AOC 330
## 25: kaitlancollins 330
## quoted_screen_name n
vis_quoted <- quoted %>%
count(quoted_screen_name, sort=TRUE) %>%
top_n(15) %>%
na.omit() %>%
ggplot(aes(x= reorder(quoted_screen_name, n), y= n)) +
geom_bar(stat ="identity", color = "green3", fill = "green3")+
coord_flip()+
theme_classic()+
labs(title="Top 15 quoted screen name",
x = "screen name",
y = "Number of tweets")
## Selecting by n
# vis_quoted
country <- diskf %>%
srckeep("country") %>%
group_by(country) %>%
collect
country %>%
count(country, sort=TRUE) %>%
top_n(25) %>%
na.omit()
## Selecting by n
## country n
## 1: 978760
## 2: United States 16250
## 3: United Kingdom 1290
## 4: Canada 660
## 5: Australia 480
## 6: India 410
## 7: Nigeria 310
## 8: South Africa 240
## 9: Ireland 150
## 10: Germany 110
## 11: Pakistan 80
## 12: Republic of the Philippines 80
## 13: Kenya 70
## 14: The Netherlands 70
## 15: Italy 60
## 16: France 50
## 17: Japan 50
## 18: Malaysia 50
## 19: Norway 50
## 20: People's Republic of China 50
## 21: Israel 40
## 22: Hong Kong 30
## 23: Malawi 30
## 24: Mexico 30
## 25: New Zealand 30
## 26: Spain 30
## 27: Zambia 30
## country n
vis_country <- country %>%
count(country, sort=TRUE) %>%
top_n(15) %>%
na.omit() %>%
ggplot(aes(x= reorder(country, n), y= n)) +
geom_bar(stat ="identity", color = "mediumorchid2", fill = "mediumorchid2")+
coord_flip()+
theme_classic()+
labs(title="Top 15 locations",
x = "country",
y = "Number of tweets")
## Selecting by n
# vis_country
ggarrange(vis_tweeters, vis_platforms, vis_mentiones, vis_quoted, vis_country,
ncol = 3, nrow = 2)
# When we're done with the metadata, we can remove the dataframes we have created from the primary memory:
rm(country, daily_tweets, mentiones, platforms, quoted, tweeters,
vis_country, vis_mentiones, vis_platforms, vis_quoted, vis_tweeters)
output_path <- file.path(tempdir(), "dframe")
start <- Sys.time()
dframe <- csv_to_disk.frame("mydata2.csv")
end <- Sys.time()
# Loading chunked data to secondary memory (tempdir)
end-start
## Time difference of 5.468 secs
tweetext <- dframe %>%
srckeep(c("row_num", "created_at", "text")) %>% # These are the three variables we will work with
collect # Don't forget to collect...
# It's useful to create a backup dataframe, just in case something goes wrong:
tweetext_bck <- tweetext
# cleaning the tweets' text
start <- Sys.time()
tweetext <- tweetext %>%
mutate(clean_text = text, # create a new variable - clean_text
clean_text = tolower(clean_text), # change all letters to lowercase
clean_text = gsub("@\\S+", "", clean_text),# remove @ symbols, followed by anything except space
clean_text = gsub("http(s?):\\S+", "", clean_text), # remove URLs, followed by anything except space
clean_text = gsub("#\\S+", "", clean_text), # remove hashtags, followed by anything except space
clean_text = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", clean_text), # remove retweeter identity
clean_text = gsub("[[:punct:]]", "", clean_text), # remove punctuations, including ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~
clean_text = gsub("<U\\+?[0-9a-fA-F]+>", " ", clean_text), # remove emoticons
clean_text = gsub("[^\x20-\x7E]", "", clean_text), # remove non-ASCII characters
clean_text = gsub("[[:digit:]]", "", clean_text), # remove digits
clean_text = gsub("\n", " ", clean_text), # remove line breaks
clean_text = gsub("^\\s+|\\s+$|\\s+(?=\\s)", "", clean_text, perl=T)) # remove trailing, leading spaces and extra internal whitespace
end <- Sys.time()
end-start
## Time difference of 24.544 secs
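As a sanity check, here are the same cleaning rules applied to one made-up tweet (the text is hypothetical):
demo <- tibble(text = "RT @user1: Inflation hits 7%!! See https://t.co/abc #prices")
demo %>% mutate(clean_text = tolower(text),
  clean_text = gsub("@\\S+", "", clean_text),
  clean_text = gsub("http(s?):\\S+", "", clean_text),
  clean_text = gsub("#\\S+", "", clean_text),
  clean_text = gsub("[[:punct:]]", "", clean_text),
  clean_text = gsub("[[:digit:]]", "", clean_text),
  clean_text = gsub("^\\s+|\\s+$|\\s+(?=\\s)", "", clean_text, perl = TRUE))
# clean_text should come out as "rt inflation hits see"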
# tweetext_bck <- tweetext # Again - backing up the data
tweetext %>%
dplyr::select(row_num, text, clean_text) %>%
group_by(row_num) %>%
head
## # A tibble: 6 x 3
## # Groups: row_num [6]
## row_num text clean_text
## <int> <chr> <chr>
## 1 1 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
## 2 2 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
## 3 3 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
## 4 4 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
## 5 5 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
## 6 6 'This was NO surprise': Obama sl~ this was no surprise obama slams ma~
# View(tweetext)
## Pre-processing
start <- Sys.time()
tweetext2 <- tweetext %>%
unnest_tokens(input = clean_text, output = word) %>%
filter(!word %in% stop_words$word) %>%
group_by(row_num) %>%
summarise(cleaner_text = paste(word, collapse = " "))
end <- Sys.time()
end-start
## Time difference of 15.796 secs
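On a single made-up sentence, the same tokenize-filter-paste round trip looks like this:
tibble(row_num = 1, clean_text = "this was no surprise to the markets") %>%
  unnest_tokens(input = clean_text, output = word) %>%
  filter(!word %in% stop_words$word) %>%
  group_by(row_num) %>%
  summarise(cleaner_text = paste(word, collapse = " "))
# only the content words survive, roughly "surprise markets"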
tweetext <- tweetext %>%
left_join(tweetext2)
## Joining, by = "row_num"
# tweetext_bck <- tweetext
tweetext %>%
dplyr::select(row_num, text, clean_text, cleaner_text) %>%
group_by(row_num) %>%
head
## # A tibble: 6 x 4
## # Groups: row_num [6]
## row_num text clean_text cleaner_text
## <int> <chr> <chr> <chr>
## 1 1 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
## 2 2 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
## 3 3 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
## 4 4 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
## 5 5 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
## 6 6 'This was NO surprise~ this was no surprise oba~ surprise obama slams~
get_sentiments("nrc")
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ... with 13,891 more rows
#look up words
get_sentiments("nrc") %>%
filter(word == "happy")
## # A tibble: 4 x 2
## word sentiment
## <chr> <chr>
## 1 happy anticipation
## 2 happy joy
## 3 happy positive
## 4 happy trust
get_sentiments("nrc") %>%
filter(word == "cry")
## # A tibble: 2 x 2
## word sentiment
## <chr> <chr>
## 1 cry negative
## 2 cry sadness
get_sentiments("nrc") %>%
filter(word == "apple")
## # A tibble: 0 x 2
## # ... with 2 variables: word <chr>, sentiment <chr>
#look up words for a given sentiment
get_sentiments("nrc") %>%
filter(sentiment == "joy")
## # A tibble: 689 x 2
## word sentiment
## <chr> <chr>
## 1 absolution joy
## 2 abundance joy
## 3 abundant joy
## 4 accolade joy
## 5 accompaniment joy
## 6 accomplish joy
## 7 accomplished joy
## 8 achieve joy
## 9 achievement joy
## 10 acrobat joy
## # ... with 679 more rows
get_sentiments("nrc") %>%
filter(sentiment == "fear")
## # A tibble: 1,476 x 2
## word sentiment
## <chr> <chr>
## 1 abandon fear
## 2 abandoned fear
## 3 abandonment fear
## 4 abduction fear
## 5 abhor fear
## 6 abhorrent fear
## 7 abominable fear
## 8 abomination fear
## 9 abortion fear
## 10 absence fear
## # ... with 1,466 more rows
start <- Sys.time()
tweetnrc <- tweetext %>%
mutate(row_num = row_number()) %>%
group_by(row_num) %>%
unnest_tokens(word, cleaner_text) %>%
full_join(get_sentiments("nrc")) %>%
count(sentiment) %>%
spread(sentiment, n, fill=0)
## Joining, by = "word"
end <- Sys.time()
end-start
## Time difference of 1.7657 mins
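The same join-count-spread logic can be traced on one toy tweet (the input is made up; inner_join is used here so only sentiment-bearing words survive):
tibble(row_num = 1, cleaner_text = "abandon hope") %>%
  unnest_tokens(word, cleaner_text) %>%
  inner_join(get_sentiments("nrc")) %>%
  count(row_num, sentiment) %>%
  spread(sentiment, n, fill = 0)
# each word adds a count to every NRC category it is listed under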
rcorr(as.matrix(tweetnrc[1:1000000, 2:11]))
## anger anticipation disgust fear joy negative positive sadness
## anger 1.00 0.20 0.58 0.68 0.10 0.76 0.19 0.59
## anticipation 0.20 1.00 0.14 0.23 0.58 0.23 0.47 0.18
## disgust 0.58 0.14 1.00 0.48 0.09 0.60 0.12 0.52
## fear 0.68 0.23 0.48 1.00 0.12 0.70 0.28 0.60
## joy 0.10 0.58 0.09 0.12 1.00 0.10 0.59 0.12
## negative 0.76 0.23 0.60 0.70 0.10 1.00 0.21 0.69
## positive 0.19 0.47 0.12 0.28 0.59 0.21 1.00 0.16
## sadness 0.59 0.18 0.52 0.60 0.12 0.69 0.16 1.00
## surprise 0.26 0.27 0.23 0.21 0.26 0.25 0.19 0.25
## trust 0.17 0.46 0.14 0.27 0.56 0.19 0.74 0.15
## surprise trust
## anger 0.26 0.17
## anticipation 0.27 0.46
## disgust 0.23 0.14
## fear 0.21 0.27
## joy 0.26 0.56
## negative 0.25 0.19
## positive 0.19 0.74
## sadness 0.25 0.15
## surprise 1.00 0.19
## trust 0.19 1.00
##
## n= 1000000
##
##
## P
## anger anticipation disgust fear joy negative positive sadness
## anger 0 0 0 0 0 0 0
## anticipation 0 0 0 0 0 0 0
## disgust 0 0 0 0 0 0 0
## fear 0 0 0 0 0 0 0
## joy 0 0 0 0 0 0 0
## negative 0 0 0 0 0 0 0
## positive 0 0 0 0 0 0 0
## sadness 0 0 0 0 0 0 0
## surprise 0 0 0 0 0 0 0 0
## trust 0 0 0 0 0 0 0 0
## surprise trust
## anger 0 0
## anticipation 0 0
## disgust 0 0
## fear 0 0
## joy 0 0
## negative 0 0
## positive 0 0
## sadness 0 0
## surprise 0
## trust 0
tweetext <- tweetext %>%
left_join(tweetnrc)
## Joining, by = "row_num"
tweetext_bck <- tweetext
start <- Sys.time()
tweetext %>%
mutate(datetime=as.POSIXct(created_at, format="%Y-%m-%d %H:%M:%S"),
interval= as.POSIXct(cut(datetime, breaks = "2 hours"))) %>%
group_by(interval) %>%
summarise(meanjoy = mean(joy),
meanfear = mean(fear),
meananger = mean(anger),
meansurprise = mean(surprise))%>%
ggplot() +
geom_line(aes(x=interval, y=meanjoy, color = "Joy")) +
geom_line(aes(x=interval, y=meanfear, color = "Fear")) +
geom_line(aes(x=interval, y=meananger, color = "Anger")) +
geom_line(aes(x=interval, y=meansurprise, color = "Surprise"))+
#geom_vline(xintercept= as.POSIXct("2020-12-9 17:51:42", format="%Y-%m-%d %H:%M:%S")) +
theme_classic() +
labs(x=NULL, y = NULL,
title = "Time series for NRC scores",
subtitle = "Scores are averages per 2 hour") +
scale_color_manual(name="Sentiments in tweets",
values = c("Joy" = "chartreuse2",
"Fear"= "brown2",
"Anger"= "darkmagenta",
"Surprise" = "deepskyblue1"))
end <- Sys.time()
end-start
## Time difference of 32.313 secs
### Populism-liberalism lexicon
populism.liberalism.lexicon <- dictionary(
list(populism = c("elit*", "consensus*", "undemocratic*", "referend*",
"corrupt*", "propagand", "politici*", "*deceit*",
"*deceiv*", "*betray*", "shame*", "scandal*",
"truth*", "dishonest*", "establishm*", "ruling*"),
# developed by Ken Benoit
liberalism = c("liber*", "free*", "indiv*", "open*", "law*",
"rules", "order", "rights", "trade", "global",
"inter*", "trans*", "minori*", "exchange", "market*")))
# developed by Cornelius Puschmann & Mario Haim
woke.lexicon <- dictionary(
list(woke_term = c(
"aapi", "accountab*", "ally", "antiracism", "appropriat*", "bame",
"binary", "bipoc", "black", "black lives matter", "blm", "bopo",
"bropropriate", "brown", "cancel culture", "cis", "cisgender", "class",
"clicktivism", "conscious capitalism", "critical race theory", "crt",
"dead name", "decololization", "discriminat*", "disparate impact",
"divers*", "diversity", "drag", "environmental justice", "equity",
"fast fashion", "flexitarianism", "gaslight", "gender", "greenwash",
"hate spe*", "heteronormativ*", "heteronormative", "identity politics",
"implicit bias", "inclusi*", "indigenous", "internalis* ", "internaliz*",
"intersectional*", "latinx", "lgbt*", "mansplain*", "marginaliz*",
"masculinity", "micro agress*", "microagress*", "misogyn*", "nonbinary",
"of color", "pansexual", "pinkwash", "playing field", "poc", "privilege*",
"pronoun*", "queer", "queerbaiting", "rac*", "race", "safe space",
"social justice", "spectrum", "structural", "supremac*", "systemic",
"theme house", "toxic", "trigger*", "unearned", "white fragility",
"whitewash", "zero waste")))
start <- Sys.time()
# Counting the number of occurrences of each lexicon term in the cleaner tweets
# We will create a document-feature matrices for that:
toks_tweets <- tokens(tweetext$cleaner_text) # newer quanteda versions require explicit tokenization
tweetdfm1 <- dfm(tokens_lookup(toks_tweets, dictionary = populism.liberalism.lexicon))
tweetdfm2 <- dfm(tokens_lookup(toks_tweets, dictionary = woke.lexicon))
end <- Sys.time()
end-start
## Time difference of 41.661 secs
# Changing to percentages for comparability
tweetdfm1.prop <- dfm_weight(tweetdfm1, scheme = "prop")
tweetdfm2.prop <- dfm_weight(tweetdfm2, scheme = "prop")
# Converting the document-feature matrices to dataframes
tweetdfm_df1 <- convert(tweetdfm1.prop, "data.frame")
tweetdfm_df2 <- convert(tweetdfm2.prop, "data.frame")
# Indexing these new dataframes by their row numbers
tweetdfm_df1$row_num <- seq.int(nrow(tweetdfm_df1))
tweetdfm_df2$row_num <- seq.int(nrow(tweetdfm_df2))
# Joining them to the main tweetext dataframe
tweetext <- tweetext %>%
left_join(tweetdfm_df1)
## Joining, by = "row_num"
tweetext <- tweetext %>%
left_join(tweetdfm_df2)
## Joining, by = c("row_num", "doc_id")
names(tweetext)
## [1] "row_num" "created_at" "text" "clean_text" "cleaner_text"
## [6] "anger" "anticipation" "disgust" "fear" "joy"
## [11] "negative" "positive" "sadness" "surprise" "trust"
## [16] "<NA>" "doc_id" "populism" "liberalism" "woke_term"
# tweetext_bck <- tweetext # Backup dataframe
# Correlations between all sentiments and lexicon-terms
rcorr(as.matrix(tweetext[ , c(6:10, 13:15, 18:20)]))
## anger anticipation disgust fear joy sadness surprise trust
## anger 1.00 0.20 0.58 0.68 0.10 0.59 0.26 0.17
## anticipation 0.20 1.00 0.14 0.23 0.58 0.18 0.27 0.46
## disgust 0.58 0.14 1.00 0.48 0.09 0.52 0.23 0.14
## fear 0.68 0.23 0.48 1.00 0.12 0.60 0.21 0.27
## joy 0.10 0.58 0.09 0.12 1.00 0.12 0.26 0.56
## sadness 0.59 0.18 0.52 0.60 0.12 1.00 0.25 0.15
## surprise 0.26 0.27 0.23 0.21 0.26 0.25 1.00 0.19
## trust 0.17 0.46 0.14 0.27 0.56 0.15 0.19 1.00
## populism 0.06 0.04 0.13 0.07 0.03 0.11 0.04 0.09
## liberalism 0.07 0.06 0.04 0.07 0.08 0.04 0.04 0.15
## woke_term 0.08 0.09 0.05 0.09 0.12 0.13 0.01 0.13
## populism liberalism woke_term
## anger 0.06 0.07 0.08
## anticipation 0.04 0.06 0.09
## disgust 0.13 0.04 0.05
## fear 0.07 0.07 0.09
## joy 0.03 0.08 0.12
## sadness 0.11 0.04 0.13
## surprise 0.04 0.04 0.01
## trust 0.09 0.15 0.13
## populism 1.00 -0.04 0.01
## liberalism -0.04 1.00 0.03
## woke_term 0.01 0.03 1.00
##
## n= 1000000
##
##
## P
## anger anticipation disgust fear joy sadness surprise trust
## anger 0 0 0 0 0 0 0
## anticipation 0 0 0 0 0 0 0
## disgust 0 0 0 0 0 0 0
## fear 0 0 0 0 0 0 0
## joy 0 0 0 0 0 0 0
## sadness 0 0 0 0 0 0 0
## surprise 0 0 0 0 0 0 0
## trust 0 0 0 0 0 0 0
## populism 0 0 0 0 0 0 0 0
## liberalism 0 0 0 0 0 0 0 0
## woke_term 0 0 0 0 0 0 0 0
## populism liberalism woke_term
## anger 0 0 0
## anticipation 0 0 0
## disgust 0 0 0
## fear 0 0 0
## joy 0 0 0
## sadness 0 0 0
## surprise 0 0 0
## trust 0 0 0
## populism 0 0
## liberalism 0 0
## woke_term 0 0
fwrite(tweetext, "cleantweets.csv")
delete(diskf) # Deleting the disk.frame from your secondary memory
rm(list=ls()) # Deleting all objects from primary memory
setup_disk.frame(4)
## The number of workers available for disk.frame is 4
output_path <- file.path(tempdir(), "tw.df")
tw.df <- csv_to_disk.frame("cleantweets.csv") # reading the tweets csv into a diskframe on hard drive
names(tw.df)
## [1] "row_num" "created_at" "text" "clean_text" "cleaner_text"
## [6] "anger" "anticipation" "disgust" "fear" "joy"
## [11] "negative" "positive" "sadness" "surprise" "trust"
## [16] "<NA>" "doc_id" "populism" "liberalism" "woke_term"
political <- tw.df %>%
srckeep(c("woke_term", "liberalism", "populism", "fear", "anger", "joy")) %>%
collect
# Calculating measures of central tendency and dispersion
tw.df %>%
srckeep("joy") %>%
summarise(mean(joy, na.rm = T)) %>%
collect
## mean(joy, na.rm = T)
## 1 0.33628
tw.df %>%
srckeep("anger") %>%
summarise(median(anger, na.rm = T)) %>%
collect
## median(anger, na.rm = T)
## 1 0
tw.df %>%
srckeep("fear") %>%
summarise(sd(fear, na.rm = T)) %>%
collect
## sd(fear, na.rm = T)
## 1 1.073618
tw.df %>%
srckeep("fear") %>%
summarise(var(fear, na.rm = T)) %>%
collect
## var(fear, na.rm = T)
## 1 1.152655
tw.df %>%
srckeep("fear") %>%
summarise(max(fear, na.rm = T)) %>% # Maximum
collect
## max(fear, na.rm = T)
## 1 14
tw.df %>%
srckeep("anger") %>%
summarise(max(anger, na.rm = T)) %>% # Maximum
collect
## max(anger, na.rm = T)
## 1 14
# Frequency tables
freq(political$woke_term)
## Frequencies
## political$woke_term
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- --------- --------- -------------- --------- --------------
## 0 930950 93.09 93.09 93.09 93.09
## 1 69050 6.91 100.00 6.91 100.00
## <NA> 0 0.00 100.00
## Total 1000000 100.00 100.00 100.00 100.00
freq(political$liberalism)
## Frequencies
## political$liberalism
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------------------- --------- --------- -------------- --------- --------------
## 0 926810 92.681 92.681 92.681 92.681
## 0.25 30 0.003 92.684 0.003 92.684
## 0.333333333333333 250 0.025 92.709 0.025 92.709
## 0.5 2740 0.274 92.983 0.274 92.983
## 0.666666666666667 240 0.024 93.007 0.024 93.007
## 0.75 20 0.002 93.009 0.002 93.009
## 1 69910 6.991 100.000 6.991 100.000
## <NA> 0 0.000 100.000
## Total 1000000 100.000 100.000 100.000 100.000
freq(political$populism)
## Frequencies
## political$populism
## Type: Numeric
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------------------- --------- --------- -------------- --------- --------------
## 0 964590 96.459 96.459 96.459 96.459
## 0.25 20 0.002 96.461 0.002 96.461
## 0.333333333333333 240 0.024 96.485 0.024 96.485
## 0.5 2740 0.274 96.759 0.274 96.759
## 0.666666666666667 250 0.025 96.784 0.025 96.784
## 0.75 30 0.003 96.787 0.003 96.787
## 1 32130 3.213 100.000 3.213 100.000
## <NA> 0 0.000 100.000
## Total 1000000 100.000 100.000 100.000 100.000
freq(political$fear)
## Frequencies
## political$fear
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- --------- --------- -------------- --------- --------------
## 0 500260 50.026 50.026 50.026 50.026
## 1 291750 29.175 79.201 29.175 79.201
## 2 131720 13.172 92.373 13.172 92.373
## 3 48330 4.833 97.206 4.833 97.206
## 4 18710 1.871 99.077 1.871 99.077
## 5 6430 0.643 99.720 0.643 99.720
## 6 1880 0.188 99.908 0.188 99.908
## 7 620 0.062 99.970 0.062 99.970
## 8 210 0.021 99.991 0.021 99.991
## 9 70 0.007 99.998 0.007 99.998
## 10 10 0.001 99.999 0.001 99.999
## 14 10 0.001 100.000 0.001 100.000
## <NA> 0 0.000 100.000
## Total 1000000 100.000 100.000 100.000 100.000
freq(political$anger)
## Frequencies
## political$anger
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- --------- --------- -------------- --------- --------------
## 0 520720 52.072 52.072 52.072 52.072
## 1 290670 29.067 81.139 29.067 81.139
## 2 121070 12.107 93.246 12.107 93.246
## 3 44830 4.483 97.729 4.483 97.729
## 4 15830 1.583 99.312 1.583 99.312
## 5 4900 0.490 99.802 0.490 99.802
## 6 1400 0.140 99.942 0.140 99.942
## 7 440 0.044 99.986 0.044 99.986
## 8 60 0.006 99.992 0.006 99.992
## 9 70 0.007 99.999 0.007 99.999
## 14 10 0.001 100.000 0.001 100.000
## <NA> 0 0.000 100.000
## Total 1000000 100.000 100.000 100.000 100.000
freq(political$joy)
## Frequencies
## political$joy
## Type: Integer
##
## Freq % Valid % Valid Cum. % Total % Total Cum.
## ----------- --------- --------- -------------- --------- --------------
## 0 742780 74.278 74.278 74.278 74.278
## 1 197010 19.701 93.979 19.701 93.979
## 2 46530 4.653 98.632 4.653 98.632
## 3 9970 0.997 99.629 0.997 99.629
## 4 2700 0.270 99.899 0.270 99.899
## 5 740 0.074 99.973 0.074 99.973
## 6 190 0.019 99.992 0.019 99.992
## 7 50 0.005 99.997 0.005 99.997
## 9 10 0.001 99.998 0.001 99.998
## 10 10 0.001 99.999 0.001 99.999
## 12 10 0.001 100.000 0.001 100.000
## <NA> 0 0.000 100.000
## Total 1000000 100.000 100.000 100.000 100.000
# Recode into dummies (NB: the coding is inverted - 0 marks tweets that do
# contain lexicon terms, 1 marks tweets that do not; keep this in mind when
# reading the crosstabs below)
political$lib_d <- case_when(political$liberalism > 0 ~ 0, TRUE ~ 1)
political$pop_d <- case_when(political$populism > 0 ~ 0, TRUE ~ 1)
# Crosstabs for categorical variables
CrossTable(political$woke_term, political$lib_d, digits = 3)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000000
##
##
## | political$lib_d
## political$woke_term | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## 0 | 66430 | 864520 | 930950 |
## | 42.726 | 3.374 | |
## | 0.071 | 0.929 | 0.931 |
## | 0.908 | 0.933 | |
## | 0.066 | 0.865 | |
## --------------------|-----------|-----------|-----------|
## 1 | 6760 | 62290 | 69050 |
## | 576.050 | 45.491 | |
## | 0.098 | 0.902 | 0.069 |
## | 0.092 | 0.067 | |
## | 0.007 | 0.062 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 73190 | 926810 | 1000000 |
## | 0.073 | 0.927 | |
## --------------------|-----------|-----------|-----------|
##
##
CrossTable(political$woke_term, political$pop_d, digits = 3)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000000
##
##
## | political$pop_d
## political$woke_term | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## 0 | 32670 | 898280 | 930950 |
## | 2.639 | 0.097 | |
## | 0.035 | 0.965 | 0.931 |
## | 0.923 | 0.931 | |
## | 0.033 | 0.898 | |
## --------------------|-----------|-----------|-----------|
## 1 | 2740 | 66310 | 69050 |
## | 35.578 | 1.306 | |
## | 0.040 | 0.960 | 0.069 |
## | 0.077 | 0.069 | |
## | 0.003 | 0.066 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 35410 | 964590 | 1000000 |
## | 0.035 | 0.965 | |
## --------------------|-----------|-----------|-----------|
##
##
CrossTable(political$pop_d, political$lib_d, digits = 3)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000000
##
##
## | political$lib_d
## political$pop_d | 0 | 1 | Row Total |
## ----------------|-----------|-----------|-----------|
## 0 | 3280 | 32130 | 35410 |
## | 182.823 | 14.438 | |
## | 0.093 | 0.907 | 0.035 |
## | 0.045 | 0.035 | |
## | 0.003 | 0.032 | |
## ----------------|-----------|-----------|-----------|
## 1 | 69910 | 894680 | 964590 |
## | 6.711 | 0.530 | |
## | 0.072 | 0.928 | 0.965 |
## | 0.955 | 0.965 | |
## | 0.070 | 0.895 | |
## ----------------|-----------|-----------|-----------|
## Column Total | 73190 | 926810 | 1000000 |
## | 0.073 | 0.927 | |
## ----------------|-----------|-----------|-----------|
##
##
# Continuous variables broken down by group
describeBy(political$joy, group = political$woke_term)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 930950 0.31 0.63 0 0.18 0 0 12 12 2.61 10.58 0
## ------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 69050 0.62 0.87 0 0.47 0 0 6 6 1.63 3.25 0
describeBy(political$anger, group = political$pop_d)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 35410 1.1 1.18 1 0.92 1.48 0 9 9 1.33 2.58 0.01
## ------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 964590 0.76 1.02 0 0.57 0 0 14 14 1.68 3.77 0
describeBy(political$fear, group = political$lib_d)
##
## Descriptive statistics by group
## group: 0
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 73190 1.1 1.2 1 0.91 1.48 0 8 8 1.25 1.69 0
## ------------------------------------------------------------
## group: 1
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 926810 0.8 1.06 0 0.61 0 0 14 14 1.7 3.95 0
# Bar plots
par(mfrow=c(1,3))
barplot(table(political$woke_term), col = "red")
barplot(table(political$populism), col = "blue")
barplot(table(political$liberalism), col = "green")
par(mfrow=c(1,1))
# Histograms
par(mfrow=c(1,2))
hist(political$anger, col = "red")
hist(political$joy, col = "green")
par(mfrow=c(1,1))
# Boxplot
boxplot(political$joy, political$fear, political$anger,
main = "Average spending in $US",
names = c("Joy", "Fear", "Anger"),
col = c("orange", "purple", "green"),
ylim = c(0, 15))
# Scatterplot with fitted regression line
plot(political$populism, political$fear,
main = "Fear as a function of populism?",
xlab = "Populist discourse", ylab = "Fear",
frame = FALSE,
col = "blue")
abline(lm(fear ~ populism, data = political), lwd = 4, col = "red")
# One sample Chi-sq.
emotions_obs <- c(mean(political$fear), mean(political$anger), mean(political$joy))
barplot(emotions_obs, xlab="Emotion", main="Emotional discourse", col = "coral", names.arg = c("Fear", "Anger", "Joy"))
# Defining two theoretical expectations:
emotions_exp1 <- c(0.5, 0.3, 0.2)
emotions_exp2 <- c(0.3, 0.1, 0.6)
# Testing whether our data fits the expectations:
chisq.test(emotions_obs, p=emotions_exp1)
##
## Chi-squared test for given probabilities
##
## data: emotions_obs
## X-squared = 0.088237, df = 2, p-value = 0.9568
chisq.test(emotions_obs, p=emotions_exp2)
##
## Chi-squared test for given probabilities
##
## data: emotions_obs
## X-squared = 2.4008, df = 2, p-value = 0.3011
# One sample t-test
t.test(political$anger, mu = 0.765)
##
## One Sample t-test
##
## data: political$anger
## t = 2.7839, df = 1e+06, p-value = 0.005371
## alternative hypothesis: true mean is not equal to 0.765
## 95 percent confidence interval:
## 0.7658435 0.7698565
## sample estimates:
## mean of x
## 0.76785
t.test(political$fear, mu = 0.765)
##
## One Sample t-test
##
## data: political$fear
## t = 56.202, df = 1e+06, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.765
## 95 percent confidence interval:
## 0.8232357 0.8274443
## sample estimates:
## mean of x
## 0.82534
t.test(political$joy, mu = 0.765)
##
## One Sample t-test
##
## data: political$joy
## t = -650.76, df = 1e+06, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0.765
## 95 percent confidence interval:
## 0.3349888 0.3375712
## sample estimates:
## mean of x
## 0.33628
# Measuring associations: categorical
CrossTable(political$woke_term, political$pop_d, expected = T) # statistically dependent
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Expected N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1000000
##
##
## | political$pop_d
## political$woke_term | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## 0 | 32670 | 898280 | 930950 |
## | 32964.940 | 897985.061 | |
## | 2.639 | 0.097 | |
## | 0.035 | 0.965 | 0.931 |
## | 0.923 | 0.931 | |
## | 0.033 | 0.898 | |
## --------------------|-----------|-----------|-----------|
## 1 | 2740 | 66310 | 69050 |
## | 2445.061 | 66604.939 | |
## | 35.578 | 1.306 | |
## | 0.040 | 0.960 | 0.069 |
## | 0.077 | 0.069 | |
## | 0.003 | 0.066 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 35410 | 964590 | 1000000 |
## | 0.035 | 0.965 | |
## --------------------|-----------|-----------|-----------|
##
##
## Statistics for All Table Factors
##
##
## Pearson's Chi-squared test
## ------------------------------------------------------------
## Chi^2 = 39.61933 d.f. = 1 p = 3.08613e-10
##
## Pearson's Chi-squared test with Yates' continuity correction
## ------------------------------------------------------------
## Chi^2 = 39.48512 d.f. = 1 p = 3.305694e-10
##
##
phi(table(political$woke_term, political$pop_d), digits = 4)
## [1] -0.0063
# Correlations
cor.test(x=political$anger, y=political$fear, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: political$anger and political$fear
## S = 5.8875e+16, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.6467492
cor.test(x=political$anger, y=political$fear, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: political$anger and political$fear
## t = 921.39, df = 999998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6765493 0.6786693
## sample estimates:
## cor
## 0.6776107
lm1 <- lm(political$fear ~ political$anger)
lm2 <- lm(political$fear ~ political$anger + political$populism)
lm3 <- lm(political$fear ~ political$anger + political$populism * political$joy)
lm4 <- lm.beta(lm3)
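Since lm1, lm2 and lm3 are nested, they can also be compared formally with an incremental F-test; a sketch:
# anova(lm1, lm2, lm3) # does each added term significantly improve the fit?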
summary(lm1)
##
## Call:
## lm(formula = political$fear ~ political$anger)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.8328 -0.2797 -0.2797 0.2991 9.7203
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2796853 0.0009870 283.4 <2e-16 ***
## political$anger 0.7106267 0.0007713 921.4 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7896 on 999998 degrees of freedom
## Multiple R-squared: 0.4592, Adjusted R-squared: 0.4592
## F-statistic: 8.49e+05 on 1 and 999998 DF, p-value: < 2.2e-16
summary(lm2)
##
## Call:
## lm(formula = political$fear ~ political$anger + political$populism)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9897 -0.2753 -0.2753 0.3070 9.7247
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2753166 0.0009928 277.32 <2e-16 ***
## political$anger 0.7088259 0.0007721 918.04 <2e-16 ***
## political$populism 0.1702928 0.0044304 38.44 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.789 on 999997 degrees of freedom
## Multiple R-squared: 0.46, Adjusted R-squared: 0.46
## F-statistic: 4.258e+05 on 2 and 999997 DF, p-value: < 2.2e-16
summary(lm3)
##
## Call:
## lm(formula = political$fear ~ political$anger + political$populism *
## political$joy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9265 -0.3331 -0.2522 0.3401 9.7478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2522212 0.0010519 239.787 < 2e-16 ***
## political$anger 0.7038352 0.0007738 909.543 < 2e-16 ***
## political$populism 0.1551010 0.0051157 30.319 < 2e-16 ***
## political$joy 0.0808887 0.0012306 65.730 < 2e-16 ***
## political$populism:political$joy 0.0156735 0.0058338 2.687 0.00722 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7872 on 999995 degrees of freedom
## Multiple R-squared: 0.4624, Adjusted R-squared: 0.4624
## F-statistic: 2.151e+05 on 4 and 999995 DF, p-value: < 2.2e-16
summary(lm4)
##
## Call:
## lm(formula = political$fear ~ political$anger + political$populism *
## political$joy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9265 -0.3331 -0.2522 0.3401 9.7478
##
## Coefficients:
## Estimate Standardized Std. Error t value
## (Intercept) 0.2522212 0.0000000 0.0010519 239.787
## political$anger 0.7038352 0.6711347 0.0007738 909.543
## political$populism 0.1551010 0.0257742 0.0051157 30.319
## political$joy 0.0808887 0.0496354 0.0012306 65.730
## political$populism:political$joy 0.0156735 0.0023298 0.0058338 2.687
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## political$anger < 2e-16 ***
## political$populism < 2e-16 ***
## political$joy < 2e-16 ***
## political$populism:political$joy 0.00722 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7872 on 999995 degrees of freedom
## Multiple R-squared: 0.4624, Adjusted R-squared: 0.4624
## F-statistic: 2.151e+05 on 4 and 999995 DF, p-value: < 2.2e-16