Make a new column based on subsetting or grouping the original data.
Use string searches to help with this.
# Load one CSV export per hashtag. Each read fills two objects at once: the
# un-suffixed name keeps the data as read, and the "1"-suffixed name is the
# working copy that the later steps shown in this file modify.
Fosen1 <- Fosen <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Fosen-tweets.csv"
)
SamiRights1 <- SamiRights <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/SamiRights-tweets.csv"
)
StandwithSápmi1 <- StandwithSápmi <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/StandWithSápmi-tweets.csv"
)
Sami1 <- Sami <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/Sami-tweets.csv"
)
Sápmi1 <- Sápmi <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/sápmi-tweets.csv"
)
# Note: the WindMills table is read from the "WindFarms" export.
WindMills1 <- WindMills <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/WindFarms-tweets.csv"
)
IndigenousPeoples1 <- IndigenousPeoples <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/IndigenousPeoples-tweets.csv"
)
Samilandrights1 <- Samilandrights <- read.csv(
  "/Users/ellievanhattem/Desktop/Anthro630/Twitter_Hashtags_ANTH630/saamilandrights-tweets.csv"
)
#str(Samilandrights1)
#str(IndigenousPeoples1)
#str(WindMills1)
#str(Sápmi1)
#str(StandwithSápmi1)
#str(SamiRights1)
#str(Fosen1)
# Split the combined "date" column of every tweet table into a calendar date
# ("date1") and a clock time ("time").
#
# tidyr::separate() is called directly instead of through `%>%`, so this step
# does not depend on a pipe-providing package having been attached earlier.
# separate() drops the original "date" column by default (remove = TRUE), so
# no separate removal step is needed afterwards.
split_date_time <- function(tweets) {
  # A numeric `sep` splits by character position: characters 1-11 become
  # "date1" and everything after them becomes "time".
  tweets <- tidyr::separate(
    tweets, date,
    into = c("date1", "time"), sep = 11
  )
  # Character 11 is the separator between date and time, so it ends up at the
  # tail of "date1"; strip it when it is whitespace.
  # NOTE(review): assumes timestamps look like "YYYY-MM-DD HH:MM:SS..." --
  # confirm against the CSV exports.
  tweets$date1 <- trimws(tweets$date1)
  # Keep only the first 8 characters of the time part (HH:MM:SS under the
  # assumption above), discarding whatever follows.
  tweets$time <- substr(tweets$time, 1, 8)
  tweets
}

Fosen1 <- split_date_time(Fosen1) # Tag 1
SamiRights1 <- split_date_time(SamiRights1) # Tag 2
StandwithSápmi1 <- split_date_time(StandwithSápmi1) # Tag 3
Sami1 <- split_date_time(Sami1) # Tag 4
Sápmi1 <- split_date_time(Sápmi1) # Tag 5
WindMills1 <- split_date_time(WindMills1) # Tag 6
IndigenousPeoples1 <- split_date_time(IndigenousPeoples1) # Tag 7
Samilandrights1 <- split_date_time(Samilandrights1) # Tag 8
Convert the ‘content’ text column to lowercase
# Lower-case the tweet text of every working table, in place.
Fosen1[["content"]] <- tolower(Fosen1[["content"]])
Sami1[["content"]] <- tolower(Sami1[["content"]])
Samilandrights1[["content"]] <- tolower(Samilandrights1[["content"]])
SamiRights1[["content"]] <- tolower(SamiRights1[["content"]])
IndigenousPeoples1[["content"]] <- tolower(IndigenousPeoples1[["content"]])
WindMills1[["content"]] <- tolower(WindMills1[["content"]])
Sápmi1[["content"]] <- tolower(Sápmi1[["content"]])
StandwithSápmi1[["content"]] <- tolower(StandwithSápmi1[["content"]])
# Keywords to search for in the tweet text (a mix of Norwegian and English
# terms). Their order matters wherever the first matching keyword is used.
keywords <- c(
  "statsministeren", "politiet", "protestors", "protesting", "windmillparks",
  "vindmølleparker", "distriktspolitiet", "solidarity", "vindmøllene",
  "regjeringen", "government"
)
# Country, place and nationality terms to search for in the tweets.
# "Finnish" replaces the earlier misspelling "Finish", which would have
# matched the unrelated word "finish" instead of the nationality.
country <- c(
  "Finland", "America", "Norway", "Sweden", "Fosen",
  "Norwegian", "American", "Swedish", "Finnish"
)
# Build a logical matrix with one row per text and one column per pattern;
# a cell is TRUE when the pattern occurs in the text (case-insensitive).
#
# vapply() plus an explicit matrix() always yields a matrix with named
# columns, including for tables with zero rows or a single row, where
# sapply() would return a list or a plain vector instead.
match_keywords <- function(text, patterns) {
  hits <- vapply(
    patterns,
    function(pattern) grepl(pattern, text, ignore.case = TRUE),
    logical(length(text))
  )
  matrix(
    hits,
    nrow = length(text), ncol = length(patterns),
    dimnames = list(NULL, patterns)
  )
}

# Turn a match matrix into one topic label per row: the first keyword (in
# column order) that matched, or "Other" when none did.
#
# max.col() breaks ties at random by default, so a tweet matching several
# keywords would get a different topic from run to run; ties.method = "first"
# makes the label deterministic.
label_topic <- function(matches) {
  topic <- rep("Other", nrow(matches))
  has_match <- rowSums(matches) > 0
  first_hit <- max.col(matches, ties.method = "first")
  topic[has_match] <- colnames(matches)[first_hit[has_match]]
  topic
}

# Create a topic column for each tagged table. The match matrices are kept
# under their own names because later steps reuse them.
matches_fosen <- match_keywords(Fosen1$content, keywords)
Fosen1$topic <- label_topic(matches_fosen)

matches_sami <- match_keywords(Sami1$content, keywords)
Sami1$topic <- label_topic(matches_sami)

matches_samilandrights <- match_keywords(Samilandrights1$content, keywords)
Samilandrights1$topic <- label_topic(matches_samilandrights)

matches_samirights <- match_keywords(SamiRights1$content, keywords)
SamiRights1$topic <- label_topic(matches_samirights)

matches_indigenous <- match_keywords(IndigenousPeoples1$content, keywords)
IndigenousPeoples1$topic <- label_topic(matches_indigenous)

matches_windmills <- match_keywords(WindMills1$content, keywords)
WindMills1$topic <- label_topic(matches_windmills)

matches_sapmi <- match_keywords(Sápmi1$content, keywords)
Sápmi1$topic <- label_topic(matches_sapmi)

matches_standwithsapmi <- match_keywords(StandwithSápmi1$content, keywords)
StandwithSápmi1$topic <- label_topic(matches_standwithsapmi)
Pivot all or part of the dataframe into either wide or long
format.
# Use pivot_wider to convert each tagged table to wide format: one column per
# topic value, filled with the `username` values.
# Each *_wide table is built from its own source table; previously every one
# of them was pivoted from Fosen1, so all eight results were copies of the
# Fosen data.
Fosen1_wide <- tidyr::pivot_wider(
  Fosen1,
  names_from = "topic", values_from = "username"
)
Sami1_wide <- tidyr::pivot_wider(
  Sami1,
  names_from = "topic", values_from = "username"
)
Samilandrights1_wide <- tidyr::pivot_wider(
  Samilandrights1,
  names_from = "topic", values_from = "username"
)
SamiRights1_wide <- tidyr::pivot_wider(
  SamiRights1,
  names_from = "topic", values_from = "username"
)
IndigenousPeoples1_wide <- tidyr::pivot_wider(
  IndigenousPeoples1,
  names_from = "topic", values_from = "username"
)
WindMills1_wide <- tidyr::pivot_wider(
  WindMills1,
  names_from = "topic", values_from = "username"
)
Sápmi1_wide <- tidyr::pivot_wider(
  Sápmi1,
  names_from = "topic", values_from = "username"
)
StandwithSápmi1_wide <- tidyr::pivot_wider(
  StandwithSápmi1,
  names_from = "topic", values_from = "username"
)
Create a custom stopword list and augment it with the existing Norwegian
stopword list
# knitr::opts_chunk$set(echo = TRUE) # Using this argument did not help present the data in nice columns like the ones shown in the markdown before knitting.
library(tidytext)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Hand-picked English stop words, plus URL fragments found in tweet text.
my_stopwords_en <- c("the", "and", "or", "but", "a", "an", "to", "of", "in", "is", "this", "that", "it", "with", "for", "https", "http", "t.co")
# Norwegian stop words supplied by the stopwords package.
no_stopwords <- stopwords::stopwords(language = "norwegian")

# Wrap each list in a data frame tagged with its language and lexicon name.
my_stopwords_en_df <- mutate(
  data.frame(word = my_stopwords_en, language = "en"),
  lexicon = "my_stopwords_en"
)
no_stopwords_df <- mutate(
  data.frame(word = no_stopwords, lexicon = "no_stopwords"),
  language = "no"
)

# Stack the two tables; rbind() lines data-frame columns up by name, so the
# differing column order of the two inputs is not a problem.
combined_stopwords <- rbind(my_stopwords_en_df, no_stopwords_df)
# Preview the first rows of the combined list.
head(combined_stopwords)
## word language lexicon
## 1 the en my_stopwords_en
## 2 and en my_stopwords_en
## 3 or en my_stopwords_en
## 4 but en my_stopwords_en
## 5 a en my_stopwords_en
## 6 an en my_stopwords_en
library(dplyr)
library(tidytext)
# Convert each keyword match matrix to a data frame (one logical column per
# keyword).
fosen_df <- as.data.frame(matches_fosen)
sami_df <- as.data.frame(matches_sami)
samilandrights_df <- as.data.frame(matches_samilandrights)
samirights_df <- as.data.frame(matches_samirights)
indigenous_df <- as.data.frame(matches_indigenous)
windmills_df <- as.data.frame(matches_windmills)
sapmi_df <- as.data.frame(matches_sapmi)
standwithsapmi_df <- as.data.frame(matches_standwithsapmi)

# Reshape one match table into tidy (long) form.
#
# match_df:      data frame with one column per keyword.
# dataset_label: value stored in the "dataset" column of every output row.
#
# Returns one row per (input row, keyword) pair with the columns:
#   word    - the keyword (former column name)
#   count   - the cell value from match_df (TRUE/FALSE for the match tables)
#   lexicon - "combined_stopwords" if the keyword is itself a stop word,
#             otherwise "none"
#   dataset - dataset_label
tidy_keyword_matches <- function(match_df, dataset_label) {
  # tidyr is referenced with `::` because it is never attached with library()
  # in the code shown here.
  long <- tidyr::pivot_longer(
    match_df,
    cols = everything(), names_to = "word", values_to = "count"
  )
  # Membership is tested against the `word` column. Testing against the whole
  # combined_stopwords data frame compares each keyword with entire columns
  # and therefore never matches.
  mutate(
    long,
    lexicon = ifelse(
      word %in% combined_stopwords$word, "combined_stopwords", "none"
    ),
    dataset = dataset_label
  )
}

# Reshape the data into a tidy format. The SamiRights label now follows the
# same "matches_<name>" pattern as the others (it was "samirights_fosen").
fosen_tidy <- tidy_keyword_matches(fosen_df, "matches_fosen")
sami_tidy <- tidy_keyword_matches(sami_df, "matches_sami")
samilandrights_tidy <- tidy_keyword_matches(
  samilandrights_df, "matches_samilandrights"
)
samirights_tidy <- tidy_keyword_matches(samirights_df, "matches_samirights")
indigenous_tidy <- tidy_keyword_matches(indigenous_df, "matches_indigenous")
windmills_tidy <- tidy_keyword_matches(windmills_df, "matches_windmills")
sapmi_tidy <- tidy_keyword_matches(sapmi_df, "matches_sapmi")
standwithsapmi_tidy <- tidy_keyword_matches(
  standwithsapmi_df, "matches_standwithsapmi"
)
Remove stop words from column “content”
library(dplyr)
# Tokenise the "content" column into one word per row (column "word") and
# drop every row whose word appears in combined_stopwords.
#
# The join key is given explicitly: with an implicit key, anti_join() joins on
# every column name the two tables share, so a tweet table that happened to
# contain a "language" or "lexicon" column would silently change which rows
# are removed. An explicit key also silences the "Joining with ..." message.
remove_stopwords <- function(tweets) {
  tokens <- unnest_tokens(tweets, word, content)
  anti_join(tokens, combined_stopwords, by = "word")
}

# Remove stop words from the content column of each tagged table.
fosen1_tidy <- remove_stopwords(Fosen1)
sami1_tidy <- remove_stopwords(Sami1)
samilandrights1_tidy <- remove_stopwords(Samilandrights1)
samirights1_tidy <- remove_stopwords(SamiRights1)
indigenouspeoples1_tidy <- remove_stopwords(IndigenousPeoples1)
windmills1_tidy <- remove_stopwords(WindMills1)
sápmi1_tidy <- remove_stopwords(Sápmi1)
StandwithSápmi1_tidy <- remove_stopwords(StandwithSápmi1)
library(ggplot2)
library(dplyr)
# Tag every tokenised table with the name of the dataset it came from, so the
# rows can still be told apart after the tables are stacked.
samirights1_tidy <- mutate(samirights1_tidy, dataset = "SamiRights")
StandwithSápmi1_tidy <- mutate(StandwithSápmi1_tidy, dataset = "StandwithSápmi")
sami1_tidy <- mutate(sami1_tidy, dataset = "Sami")
sápmi1_tidy <- mutate(sápmi1_tidy, dataset = "Sápmi")
windmills1_tidy <- mutate(windmills1_tidy, dataset = "WindMills")
indigenouspeoples1_tidy <- mutate(
  indigenouspeoples1_tidy,
  dataset = "IndigenousPeoples"
)
samilandrights1_tidy <- mutate(samilandrights1_tidy, dataset = "Samilandrights")
fosen1_tidy <- mutate(fosen1_tidy, dataset = "Fosen")

# Stack all eight tables into one data frame (one row per remaining word).
all_tweets <- bind_rows(
  samilandrights1_tidy,
  indigenouspeoples1_tidy,
  windmills1_tidy,
  sápmi1_tidy,
  StandwithSápmi1_tidy,
  samirights1_tidy,
  fosen1_tidy,
  sami1_tidy
)
# Fix the random seed so the sample below is the same on every run.
set.seed(123)
# Draw up to 100 rows from each dataset; a dataset with fewer than 100 rows
# contributes all of its rows. The result stays grouped by `dataset`.
sample_tweets <- sample_n(
  group_by(all_tweets, dataset),
  size = min(100, n())
)