EQ1: In what ways, and to what extent, has the Learning Differences program impacted three distinct participant groups: students, parents, and educators over time?
EQ2: What components of the Learning Differences program are most efficacious over time?
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.1.3
library(textdata)
## Warning: package 'textdata' was built under R version 4.1.3
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(wordcloud2)
## Warning: package 'wordcloud2' was built under R version 4.1.3
library(SnowballC)
library(topicmodels)
library(stm)
## stm v1.3.6 successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
library(ldatuning)
library(knitr)
library(LDAvis)
Data was imported from the oakcourses_forumposts.csv source file, filtered to a single course (course_id 8),
and selected for the unit number, post content and poster.
# Columns removed from the raw export before any further processing.
unused_cols <- c(
  "user_email", "username", "user_firstname", "user_lastname",
  "course_shortname", "course_id", "course_name", "unit_name"
)

# Read the forum-post export, keep only rows for course 8, then drop the
# columns listed above (course_id is dropped last, after it has been used
# for filtering).
raw_summer15 <- read.csv("data/oakcourses_forumposts.csv") |>
  filter(course_id == 8) |>
  select(!all_of(unused_cols))
50 entries were randomly selected from each unit, then stitched back together into a single document. This was exported as a .csv to Dion for his analysis.
# Fix the RNG state so the random draws below are repeatable.
set.seed(2015)

# Forum ids for Units 1-6, listed in unit order. The order matters: the
# samples are drawn one after another from the same RNG stream.
unit_forum_ids <- c(102, 104, 108, 114, 116, 132)

# Draw 50 random posts from the forum with the given id.
sample_unit_posts <- function(target_forum_id) {
  raw_summer15 |>
    filter(forum_id == target_forum_id) |>
    sample_n(50)
}

#recombine into single dataframe
summer15_sample <- do.call(rbind, lapply(unit_forum_ids, sample_unit_posts))

# Clean up the temporaries; only summer15_sample is needed from here on.
rm(unit_forum_ids, sample_unit_posts)
To prepare the text for tokenizing, HTML tags were first stripped from these samples. Then the text was tokenized, filtered for stop words, then relabeled to include a title for each unit (rather than the code number).
#strip HTML tags
# Each tag is replaced with a single space instead of being deleted outright.
# Deleting it fuses words that are separated only by markup
# (e.g. "first</p><p>second" would become the token "firstsecond").
# The extra whitespace is harmless because the tokenizer splits on it.
summer15_sample$post_content <- gsub("<[^>]+>", " ", summer15_sample$post_content)

#tokenize & stop
# One row per (post, word): split post_content into word tokens, then drop
# every token that appears in the stop_words lexicon.
summer15_tidy <- summer15_sample |>
  select(post_content, post_id, forum_name) |>
  unnest_tokens(output = word, input = post_content) |>
  anti_join(stop_words, by = "word") |>
  select(forum_name, post_id, word)
#custom stops
# Word frequencies over the whole sample, most frequent first.
summer15_top_n <- count(summer15_tidy, word, sort = TRUE)

# Extra stop words on top of the standard lexicon: teacher/student terms,
# the first names that appear in the unit titles (with possessive variants),
# and "nbsp".
custom_stops <- data.frame(
  word = c(
    "teacher", "teachers", "student", "students",
    "molly", "molly's",
    "travis", "travis's", "travis'",
    "matt", "matt's",
    "wyatt", "wyatt's",
    "nbsp"
  )
)

# Remove every token that matches a custom stop word.
summer15_tidy <- summer15_tidy |>
  anti_join(custom_stops, by = "word")
#relabel by unit name
# Swap each forum title for its unit label. The replacements run top to
# bottom inside one mutate(), each one seeing the result of the previous.
summer15_tidy <- mutate(
  summer15_tidy,
  forum_name = str_replace(forum_name, "Discuss: The Myth of Average", "Unit 1"),
  forum_name = str_replace(forum_name, "Discuss: Molly's Story", "Unit 2"),
  forum_name = str_replace(forum_name, "Discuss: Matt's Story", "Unit 3"),
  forum_name = str_replace(forum_name, "Discuss: Travis's Story", "Unit 4"),
  forum_name = str_replace(forum_name, "Discuss: From Your Student's Eyes", "Unit 5"),
  forum_name = str_replace(forum_name, "Discuss: Wyatt's Story", "Unit 6")
)
Word clouds can be generated for individual units or for the entire
course. Adding filter(forum_name == "Unit X") to the pipeline
restricts the cloud to a specific unit.
#entire course
# Tally every remaining token, order by frequency, and keep the first 50 rows
# as the word-cloud input (columns: word, n).
word_counts <- count(summer15_tidy, word, sort = TRUE)
summer15cloud <- slice_head(word_counts, n = 50)
rm(word_counts)

wordcloud2(summer15cloud)
A Latent Dirichlet Allocation (LDA) analysis revealed the following terms
grouped by topic, again for the entire course as well as by unit.
First, we cast each sample into a document-term matrix and used
FindTopicsNumber to identify k, the number of topics that scores
best on the chosen metric, to request from the LDA algorithm:
# Build a post-by-word document-term matrix from tidy tokens.
#   tokens     : data frame with columns forum_name, post_id and word.
#   unit_label : NULL to use every post, or a forum_name value (e.g. "Unit 1")
#                to restrict the matrix to that unit.
cast_post_dtm <- function(tokens, unit_label = NULL) {
  if (!is.null(unit_label)) {
    tokens <- filter(tokens, forum_name == unit_label)
  }
  tokens |>
    count(post_id, word) |>
    cast_dtm(post_id, word, n)
}

# Score every candidate topic count from 5 to 20 for one document-term matrix
# using the Griffiths2004 metric with Gibbs-sampled models.
#   dtm  : document-term matrix to evaluate.
#   seed : seed handed to the sampler. Without it each run produces a
#          different metric curve, so the k read off the plot could not be
#          reproduced.
score_topic_counts <- function(dtm, seed = 2015) {
  FindTopicsNumber(
    dtm,
    topics = seq(5, 20, by = 1),
    metrics = "Griffiths2004",
    method = "Gibbs",
    control = list(seed = seed),
    mc.cores = NA,
    return_models = FALSE,
    verbose = FALSE,
    libpath = NULL
  )
}

#entire course
summer15_dtm <- cast_post_dtm(summer15_tidy)
k_metrics <- score_topic_counts(summer15_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 1
unit1_dtm <- cast_post_dtm(summer15_tidy, "Unit 1")
k_metrics <- score_topic_counts(unit1_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 2
unit2_dtm <- cast_post_dtm(summer15_tidy, "Unit 2")
k_metrics <- score_topic_counts(unit2_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 3
unit3_dtm <- cast_post_dtm(summer15_tidy, "Unit 3")
k_metrics <- score_topic_counts(unit3_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 4
unit4_dtm <- cast_post_dtm(summer15_tidy, "Unit 4")
k_metrics <- score_topic_counts(unit4_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 5
unit5_dtm <- cast_post_dtm(summer15_tidy, "Unit 5")
k_metrics <- score_topic_counts(unit5_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
#unit 6
unit6_dtm <- cast_post_dtm(summer15_tidy, "Unit 6")
k_metrics <- score_topic_counts(unit6_dtm)
FindTopicsNumber_plot(k_metrics)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
Having identified the value of k that scores best (highest on the y axis) for each unit’s sample, we then applied an LDA to each unit and plotted the resulting terms to a faceted bar chart:
#LDA
# Fit an LDA model with k topics to a document-term matrix and return the
# model in tidy form (one row per topic/term pair with its beta).
#   dtm  : document-term matrix.
#   k    : number of topics to fit.
#   seed : seed passed through the model's control list.
# NOTE(review): k was read off metrics computed with method = "Gibbs", while
# LDA() is called here without a method argument -- confirm that the fitting
# method used here is the intended one.
fit_tidy_lda <- function(dtm, k, seed = 2015) {
  LDA(dtm, k = k, control = list(seed = seed)) |>
    tidy()
}

# Keep the n_terms highest-beta terms of every topic (ties broken by row
# order), sorted by topic and then by descending beta.
top_lda_terms <- function(lda_terms, n_terms = 5) {
  lda_terms |>
    group_by(topic) |>
    slice_max(beta, n = n_terms, with_ties = FALSE) |>
    ungroup() |>
    arrange(topic, -beta)
}

# Faceted bar chart of the top terms, one panel per topic.
# reorder_within() / scale_y_reordered() order the bars by beta inside each
# panel; row order of the input does not affect the chart, so no extra
# sorting is done before plotting.
plot_top_lda_terms <- function(top_terms, title) {
  top_terms |>
    mutate(term = reorder_within(term, beta, topic)) |>
    ggplot(aes(beta, term, fill = as.factor(topic))) +
    geom_col(show.legend = FALSE) +
    scale_y_reordered() +
    labs(title = title, x = expression(beta), y = NULL) +
    facet_wrap(~ topic, ncol = 4, scales = "free")
}

#entire course
summer15_lda <- fit_tidy_lda(summer15_dtm, k = 15)
summer15_top_lda <- top_lda_terms(summer15_lda)
plot_top_lda_terms(summer15_top_lda, "Summer 2015: Top 5 LDA Terms")

#unit 1
unit1_lda <- fit_tidy_lda(unit1_dtm, k = 14)
unit1_top_lda <- top_lda_terms(unit1_lda)
plot_top_lda_terms(unit1_top_lda, "Unit 1:Top 5 LDA Terms")

#unit 2
unit2_lda <- fit_tidy_lda(unit2_dtm, k = 18)
unit2_top_lda <- top_lda_terms(unit2_lda)
plot_top_lda_terms(unit2_top_lda, "Unit 2:Top 5 LDA Terms")

#unit 3
unit3_lda <- fit_tidy_lda(unit3_dtm, k = 19)
unit3_top_lda <- top_lda_terms(unit3_lda)
plot_top_lda_terms(unit3_top_lda, "Unit 3:Top 5 LDA Terms")

#unit 4
unit4_lda <- fit_tidy_lda(unit4_dtm, k = 12)
unit4_top_lda <- top_lda_terms(unit4_lda)
plot_top_lda_terms(unit4_top_lda, "Unit 4:Top 5 LDA Terms")

#unit 5
unit5_lda <- fit_tidy_lda(unit5_dtm, k = 16)
unit5_top_lda <- top_lda_terms(unit5_lda)
plot_top_lda_terms(unit5_top_lda, "Unit 5:Top 5 LDA Terms")

#unit 6
unit6_lda <- fit_tidy_lda(unit6_dtm, k = 18)
unit6_top_lda <- top_lda_terms(unit6_lda)
plot_top_lda_terms(unit6_top_lda, "Unit 6:Top 5 LDA Terms")
Sentiment analysis assigns a numerical value to each token according to a chosen set or spectrum of sentiments (e.g. positive, negative, fear, anger, etc.), then aggregates these scores. This analysis uses the AFINN lexicon, which assigns each word a value from -5 (most negative) to 5 (most positive). The following code is what we used to show how sentiment changed across the six units:
#SENTIMENT
# Attach an AFINN value to every token; the inner join keeps only tokens that
# have an entry in the lexicon.
afinn <- get_sentiments("afinn")
summer15_sentiment <- summer15_tidy |>
  inner_join(afinn, by = "word")

# Net sentiment per unit: the sum of the AFINN values of its tokens.
summer15_sentiment_summary <- summarise(
  group_by(summer15_sentiment, forum_name),
  sentiment = sum(value)
)

# One bar per unit showing its summed sentiment.
ggplot(summer15_sentiment_summary, aes(x = forum_name, y = sentiment)) +
  geom_col() +
  labs(
    x = "Unit Number",
    y = "AFINN Sentiment Value",
    title = "Aggregate Sentiment Value Across All Summer 2015 Units"
  )
Note: All sentiment was net positive for this sample.