Now lets take a look at some ggplot2 barplots
We’ll start with making a dataframe based on the tooth data
df <- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))
df
## dose len
## 1 D0.5 4.2
## 2 D1 10.0
## 3 D2 29.5
And now lets make a second dataframe
df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
dose = rep(c("D0.5", "D1", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
## supp dose len
## 1 VC D0.5 6.8
## 2 VC D1 15.0
## 3 VC D2 33.0
## 4 OJ D0.5 4.2
## 5 OJ D1 10.0
## 6 OJ D2 29.5
Lets load up ggplot2
library(ggplot2)
Lets set our parameters for ggplot
theme_set(
theme_classic() +
theme(legend.position = "top")
)
Lets start with some basic barplots using the tooth data
f <- ggplot(df, aes(x = dose, y = len))
f + geom_col()
Now lets change the fill, and add labels to the top
f + geom_col(fill = "darkblue") +
geom_text(aes(label = len), vjust = -0.3)
f + geom_col(fill = "darkblue") +
geom_text(aes(label = len), vjust = 1.6, color = "white")
Now lets change the barplot colors by group
f + geom_col(aes(color = dose), fill = "white") +
scale_color_manual(values = c("blue", "gold", "red"))
This is kinda hard to see, so lets change the fill
f + geom_col(aes(fill = dose)) +
scale_fill_manual(values = c("blue", "gold", "red"))
Ok how do we do this with multiple groups
ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(color = supp, fill = supp), position = position_stack()) +
scale_color_manual(values = c("blue", "gold")) +
scale_fill_manual(values = c("blue", "gold"))
p <- ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) +
scale_color_manual(values = c("blue", "gold")) +
scale_fill_manual(values = c("blue", "gold"))
p
Now lets add those labels to the dodged barplot
p + geom_text(
aes(label = len, group = supp),
position = position_dodge(0.8),
vjust = -0.3, size = 3.5
)
Now what if we want to add labels to our stacked barplots? For this we need dplyr
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df2 <- df2 %>%
group_by(dose) %>%
arrange(dose, desc(supp)) %>%
mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups: dose [3]
## supp dose len lab_ypos
## <chr> <chr> <dbl> <dbl>
## 1 VC D0.5 6.8 3.4
## 2 OJ D0.5 4.2 8.9
## 3 VC D1 15 7.5
## 4 OJ D1 10 20
## 5 VC D2 33 16.5
## 6 OJ D2 29.5 47.8
Now lets recreate our stacked graphs
ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(fill = supp), width = 0.7) +
geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
scale_color_manual(values = c("blue", "gold")) +
scale_fill_manual(values = c("blue", "gold"))
Lets look at some boxplots
data("ToothGrowth")
Lets change thr dose to a factor, and look at the top of the dataframe
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
head(ToothGrowth, 4)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
Lets load ggplot
library(ggplot2)
Lets set the theme for our plots to classic
theme_set(
theme_bw() +
theme(legend.position = "top")
)
Lets start with a very basic boxplot with dose vs length
tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()
Now lets look at a boxplot with points for the mean
tg + geom_boxplot(notch = TRUE, fill = "lightgrey") +
stat_summary(fun.y = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We can also change the scale number of variables included, and their order
tg + geom_boxplot() +
scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).
Lets put our x axis in descending order
tg + geom_boxplot() +
scale_x_discrete(limits = c("2", "1", "0.5"))
We can also change our boxplot colors by groups
tg + geom_boxplot(aes(color = dose)) +
scale_color_manual(values = c("indianred", "blue4", "green4"))
What if we want to display our data subset by oj vs vitamin C?
tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
scale_fill_manual(values = c("#999999", "#E69F00"))
tg2
We can also arrange this as two plots with facet_wrap
tg2 + facet_wrap(~supp)
This is one of my favorite functions so far ^^^
set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 56), rnorm(200, 58))
)
head(wdata, 4)
## sex weight
## 1 F 54.79293
## 2 F 56.27743
## 3 F 57.08444
## 4 F 53.65430
Now lets load dplyr
library(dplyr)
mu <- wdata %>%
group_by(sex) %>%
summarise(grp.mean = mean(weight))
Now lets load the plotting package
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "bottom")
)
Now lets create a ggplot object
a <- ggplot(wdata, aes(x = weight))
a + geom_histogram(bins = 30, color = "black", fill = "grey") +
geom_vline(aes(xintercept = mean(weight)),
linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
First lets load the required packages
library(ggplot2)
Lets set our theme
theme_set(
theme_dark() +
theme(legend.position = "top")
)
First lets initiate a ggplot object called TG
data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
lets create a dotplot with a summary statistic
tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
Lets add a boxplot and a dotplot together
tg + geom_boxplot(width = 0.5) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
tg + geom_violin(trim = FALSE) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") +
stat_summary(fun = mean, fun.args = list(mult = 1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).
Lets create a dotplot with multiple groups
tg + geom_boxplot(width = 0.5) +
geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') +
scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center',
dotsize = 0.8, position = position_dodge(0.8)) +
scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
Now lets change it up and look at some lineplots
We’ll start by mkaing a custom dataframe like the tooth dataset. This way we can see the lines and stuff that we’re modifying
df <- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))
Now lets create a second dataframe for plotting by groups
df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
dose = rep(c("D0.5", "D1,", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
## supp dose len
## 1 VC D0.5 6.8
## 2 VC D1, 15.0
## 3 VC D2 33.0
## 4 OJ D0.5 4.2
## 5 OJ D1, 10.0
## 6 OJ D2 29.5
Now lets again load ggplot2 and set a theme
library(ggplot2)
theme_set(
theme_gray() +
theme(legend.position = "right")
)
Now lets do some basic line plots. First we will build a function to display all the different line types.
generateRLineTypes <- function(){
oldPar <- par()
par(font = 2, mar = c(0,0,0,0))
plot(1, pch = "", ylim = c(0.,6), xlim = c(0,0.7), axes = FALSE, xlab = "", ylab = "")
for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty = i, lwd = 3)
text(rep(0.1,6), 0:6, labels = c("0.'Blank'", "1.'solid'", "2.'dashed'", "3.'dotted'",
"4.'dotdash'", "5.'longdash'", "6,'twodash'"))
par(mar=oldPar$mar, font = oldPar$font)
}
generateRLineTypes()
Now lets build a basic line plot
p <- ggplot(data = df, aes(x = dose, y = len, group = 1))
p + geom_line() + geom_point()
Now lets modify the line type and color
p + geom_line(linetype = "dashed", color = "steelblue") +
geom_point(color = "steelblue")
Now lets try a step graph, which indicates a threshold type progression
p + geom_step() + geom_point()
Now lets move on to mkaing multiple groups. First we’ll make our ggplot object
p <- ggplot(df2, aes(x = dose, y = len, group = supp))
Now lets change line types and point shapes by group
p + geom_line(aes(linetype = supp, color = supp)) +
geom_point(aes(shape = supp, color = supp)) +
scale_color_manual(values = c("red", "blue"))
Now lets look at line plots with a numeric x axis
df3 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
dose = rep(c("0.5", "1", "2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df3
## supp dose len
## 1 VC 0.5 6.8
## 2 VC 1 15.0
## 3 VC 2 33.0
## 4 OJ 0.5 4.2
## 5 OJ 1 10.0
## 6 OJ 2 29.5
Now lets plot where both axises are treated as continuous labels
df3$dose <- as.numeric(as.vector (df3$dose))
ggplot(data = df3, aes(x = dose, y = len, group = supp, color = supp)) +
geom_line() + geom_point()
Now lets look at a line graph with having the x axis as dates. We’ll use the built in economics time series for this example.
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
ggplot(data = economics, aes(x = date, y = pop)) +
geom_line()
Now lets subset the data
ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()
We can also change the line size, for instance by another varable like unemployment rate
ggplot(data = economics, aes(x = date, y = pop)) +
geom_line(aes(size = unemploy/pop))
We can also plot multiple time series data
ggplot(economics, aes(x = date)) +
geom_line(aes(y = psavert), color = "darkred") +
geom_line(aes(y = uempmed), color = "steelblue", linetype = "twodash")
Lastly, lets make this into an area plot
ggplot(economics, aes(x=date)) +
geom_area(aes(y = psavert), fill = "#999999",
color = "#999999", alpha = 0.5) +
geom_area(aes(y = uempmed), fill = "#E69F00",
color = "#E69F00", alpha = 0.5)
library(ggplot2)
library(ggridges)
Now lets load some sample data
?airquality
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()
air
## Picking joint bandwidth of 2.65
Now lets add some pazzaz to our graph
library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
geom_density_ridges_gradient() +
scale_fill_viridis(option = "C", name = "Temp")
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65
Last thing we will do is create a facet plot for all of our data.
library(tidyr)
airquality %>%
gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
ggplot() + aes(value, Month, group = Month) +
geom_density_ridges() +
facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).
A density plot is a nice alternative to a histogram
set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
group_by(sex) %>%
summarise(grp.mean = mean(weight))
Now lets load the graphing package
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "right")
)
Now lets do the basic plot function. First we will create a ggplot object
d <- ggplot(wdata, aes(x = weight))
Now lets do a basic density plot
d + geom_density() +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
Now lets change the y axis to count instead of density
a + geom_density(aes(y = stat(count)), fill = "lightgrey") +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
## Warning: `stat(count)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
d + geom_density(aes(color = sex)) +
scale_color_manual(values = c("darkgrey", "gold"))
Lastly, lets fill the density plots
d + geom_density(aes(fill = sex), alpha = 0.4) +
geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
scale_color_manual(values = c("grey", "gold")) +
scale_fill_manual(values = c("grey", "gold"))
First lets load our required package
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Lets start with a scatterplot of the Orange dataset
Orange <- as.data.frame(Orange)
plot_ly(data = Orange, x = ~age, y = ~circumference)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
Now lets add some more info
plot_ly(data = Orange, x = ~age, y = ~circumference,
color = ~Tree, size = ~age,
text = ~paste("Tree ID:", Tree, "<br>Age", age, "circ:", circumference)
)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
Now lets create a random distribution and add it to our dataset
trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)
We’ll use the random numbers as lines on the graph
plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
add_trace(y = ~trace_1, mode = 'lines') %>%
add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
Now lets create a graph with the option of showing as a scatter or a line and add labels.
plot_ly(data = Orange, x = ~age, y = ~circumference,
color = ~Tree, size = ~circumference,
text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
add_trace(y = ~circumference, mode = 'markers') %>%
layout(
title = "Plot or Orange data with switchable trace",
updatemenus = list(
list(
type = "downdrop",
y = 0.8,
buttons = list(
list(method = "restyle",
args = list("mode", "markers"),
label = "Marker"
),
list(method = "restyle",
args = list("mode", "lines"),
labels = "Lines"
)
)
)
)
)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
## Warning: `line.width` does not currently support multiple values.
First lets load pur required packages
library(plotly)
Now lets create a random 3D matrix
d <- data.frame(
x <- seq(1,10, by = 0.5),
y <- seq(1,10, by = 0.5)
)
z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))
Now lets plot our 3D data
plot_ly(d, x = ~x, y = ~y, z = ~z) %>%
add_surface()
Lets add s=more aspects to it, such as topography
plot_ly(d, x = ~x, y = ~y, z = ~z) %>%
add_surface(
contours = list(
z = list(
show = TRUE,
usecolormap = TRUE,
highlightcolor = "FF0000",
project = list(z = TRUE)
)
)
)
Now lets look at a 3D scatter plot
plot_ly(longley, x = ~GNP, y = ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
add_markers()
First, lets load our required libraries
library(ggplot2)
library(dplyr)
library(plotrix)
theme_set(
theme_classic() +
theme(legend.position = 'top')
)
Lets again use the tooth data for this exercise
df <- ToothGrowth
df$dose <- as.factor(df$dose)
Now lets use dplyr for manipulation purposes
df.summary <- df %>%
group_by(dose) %>%
summarise(
sd = sd(len, na.rm = TRUE),
len = mean(len),
stderr = std.error(len, na.rm = TRUE)
)
df.summary
## # A tibble: 3 × 4
## dose sd len stderr
## <fct> <dbl> <dbl> <dbl>
## 1 0.5 4.50 10.6 NA
## 2 1 4.42 19.7 NA
## 3 2 3.77 26.1 NA
Lets now look at some key functions
lets start by creating a ggplot object
tg <- ggplot(
df.summary,
aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
)
Now lets look at the most basic error bars
tg + geom_pointrange()
tg + geom_errorbar(width = 0.2) +
geom_point(size = 1.5)
Now lets create horizontal error bars by manipulating our graph
ggplot(df.summary, aes(x = len, y = dose, xmin = len-sd, xmax = len+sd)) +
geom_point() +
geom_errorbar(height = 0.2)
## Warning in geom_errorbar(height = 0.2): Ignoring unknown parameters: `height`
This just gives you an idea of error bars on the horizontal axis
Now lets look at adding jitter points (actual measurements) to our data.
ggplot(df, aes(dose, len)) +
geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)
Now lets try error bars on a violin plot
ggplot(df, aes(dose, len)) +
geom_violin(color = "darkgrey", trim = FALSE) +
geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)
Now how about w/ a line graph?
ggplot(df.summary, aes(dose, len)) +
geom_line(aes(group = 1)) + # always specidy this when you have 1 line
geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr), width = 0.2) +
geom_point(size = 2)
#Where are my error bars???
Now lets make a bar graph with half error bars
ggplot(df.summary, aes(dose, len)) +
geom_col(fill = "lightgrey", color = "black") +
geom_errorbar(aes(ymin = len, ymax = len+stderr), width = 0.2)
You can see that by not specifying wmin = len-stderr, we have in essence cut our error bar in half.
How about we add jitter points to line plots? We need to use the original df for the jitter plot, and the summary df for the geom layers.
ggplot(df, aes(dose, len)) +
geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
geom_line(aes(group = 1), data = df.summary) +
geom_errorbar(
aes(ymin = len - stderr, ymax = len + stderr),
data = df.summary, width = 0.2) +
geom_point(data = df.summary, size = 0.2)
What about adding jitterpoints to a barplot?
ggplot(df, aes(dose, len)) +
geom_col(data = df.summary, fill = NA, color = "black") +
geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr),
data = df.summary, width = 0.2)
What if we wanted to have our error bars per group? (OJ vs VC)
df.summary2 <- df %>%
group_by(dose, supp) %>%
summarise(
sd = sd(len),
stderr = std.error(len),
len = mean(len)
)
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups: dose [3]
## dose supp sd stderr len
## <fct> <fct> <dbl> <dbl> <dbl>
## 1 0.5 OJ 4.46 1.41 13.2
## 2 0.5 VC 2.75 0.869 7.98
## 3 1 OJ 3.91 1.24 22.7
## 4 1 VC 2.52 0.795 16.8
## 5 2 OJ 2.66 0.840 26.1
## 6 2 VC 4.80 1.52 26.1
Now you can see we have mean and error for each dose and supp
ggplot(df.summary2, aes(dose, len)) +
geom_pointrange(
aes(ymin = len - stderr, ymax = len + stderr, color = supp),
position = position_dodge(0.3)) +
scale_color_manual(values = c("indianred", "lightblue"))
How about line plots with multiple error bars
ggplot(df.summary2, aes(dose, len)) +
geom_line(aes(linetype = supp, group = supp)) +
geom_point() +
geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), width = 0.2)
And the same with a bar plot
ggplot(df.summary2, aes(dose, len)) +
geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
geom_errorbar(
aes(ymin = len - stderr, ymax = len + stderr, group = supp),
width = 0.2, position = position_dodge(0.8)) +
scale_fill_manual(values = c("indianred", "lightblue"))
Now lets add some jitterpoints
ggplot(df, aes(dose, len, color = supp)) +
geom_jitter(position = position_dodge(0.2)) +
geom_line(aes(group = supp), data = df.summary2) +
geom_point() +
geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), data = df.summary2, width = 0.2)
“Last one” he says
ggplot(df, aes(dose, len, color = supp)) +
geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white") +
geom_jitter(
position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
geom_errorbar(
aes(ymin = len - stderr, ymax = len + stderr), data = df.summary2,
width = 0.2, position = position_dodge(0.8)) +
scale_color_manual(values = c("indianred", "lightblue")) +
theme(legend.position = "top")
Now lets do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below the threshold.
set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 55), rnorm(200, 58)))
Now lets look at our dataframe
head(wdata, 5)
## sex weight
## 1 F 53.79293
## 2 F 55.27743
## 3 F 56.08444
## 4 F 52.65430
## 5 F 55.42912
Now lets load our plotting package
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "bottom")
)
Now lets create our ECDF plot
ggplot(wdata, aes(x = weight)) +
stat_ecdf(aes(color = sex, linetype = sex),
geom = "step", size = 1.5) +
scale_color_manual(values = c("#00AFBB", "#E7B900")) +
labs(y = "weight")
Now lets look at qqplots. These are used to determine if the given data follows a normal distribution.
#install.packages("ggpubr")
set.seed(1234)
Now lets randomly generate some data
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 55), rnorm(200, 58))
)
Lets set our theme for the graphing with ggplot
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "top")
)
create a qq plot of the weight
ggplot(wdata, aes(sample = weight)) +
stat_qq(aes(color = sex)) +
scale_color_manual(values = c("indianred", "lightblue")) +
labs(y = "weight")
#Inidan red and light blue are better than mustard yellow :)
library(ggpubr)
ggqqplot(wdata, x = "weight",
color = "sex",
palettes = c("indianred", "lightblue"),
ggtheme = theme_pubclean())
Now what would a norm distribution look like?
#install.packages(mnonr)
library(mnonr)
data2 <- mnonr::mnonr(n = 1000, p = 2, ms = 3, mk = 61, Sigma = matrix(c(1,0.5,0.5,1), 2,2), initial = NULL)
data2 <- as.data.frame(data2)
Now lets plot the non normal data
ggplot(data2, aes(sample = V1)) +
stat_qq()
Now, normal plotted data
ggqqplot(data2, x = "V1",
palette = "indianred",
ggtheme = theme_pubclean())
Lets look at how to put multiple plots together into a single figure
library(ggpubr)
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "top")
)
First lets create a nice boxplot, lets load the data
df <- ToothGrowth
df$dose <- as.factor(df$dose)
and create the plot object
p <- ggplot(df, aes(x = dose, y = len)) +
geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
scale_fill_manual(values = c("indianred", "lightblue"))
p
Now lets look at the gvgplot facet function
p + facet_grid(rows = vars(supp))
Now lets do a facet with mult. variables
p + facet_grid(rows = vars(dose), cols = vars(supp))
p
Now lets look at the facet_wrap function. This allows facets to be placed side by side.
p + facet_wrap(vars(dose), ncol = 3)
Now how do we combine multiple plots using ggarange()
lets start by making some basic plots. First we will define a color palette and data
my3cols <- c("indianred", "lightblue", "green")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
Now lets make some basic plots
p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
scale_color_manual(values = my3cols)
Now lets do a dotplot
dp <- p + geom_dotplot(aes(color = dose, fill = dose),
binaxis = 'y', stackdir = 'center') +
scale_color_manual(values = my3cols) +
scale_fill_manual(values = my3cols)
Now lastly lets create a lineplot
lp <- ggplot(economics, aes(x = date, y = psavert)) +
geom_line(color = "indianred")
Now we can make the figure
figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure
This looks great, but we can make it look even better
figure2 <- ggarrange(
lp,
ggarrange(bxp, dp, ncol = 2, labels = c("B", "C")),
nrow = 2,
labels = "A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure2
ok this looks really good, but you’ll notice that there are 2 legends that are the same
ggarrange(
bxp, dp, labels = c("A", "B"),
common.legend = TRUE, legend = "bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
Lastly, we should export the plot
ggexport(figure2, filename = "facetfigure.pdf")
## file saved to facetfigure.pdf
We can also export mult. figures to a pdf
ggexport(box, lp, dp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf
Lastly, we can export to pdf w/ mult. pages and mult. columns
ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol = 1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf
Lets get started w/ heatmaps
library(heatmap3)
Now lets get our data
data <- ldeaths
data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))
Now lets generate a heatmap
heatmap(data2)
heatmap(data2, Rowv = NA, Colv = NA)
Now lets play with colors
rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)
Now lets apply our color selections
heatmap(data2, ColSideColors = cc)
library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
col = colorRampPalette(brewer.pal(8, "PiYG")) (25))
Theres more that we can customize
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
##
## plotCI
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2(data2, ColSideColors = cc,
col = colorRampPalette(brewer.pal(8, "PiYG"))(25))
Missing Values
If you encounter an unusual value in your dataset, and simply want to move on to the rest of your analysis, you have 2 options:
Drop the entire row with the strange values:
library(dplyr)
library(ggplot2)
diamonds <- diamonds
diamonds2 <- diamonds %>%
filter(between(y,3,20))
In this instance, y is the width of the diamond, so anything 3 mm or above 20 mm is exlcuded.
I don’t recommend this option; just because there is one bad measurement doesn’t mean they are all bad.
Instead, I recommend replacing the unusual values with missing values
diamonds3 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
Like R, ggplot2 subscribes to the idea that missing values should not pass silently into the night.
ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
geom_point()
## Warning: Removed 9 rows containing missing values (`geom_point()`).
If you want to suppress that warning you can use na.rm = TRUE
ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
geom_point(na.rm = TRUE)
Other times you want to understand what makes observations with missing values different to the observation with recorded values. For example, in the NYCflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So, you might want to compare the scheduled departure times for canelled and non-cancelled times.
library(nycflights13)
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(mapping = aes (sched_dep_time)) +
geom_freqpoly(mapping = aes(color = cancelled), bindwidth = 1/4)
## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwidth = 1/4):
## Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
What if we want to know what our outliers are?
First we need to load the required libraries.
library(outliers)
library(ggplot2)
library(readxl)
And reload the dataset because we removed outliers
Air_data <- read_xlsx("AirQualityUCI.xlsx")
Lets create a function using the grubb test to identify all outliers. The grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution.
grubbs.flag <- function(x) {
#lets create a variable called outliers and save nothing in it, and add to the variable as we identify them
outliers <- NULL
# we'll create a variable called test to identify which univariate we are testing
test <- x
# now using the outliers package, use grubbs.test to find outliers in our variable
grubbs.result <- grubbs.test(test)
# lets get the p values of all tested variables
pv <- grubbs.result$p.value
# now lets search through our p values for ones that are outside of 0.5
while(pv < 0.05) {
# anything with a p value less than 0.5, we add to our empty outliers vector
outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3]))
# now we want to remove those outliers from our test variable
test <- x[!x %in% outliers]
# and run the grubbs test again without the outliers
grubbs.result <- grubbs.test(test)
# and save the new p values
pv <- grubbs.result$p.value
}
return(data.frame(x=x, outliers = (x %in% outliers)))
}
#identified_outliers <- grubbs.flag(Air_data$set)
Now we can create a histogram where the outliers were
ggplot(grubbs.flag(Air_data$AH), aes(x = Air_data$AH, color = outliers, fill = outliers)) +
geom_histogram(bindwidth = diff(range(Air_data$AH))) +
theme_bw()
## Warning in geom_histogram(bindwidth = diff(range(Air_data$AH))): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
library(ggplot2)
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
Its hard to see the difference in distribution because the counts differ so much.
ggplot(diamonds)+
geom_bar(mapping = aes(x = cut))
To make comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one.
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe thats because the frequency of polygons are a little hard to interpret.
Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()
We see much less info about the distribution, but the boxplots are much more compact, so we can more easily compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average!
Lets look at some car data
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot()
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
If you have long variable names, you can switch the axis and flip it 90 degrees.
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))+
coord_flip()
To visualize the correltion between two continuous variables, we can use a scatter plot.
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))
Scatterplots become less useful as the size of your dataset grows, because we get overplot. We can fix this by using the alpha aesthetic
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)
First lets load a required library
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(dplyr)
Now lets get our data
site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
College_Data <- read.csv(site)
First lets use the str function, this shows the structure of the object
str(College_Data)
## 'data.frame': 1948 obs. of 9 variables:
## $ date : chr "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Madison" "Montgomery" "Limestone" "Lee" ...
## $ city : chr "Huntsville" "Montgomery" "Athens" "Auburn" ...
## $ ipeds_id : chr "100654" "100724" "100812" "100858" ...
## $ college : chr "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
## $ cases : int 41 2 45 2742 220 4 263 137 49 76 ...
## $ cases_2021: int NA NA 10 567 80 NA 49 53 10 35 ...
## $ notes : chr "" "" "" "" ...
What if we want to arrange our dataset alphabetically by college?
alphabetical <- (College_Data) %>%
arrange(College_Data$college)
The glimpse package is another way to preview data
glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
We can also subset with select()
College_Cases <- select(College_Data, college, cases)
We can also filter or subset with the filter function
Louisiana_Cases <- filter(College_Data, state == "Louisiana")
Lets filter out a smaller amount of states
South_Cases <- filter(College_Data, state == "Louisiana" | state == "Texas" | state == "Arkansas" | state == "Mississippi")
Lets look at some time series data
First we’ll load required libararies
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
##
## rescale
## The following object is masked from 'package:viridis':
##
## viridis_pal
Now lets load some data
state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
State_Data <- read.csv(state_site)
Lets create group_by object using the state column
state_cases <- group_by(State_Data, state)
class(state_cases)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
How many measurements were made by state? This gives us an idea of when states starting reporting.
Days_since_first_reported <- tally(state_cases)
Lets visualize some data
First lets start with some definitions
Data - obvious - the stuff we want to visualize
Layer - made of geometric elements and requisite statistical info. Include geometric objects which represents the plot.
Scales = used to map values in the data space that is used for creation of values (color, size, shape, ect.)
Coordinate system - describes how the data coordinates are mapped together in relation to the plan on the graphing
Faceting - how to break up data in to subsets to display multiple types or groups of data
theme - controls the finer points of the display, such as font size and background color
options(repr.plot.width = 6, repr.plot.height = 6)
class(College_Data)
## [1] "data.frame"
head(College_Data)
## date state county city ipeds_id
## 1 2021-05-26 Alabama Madison Huntsville 100654
## 2 2021-05-26 Alabama Montgomery Montgomery 100724
## 3 2021-05-26 Alabama Limestone Athens 100812
## 4 2021-05-26 Alabama Lee Auburn 100858
## 5 2021-05-26 Alabama Montgomery Montgomery 100830
## 6 2021-05-26 Alabama Walker Jasper 102429
## college cases cases_2021 notes
## 1 Alabama A&M University 41 NA
## 2 Alabama State University 2 NA
## 3 Athens State University 45 10
## 4 Auburn University 2742 567
## 5 Auburn University at Montgomery 220 80
## 6 Bevill State Community College 4 NA
summary(College_Data)
## date state county city
## Length:1948 Length:1948 Length:1948 Length:1948
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ipeds_id college cases cases_2021
## Length:1948 Length:1948 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 32.0 1st Qu.: 23.0
## Mode :character Mode :character Median : 114.5 Median : 65.0
## Mean : 363.5 Mean : 168.1
## 3rd Qu.: 303.0 3rd Qu.: 159.0
## Max. :9914.0 Max. :3158.0
## NA's :337
## notes
## Length:1948
## Class :character
## Mode :character
##
##
##
##
Now lets take a look at a different dataset
iris <- as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Lets start by creating a scatter plot of the college data
ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
geom_point() +
theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).
Now lets do the iris data
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
geom_point() +
theme_minimal()
Lets color coordinate our college data
ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
geom_point() +
theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).
Lets color coordinate iris data
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
geom_point() +
theme_minimal()
Lets run a simple histogram of our Louisiana Case Data
hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", main = "Total College Covid-19 Infections (Louisiana)")
Lets run a simple histogram for the iris data
hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", ylab = "Frequency", main = "Iris Sepal Width")
histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))
histogram_college + geom_histogram(bindwidth = 100, color = "black", aes(fill = county)) +
xlab("cases") + ylab("Frequency") + ggtitle("Histogram of Covid-19 Cases in Louisiana")
## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Lets create a ggplot for the iris data
histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))
histogram_iris + geom_histogram(bindiwth = 0.2, color = "black", aes(fill = Species)) +
xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram of Iris Sepal Width by Species")
## Warning in geom_histogram(bindiwth = 0.2, color = "black", aes(fill =
## Species)): Ignoring unknown parameters: `bindiwth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Maybe a density plot makes more sense for our college data
ggplot(South_Cases) +
geom_density(aes(x = cases, fill = state), alpha = 0.25)
Lets do it with the iris data
ggplot(iris) +
geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)
Lets look at violin plots for iris
ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
geom_violin() +
theme_classic() +
theme(legend.position = "none")
Now lets try the south data
ggplot(data = South_Cases, aes(x = state, y = cases, color = state)) +
geom_violin() +
theme_gray() +
theme(legend.position = "none")
Now lets take a look at risidual plots. This is a graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner aroind the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed, a non linear model is more appropriate.
Lets start with the iris data
ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
geom_point(aes(x = .fitted, y = .resid))
Now lets look at the souther state cases
ggplot(lm(cases ~ cases_2021, data = South_Cases)) +
geom_point(aes(x = .fitted, y = .resid))
A linear model is not a good call for the state cases
Now lets do some correlations
obesity <- read.csv("Obesity_insurance.csv")
library(tidyr)
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
##
## mutate
## The following objects are masked from 'package:plotly':
##
## arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
Lets look at the structure of the dataset
str(obesity)
## 'data.frame': 1338 obs. of 7 variables:
## $ age : int 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr "female" "male" "male" "male" ...
## $ bmi : num 27.9 33.8 33 22.7 28.9 ...
## $ children: int 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr "yes" "no" "no" "no" ...
## $ region : chr "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num 16885 1726 4449 21984 3867 ...
Lets look at the column classes
class(obesity)
## [1] "data.frame"
And get a summary of distribution of variables
summary(obesity)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
Now lets look at the distribution for insurance charges
hist(obesity$charges)
We can also get an idea of the distribution using a boxplot
boxplot(obesity$charges)
boxplot(obesity$bmi)
Now lets look at correlations. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or 2 data frames. The cov() command on the other hand examines the covariance. The cor.test() command carries out a test as to the significance of the correlation
cor(obesity$charges, obesity$bmi)
## [1] 0.198341
This test uses a spearman Rho correltation, or you can use Kendall’s tau by specifying it
cor(obesity$charges, obesity$bmi, method = "kendall")
## [1] 0.08252397
This correlation measures strength of a correlation between -1 and 1.
Noe lets look at the Tietjen = Moore test. This is used for univariate datasets. The algorithm depicts the detection of the outliers in a univariate dataset.
TietjenMoore <- function(dataSeries, k)
{
n = length(dataSeries)
# compute the absolute residuals
r = abs(dataSeries - mean(dataSeries))
# sort data according to size of residual
df = data.frame(dataSeries, r)
dfs = df[order(df$r),]
#create a subset of the data without the largest values
klarge = c((n-k+1):n)
subdataSeries = dfs$dataSeries[-klarge]
#Compuet the sums of squares.
ksub = (subdataSeries = mean(subdataSeries))**2
all = (df$dataSeries - mean(df$dataSeries))**2
# compute the test statistic.
sum(ksub)/sum(all)
}
This function helps to compute the absolute residuals and sorts the data according to the size of the residuals. Later, we will focus on the computation of sum of squares.
FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
ek <- TietjenMoore(dataSeries, k)
# Compute critical values based on simulation.
test = c(1:10000)
for (i in 1:10000){
dataSeriesdataSeries = rnorm(length(dataSeries))
test[i] = TietjenMoore(dataSeriesdataSeries, k)}
Talpha = quantile(test, alpha)
list(T = ek, Talpha = Talpha)
}
This function helps us to compute critical values based on simulation data. Now lets demonstrate these functions with simple data and obesity dataset for evaluating this algorithm.
The critical region for the Teirtjen-Moore is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and comuting the Tietjen-Moore test statistic. Typically, 10,000 random samples are used. The values of the Tietjen_Moore statistic obtained from the data is compared to this reference distribution. The values of the test stat is between zero and one. If there are no outliers in the data, the test stat is close to 1. Ig there are outliers, the test stat will be closer to 0. Thus, the test is always a lower, one-tailed test regardless of which test stat is used, Lk or Ek.
First we will look at charges
boxplot(obesity$charges)
FindOutliersTietjenMooreTest(obesity$charges, 100)
## $T
## [1] 0.0005906641
##
## $Talpha
## 50%
## 3.166558e-07
Let check out bmi
boxplot(obesity$bmi)
FindOutliersTietjenMooreTest(obesity$bmi, 7)
## $T
## [1] 0.01878681
##
## $Talpha
## 50%
## 2.634795e-07
Probability Plots
library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:plyr':
##
## count
## The following object is masked from 'package:scales':
##
## rescale
## The following object is masked from 'package:plotrix':
##
## rescale
## The following object is masked from 'package:plotly':
##
## do
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
We will use the probability plot function and their output dnorm: density function of the normal distribution. Using the density, it is possible to determine the probability of events. Or for example, you may wonder “what is the likelihood that a person has a BMI of exactly ___? In this case, you would need to retrieve the density of the BMI distribution at values 140. The BMI distribution can be modeled with a mean of 100 and a standard deviation of 15. The corresponding density is:
bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)
Lets create a plot of our normal distribution
bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)
ggplot(bmi.df, aes(x = bmi, y = Density)) +
geom_point()
This gives us the probability of every single point occurring
Now lets use the pnorm function for more info
bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)
ggplot(bmi.df, aes(x = bmi, y = Density)) +
geom_point()
What if we want to find the probability of the bmi being greater than 40 in our distribution?
pp_greater <- function(x) {
paste(round(100 * pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}
pp_greater(40)
## [1] "6.29 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.06287869
What about the probability that a bmi is less than 40 in our population?
pp_less <- function(x) {
paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)),2), "%")
}
pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.9371213
What if we want to find the area in between?
pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.8969428
What if we want to know the quantiles? Lets use the pnorm function. We need to assume a normal distribution for this.
What bmi represents the lowest 1% of the population?
qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769
What if you want a random sampling of values within your distribution?
subset <- rnorm(50, mean = 30.66339, sd = 6.09818)
hist(subset)
subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)
hist(subset2)
Shapiro-Wilk Test
so now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution?
shapiro.test(obesity$charges[1:5])
##
## Shapiro-Wilk normality test
##
## data: obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695
You can see here, with a small sample size, we would reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size.
shapiro.test(obesity$charges[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16
Now lets check out age
shapiro.test(obesity$age[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16
And lastly bmi
shapiro.test(obesity$bmi[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426
Time series data
First lets load our packages
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
library(readxl)
#Air_data <- read_xlsx("AirQualityUCI.xlsx")
Date - date of measurement time - time of measurement CO(GT) - average hourly CO2 PT08,s1(CO) - tin oxide hourly average sensor response NMHC - average hourly non-metallic hydrocarbon concentration C6HC - average benzene concentration PT08.S3 (NMHC)- titania average hourly sensor response NOX - average hourly NOX conc. NO2 - average hourly NO2 conc. T - Temper RH - relative humidity AH - Absolute humidity
#str(Air_data)
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
library(ggplot2)
Lets get rid of the date in the time column
#Air_data$Time <- as_hms(Air_data$time)
#glimpse(Air_data)
#plot(Air_data$Ah, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")
Notice we have an outlier in our data
#t.test(Air_data$RH, Air_data$AH)
First we’ll look at the unnest_token function
Lets start by looking at an EMily Dickenson passage
text <- c("Because I could not stop from Death - ",
"He kindly stopped for me - ",
"The Carriage held but just Ourselves -",
"and Immortality")
text
## [1] "Because I could not stop from Death - "
## [2] "He kindly stopped for me - "
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
This is a typical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.
library(dplyr)
text_df <- tibble(line = 1:4, text = text)
text_df
## # A tibble: 4 × 2
## line text
## <int> <chr>
## 1 1 "Because I could not stop from Death - "
## 2 2 "He kindly stopped for me - "
## 3 3 "The Carriage held but just Ourselves -"
## 4 4 "and Immortality"
Reminder: A tibble is a modern class of data frame within R. Its available in the dplyr and tibble packages, that has a convienent print method, will not convert strongs to factors, and does not use row names. Tibbles are great for use with tidy tools.
Next we will use the ‘unnest_tokens’ functions.
First we have the output column name that will be crerated as the text is unnested into it
library(tidytext)
text_df %>%
unnest_tokens(word, text)
## # A tibble: 20 × 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 from
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
Lets use the janeaustenr package to analysze some Jane Austen texts. There are 6 books in this package.
library(janeaustenr)
library(dplyr)
library(stringr)
original_books <- austen_books() %>%
group_by(book) %>%
dplyr::mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # ℹ 73,412 more rows
To work with this as a tiday dataset, we need to restructure it n the on-token-per-row format, which as we saw earlier is done with the unnest_tokens() function
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(word,text)
tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # ℹ 725,045 more rows
This function uses the tokenziers package to separate each line of text in the original dataframe into tokens.
The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.
Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.
Often in text analysis, we will want to remove stop words. Stop words are words that are NOT useful for an analysis. These include words like the, of, to, and, and so forth.
We can remove stop words (kept in the tidytext dataset ’stop_words) with an anti_join().
data(stop_words)
tidy_books <- tidy_books
The stop words dataset in the tidytext package contains stop words from three lexicons. We can use them all together, as we have three, or filter() to only use one set of stop words if thats more appropriate for your analysis.
tidy_books %>%
count(word, sort = TRUE)
## # A tibble: 14,520 × 2
## word n
## <chr> <int>
## 1 the 26351
## 2 to 24044
## 3 and 22515
## 4 of 21178
## 5 a 13408
## 6 her 13055
## 7 i 12006
## 8 in 11217
## 9 was 11204
## 10 it 10234
## # ℹ 14,510 more rows
Beacuse we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most common words.
library(ggplot2)
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL, x = "word count")
The gutenbergr package
This package provides access to the public domain works from the gutenberg project (www.gutenber.org). This package includes tools for both downloading books and a complete dataset of project gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenberg_download().
Word frequencies
Lets look at some biology texts, starting with Darwin
The voyage of the Beagle - 944 On the origin of species by the means of natural selection - 1228 The expression of emotions in man and animals - 1227 The decent of man, and selection in relation to sex - 2300
We can access these works susing the gutenberg_download() and the Project Gutenberg IDnumbers
library(gutenbergr)
darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
## Warning: ! Could not download a book at
## http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
Lets break these into tokens
tidy_darwin <- darwin %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
Lets check out what the most common darwin words are.
tidy_darwin %>%
count(word, sort = TRUE)
## # A tibble: 22,315 × 2
## word n
## <chr> <int>
## 1 male 1620
## 2 species 1452
## 3 males 1286
## 4 female 1173
## 5 birds 1124
## 6 sexes 1062
## 7 animals 1013
## 8 females 1003
## 9 sexual 749
## 10 vol 712
## # ℹ 22,305 more rows
Now lets get some work from THomas Hunt Morgan, who is credited with discovering chromosomes.
Regeneration - 57198 The genetic and operative evidence relating to secondary sexual characteristics - 57460 Evolution and Adaptation - 63540
morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
Lets tokenize THM
tidy_morgan <- morgan %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
What are THM’s most common words?
tidy_morgan %>%
count(word, sort = TRUE)
## # A tibble: 13,855 × 2
## word n
## <chr> <int>
## 1 species 869
## 2 regeneration 814
## 3 piece 702
## 4 cut 669
## 5 male 668
## 6 forms 631
## 7 selection 604
## 8 cells 576
## 9 found 552
## 10 development 546
## # ℹ 13,845 more rows
Lastly, lets look at Thomas Henry Huxley
Evidence as to mans place in nature - 2931 On the reception of the Origin of Species - 2089 Evolution and Ethics, and other essays - 2940 Science and Culture, and other essays - 52344
huxley <- gutenberg_download(c(2931, 2940, 52344), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
## Warning: ! Could not download a book at
## http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg//2/9/4/2940/2940.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
tidy_huxley <- huxley %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining with `by = join_by(word)`
tidy_huxley %>%
count(word, sort = TRUE)
## # A tibble: 11,251 × 2
## word n
## <chr> <int>
## 1 knowledge 184
## 2 de 170
## 3 animals 155
## 4 animal 151
## 5 science 150
## 6 life 149
## 7 time 135
## 8 body 126
## 9 nature 111
## 10 physical 105
## # ℹ 11,241 more rows
Now lets calculate the frequency for each word for the works of Darwin, Morgan, and Huxley by binding the frames together.
library(tidyr)
frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
mutate(tidy_darwin, author = "Charles Darwin"),
mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n/ sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = author, values_from = proportion) %>%
pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")
frequency
## # A tibble: 87,084 × 3
## word author proportion
## <chr> <chr> <dbl>
## 1 a Thomas Hunt Morgan 0.000638
## 2 a Thomas Henry Huxley 0.0000163
## 3 a Charles Darwin 0.0000489
## 4 ab Thomas Hunt Morgan 0.0000512
## 5 ab Thomas Henry Huxley 0.0000186
## 6 ab Charles Darwin 0.00000466
## 7 abaiss Thomas Hunt Morgan NA
## 8 abaiss Thomas Henry Huxley NA
## 9 abaiss Charles Darwin 0.00000466
## 10 abandon Thomas Hunt Morgan 0.00000233
## # ℹ 87,074 more rows
Now we need to change the table so that each author has its own row
frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)
frequency2
## # A tibble: 29,028 × 4
## word `Thomas Hunt Morgan` `Thomas Henry Huxley` `Charles Darwin`
## <chr> <dbl> <dbl> <dbl>
## 1 a 0.000638 0.0000163 0.0000489
## 2 ab 0.0000512 0.0000186 0.00000466
## 3 abaiss NA NA 0.00000466
## 4 abandon 0.00000233 NA 0.00000233
## 5 abandoned 0.00000466 0.00000233 0.00000233
## 6 abashed NA NA 0.00000233
## 7 abatement NA 0.00000233 0.00000233
## 8 abbot NA 0.00000466 0.00000233
## 9 abbott NA NA 0.00000466
## 10 abbreviated NA NA 0.00000931
## # ℹ 29,018 more rows
Now lets plot
library(tidyr)
frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
mutate(tidy_darwin, author = "Charles Darwin"),
mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
mutate(word = str_extract(word, "[a-z]+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n/ sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = author, values_from = proportion) %>%
pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportion")
frequency
## # A tibble: 86,541 × 3
## word author proportion
## <chr> <chr> <dbl>
## 1 a Thomas Hunt Morgan 0.000638
## 2 a Thomas Henry Huxley 0.0000163
## 3 a Charles Darwin 0.0000489
## 4 ab Thomas Hunt Morgan 0.0000512
## 5 ab Thomas Henry Huxley 0.0000186
## 6 ab Charles Darwin 0.00000466
## 7 abaiss Thomas Hunt Morgan NA
## 8 abaiss Thomas Henry Huxley NA
## 9 abaiss Charles Darwin 0.00000466
## 10 abandon Thomas Hunt Morgan 0.00000233
## # ℹ 86,531 more rows
The sentiments datasets
There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.
AFFIN bing nrc
bing categorizes words in a binary fashion into positive or negative nrc categorizes into positive, negative, anger, anticpation, disgust, fear, joy, sadness, surprise and trust. AFFIN assigns a score between -5 and 5, with negative indicating negative sentiment, and 5 positive.
The function get_sentiments() allows us to get the specific sentiments with the measures for each one.
installed.packages("textdata")
## Package LibPath Version Priority Depends Imports LinkingTo Suggests
## Enhances License License_is_FOSS License_restricts_use OS_type Archs
## MD5sum NeedsCompilation Built
library(tidytext)
library(textdata)
afinn <- read.csv("afinn.csv")
afinn
## word value
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## 11 abilities 2
## 12 ability 2
## 13 aboard 1
## 14 absentee -1
## 15 absentees -1
## 16 absolve 2
## 17 absolved 2
## 18 absolves 2
## 19 absolving 2
## 20 absorbed 1
## 21 abuse -3
## 22 abused -3
## 23 abuses -3
## 24 abusive -3
## 25 accept 1
## 26 accepted 1
## 27 accepting 1
## 28 accepts 1
## 29 accident -2
## 30 accidental -2
## 31 accidentally -2
## 32 accidents -2
## 33 accomplish 2
## 34 accomplished 2
## 35 accomplishes 2
## 36 accusation -2
## 37 accusations -2
## 38 accuse -2
## 39 accused -2
## 40 accuses -2
## 41 accusing -2
## 42 ache -2
## 43 achievable 1
## 44 aching -2
## 45 acquit 2
## 46 acquits 2
## 47 acquitted 2
## 48 acquitting 2
## 49 acrimonious -3
## 50 active 1
## 51 adequate 1
## 52 admire 3
## 53 admired 3
## 54 admires 3
## 55 admiring 3
## 56 admit -1
## 57 admits -1
## 58 admitted -1
## 59 admonish -2
## 60 admonished -2
## 61 adopt 1
## 62 adopts 1
## 63 adorable 3
## 64 adore 3
## 65 adored 3
## 66 adores 3
## 67 advanced 1
## 68 advantage 2
## 69 advantages 2
## 70 adventure 2
## 71 adventures 2
## 72 adventurous 2
## 73 affected -1
## 74 affection 3
## 75 affectionate 3
## 76 afflicted -1
## 77 affronted -1
## 78 afraid -2
## 79 aggravate -2
## 80 aggravated -2
## 81 aggravates -2
## 82 aggravating -2
## 83 aggression -2
## 84 aggressions -2
## 85 aggressive -2
## 86 aghast -2
## 87 agog 2
## 88 agonise -3
## 89 agonised -3
## 90 agonises -3
## 91 agonising -3
## 92 agonize -3
## 93 agonized -3
## 94 agonizes -3
## 95 agonizing -3
## 96 agree 1
## 97 agreeable 2
## 98 agreed 1
## 99 agreement 1
## 100 agrees 1
## 101 alarm -2
## 102 alarmed -2
## 103 alarmist -2
## 104 alarmists -2
## 105 alas -1
## 106 alert -1
## 107 alienation -2
## 108 alive 1
## 109 allergic -2
## 110 allow 1
## 111 alone -2
## 112 amaze 2
## 113 amazed 2
## 114 amazes 2
## 115 amazing 4
## 116 ambitious 2
## 117 ambivalent -1
## 118 amuse 3
## 119 amused 3
## 120 amusement 3
## 121 amusements 3
## 122 anger -3
## 123 angers -3
## 124 angry -3
## 125 anguish -3
## 126 anguished -3
## 127 animosity -2
## 128 annoy -2
## 129 annoyance -2
## 130 annoyed -2
## 131 annoying -2
## 132 annoys -2
## 133 antagonistic -2
## 134 anti -1
## 135 anticipation 1
## 136 anxiety -2
## 137 anxious -2
## 138 apathetic -3
## 139 apathy -3
## 140 apeshit -3
## 141 apocalyptic -2
## 142 apologise -1
## 143 apologised -1
## 144 apologises -1
## 145 apologising -1
## 146 apologize -1
## 147 apologized -1
## 148 apologizes -1
## 149 apologizing -1
## 150 apology -1
## 151 appalled -2
## 152 appalling -2
## 153 appease 2
## 154 appeased 2
## 155 appeases 2
## 156 appeasing 2
## 157 applaud 2
## 158 applauded 2
## 159 applauding 2
## 160 applauds 2
## 161 applause 2
## 162 appreciate 2
## 163 appreciated 2
## 164 appreciates 2
## 165 appreciating 2
## 166 appreciation 2
## 167 apprehensive -2
## 168 approval 2
## 169 approved 2
## 170 approves 2
## 171 ardent 1
## 172 arrest -2
## 173 arrested -3
## 174 arrests -2
## 175 arrogant -2
## 176 ashame -2
## 177 ashamed -2
## 178 ass -4
## 179 assassination -3
## 180 assassinations -3
## 181 asset 2
## 182 assets 2
## 183 assfucking -4
## 184 asshole -4
## 185 astonished 2
## 186 astound 3
## 187 astounded 3
## 188 astounding 3
## 189 astoundingly 3
## 190 astounds 3
## 191 attack -1
## 192 attacked -1
## 193 attacking -1
## 194 attacks -1
## 195 attract 1
## 196 attracted 1
## 197 attracting 2
## 198 attraction 2
## 199 attractions 2
## 200 attracts 1
## 201 audacious 3
## 202 authority 1
## 203 avert -1
## 204 averted -1
## 205 averts -1
## 206 avid 2
## 207 avoid -1
## 208 avoided -1
## 209 avoids -1
## 210 await -1
## 211 awaited -1
## 212 awaits -1
## 213 award 3
## 214 awarded 3
## 215 awards 3
## 216 awesome 4
## 217 awful -3
## 218 awkward -2
## 219 axe -1
## 220 axed -1
## 221 backed 1
## 222 backing 2
## 223 backs 1
## 224 bad -3
## 225 badass -3
## 226 badly -3
## 227 bailout -2
## 228 bamboozle -2
## 229 bamboozled -2
## 230 bamboozles -2
## 231 ban -2
## 232 banish -1
## 233 bankrupt -3
## 234 bankster -3
## 235 banned -2
## 236 bargain 2
## 237 barrier -2
## 238 bastard -5
## 239 bastards -5
## 240 battle -1
## 241 battles -1
## 242 beaten -2
## 243 beatific 3
## 244 beating -1
## 245 beauties 3
## 246 beautiful 3
## 247 beautifully 3
## 248 beautify 3
## 249 belittle -2
## 250 belittled -2
## 251 beloved 3
## 252 benefit 2
## 253 benefits 2
## 254 benefitted 2
## 255 benefitting 2
## 256 bereave -2
## 257 bereaved -2
## 258 bereaves -2
## 259 bereaving -2
## 260 best 3
## 261 betray -3
## 262 betrayal -3
## 263 betrayed -3
## 264 betraying -3
## 265 betrays -3
## 266 better 2
## 267 bias -1
## 268 biased -2
## 269 big 1
## 270 bitch -5
## 271 bitches -5
## 272 bitter -2
## 273 bitterly -2
## 274 bizarre -2
## 275 blah -2
## 276 blame -2
## 277 blamed -2
## 278 blames -2
## 279 blaming -2
## 280 bless 2
## 281 blesses 2
## 282 blessing 3
## 283 blind -1
## 284 bliss 3
## 285 blissful 3
## 286 blithe 2
## 287 block -1
## 288 blockbuster 3
## 289 blocked -1
## 290 blocking -1
## 291 blocks -1
## 292 bloody -3
## 293 blurry -2
## 294 boastful -2
## 295 bold 2
## 296 boldly 2
## 297 bomb -1
## 298 boost 1
## 299 boosted 1
## 300 boosting 1
## 301 boosts 1
## 302 bore -2
## 303 bored -2
## 304 boring -3
## 305 bother -2
## 306 bothered -2
## 307 bothers -2
## 308 bothersome -2
## 309 boycott -2
## 310 boycotted -2
## 311 boycotting -2
## 312 boycotts -2
## 313 brainwashing -3
## 314 brave 2
## 315 breakthrough 3
## 316 breathtaking 5
## 317 bribe -3
## 318 bright 1
## 319 brightest 2
## 320 brightness 1
## 321 brilliant 4
## 322 brisk 2
## 323 broke -1
## 324 broken -1
## 325 brooding -2
## 326 bullied -2
## 327 bullshit -4
## 328 bully -2
## 329 bullying -2
## 330 bummer -2
## 331 buoyant 2
## 332 burden -2
## 333 burdened -2
## 334 burdening -2
## 335 burdens -2
## 336 calm 2
## 337 calmed 2
## 338 calming 2
## 339 calms 2
## 340 can't stand -3
## 341 cancel -1
## 342 cancelled -1
## 343 cancelling -1
## 344 cancels -1
## 345 cancer -1
## 346 capable 1
## 347 captivated 3
## 348 care 2
## 349 carefree 1
## 350 careful 2
## 351 carefully 2
## 352 careless -2
## 353 cares 2
## 354 cashing in -2
## 355 casualty -2
## 356 catastrophe -3
## 357 catastrophic -4
## 358 cautious -1
## 359 celebrate 3
## 360 celebrated 3
## 361 celebrates 3
## 362 celebrating 3
## 363 censor -2
## 364 censored -2
## 365 censors -2
## 366 certain 1
## 367 chagrin -2
## 368 chagrined -2
## 369 challenge -1
## 370 chance 2
## 371 chances 2
## 372 chaos -2
## 373 chaotic -2
## 374 charged -3
## 375 charges -2
## 376 charm 3
## 377 charming 3
## 378 charmless -3
## 379 chastise -3
## 380 chastised -3
## 381 chastises -3
## 382 chastising -3
## 383 cheat -3
## 384 cheated -3
## 385 cheater -3
## 386 cheaters -3
## 387 cheats -3
## 388 cheer 2
## 389 cheered 2
## 390 cheerful 2
## 391 cheering 2
## 392 cheerless -2
## 393 cheers 2
## 394 cheery 3
## 395 cherish 2
## 396 cherished 2
## 397 cherishes 2
## 398 cherishing 2
## 399 chic 2
## 400 childish -2
## 401 chilling -1
## 402 choke -2
## 403 choked -2
## 404 chokes -2
## 405 choking -2
## 406 clarifies 2
## 407 clarity 2
## 408 clash -2
## 409 classy 3
## 410 clean 2
## 411 cleaner 2
## 412 clear 1
## 413 cleared 1
## 414 clearly 1
## 415 clears 1
## 416 clever 2
## 417 clouded -1
## 418 clueless -2
## 419 cock -5
## 420 cocksucker -5
## 421 cocksuckers -5
## 422 cocky -2
## 423 coerced -2
## 424 collapse -2
## 425 collapsed -2
## 426 collapses -2
## 427 collapsing -2
## 428 collide -1
## 429 collides -1
## 430 colliding -1
## 431 collision -2
## 432 collisions -2
## 433 colluding -3
## 434 combat -1
## 435 combats -1
## 436 comedy 1
## 437 comfort 2
## 438 comfortable 2
## 439 comforting 2
## 440 comforts 2
## 441 commend 2
## 442 commended 2
## 443 commit 1
## 444 commitment 2
## 445 commits 1
## 446 committed 1
## 447 committing 1
## 448 compassionate 2
## 449 compelled 1
## 450 competent 2
## 451 competitive 2
## 452 complacent -2
## 453 complain -2
## 454 complained -2
## 455 complains -2
## 456 comprehensive 2
## 457 conciliate 2
## 458 conciliated 2
## 459 conciliates 2
## 460 conciliating 2
## 461 condemn -2
## 462 condemnation -2
## 463 condemned -2
## 464 condemns -2
## 465 confidence 2
## 466 confident 2
## 467 conflict -2
## 468 conflicting -2
## 469 conflictive -2
## 470 conflicts -2
## 471 confuse -2
## 472 confused -2
## 473 confusing -2
## 474 congrats 2
## 475 congratulate 2
## 476 congratulation 2
## 477 congratulations 2
## 478 consent 2
## 479 consents 2
## 480 consolable 2
## 481 conspiracy -3
## 482 constrained -2
## 483 contagion -2
## 484 contagions -2
## 485 contagious -1
## 486 contempt -2
## 487 contemptuous -2
## 488 contemptuously -2
## 489 contend -1
## 490 contender -1
## 491 contending -1
## 492 contentious -2
## 493 contestable -2
## 494 controversial -2
## 495 controversially -2
## 496 convince 1
## 497 convinced 1
## 498 convinces 1
## 499 convivial 2
## 500 cool 1
## 501 cool stuff 3
## 502 cornered -2
## 503 corpse -1
## 504 costly -2
## 505 courage 2
## 506 courageous 2
## 507 courteous 2
## 508 courtesy 2
## 509 cover-up -3
## 510 coward -2
## 511 cowardly -2
## 512 coziness 2
## 513 cramp -1
## 514 crap -3
## 515 crash -2
## 516 crazier -2
## 517 craziest -2
## 518 crazy -2
## 519 creative 2
## 520 crestfallen -2
## 521 cried -2
## 522 cries -2
## 523 crime -3
## 524 criminal -3
## 525 criminals -3
## 526 crisis -3
## 527 critic -2
## 528 criticism -2
## 529 criticize -2
## 530 criticized -2
## 531 criticizes -2
## 532 criticizing -2
## 533 critics -2
## 534 cruel -3
## 535 cruelty -3
## 536 crush -1
## 537 crushed -2
## 538 crushes -1
## 539 crushing -1
## 540 cry -1
## 541 crying -2
## 542 cunt -5
## 543 curious 1
## 544 curse -1
## 545 cut -1
## 546 cute 2
## 547 cuts -1
## 548 cutting -1
## 549 cynic -2
## 550 cynical -2
## 551 cynicism -2
## 552 damage -3
## 553 damages -3
## 554 damn -4
## 555 damned -4
## 556 damnit -4
## 557 danger -2
## 558 daredevil 2
## 559 daring 2
## 560 darkest -2
## 561 darkness -1
## 562 dauntless 2
## 563 dead -3
## 564 deadlock -2
## 565 deafening -1
## 566 dear 2
## 567 dearly 3
## 568 death -2
## 569 debonair 2
## 570 debt -2
## 571 deceit -3
## 572 deceitful -3
## 573 deceive -3
## 574 deceived -3
## 575 deceives -3
## 576 deceiving -3
## 577 deception -3
## 578 decisive 1
## 579 dedicated 2
## 580 defeated -2
## 581 defect -3
## 582 defects -3
## 583 defender 2
## 584 defenders 2
## 585 defenseless -2
## 586 defer -1
## 587 deferring -1
## 588 defiant -1
## 589 deficit -2
## 590 degrade -2
## 591 degraded -2
## 592 degrades -2
## 593 dehumanize -2
## 594 dehumanized -2
## 595 dehumanizes -2
## 596 dehumanizing -2
## 597 deject -2
## 598 dejected -2
## 599 dejecting -2
## 600 dejects -2
## 601 delay -1
## 602 delayed -1
## 603 delight 3
## 604 delighted 3
## 605 delighting 3
## 606 delights 3
## 607 demand -1
## 608 demanded -1
## 609 demanding -1
## 610 demands -1
## 611 demonstration -1
## 612 demoralized -2
## 613 denied -2
## 614 denier -2
## 615 deniers -2
## 616 denies -2
## 617 denounce -2
## 618 denounces -2
## 619 deny -2
## 620 denying -2
## 621 depressed -2
## 622 depressing -2
## 623 derail -2
## 624 derailed -2
## 625 derails -2
## 626 deride -2
## 627 derided -2
## 628 derides -2
## 629 deriding -2
## 630 derision -2
## 631 desirable 2
## 632 desire 1
## 633 desired 2
## 634 desirous 2
## 635 despair -3
## 636 despairing -3
## 637 despairs -3
## 638 desperate -3
## 639 desperately -3
## 640 despondent -3
## 641 destroy -3
## 642 destroyed -3
## 643 destroying -3
## 644 destroys -3
## 645 destruction -3
## 646 destructive -3
## 647 detached -1
## 648 detain -2
## 649 detained -2
## 650 detention -2
## 651 determined 2
## 652 devastate -2
## 653 devastated -2
## 654 devastating -2
## 655 devoted 3
## 656 diamond 1
## 657 dick -4
## 658 dickhead -4
## 659 die -3
## 660 died -3
## 661 difficult -1
## 662 diffident -2
## 663 dilemma -1
## 664 dipshit -3
## 665 dire -3
## 666 direful -3
## 667 dirt -2
## 668 dirtier -2
## 669 dirtiest -2
## 670 dirty -2
## 671 disabling -1
## 672 disadvantage -2
## 673 disadvantaged -2
## 674 disappear -1
## 675 disappeared -1
## 676 disappears -1
## 677 disappoint -2
## 678 disappointed -2
## 679 disappointing -2
## 680 disappointment -2
## 681 disappointments -2
## 682 disappoints -2
## 683 disaster -2
## 684 disasters -2
## 685 disastrous -3
## 686 disbelieve -2
## 687 discard -1
## 688 discarded -1
## 689 discarding -1
## 690 discards -1
## 691 disconsolate -2
## 692 disconsolation -2
## 693 discontented -2
## 694 discord -2
## 695 discounted -1
## 696 discouraged -2
## 697 discredited -2
## 698 disdain -2
## 699 disgrace -2
## 700 disgraced -2
## 701 disguise -1
## 702 disguised -1
## 703 disguises -1
## 704 disguising -1
## 705 disgust -3
## 706 disgusted -3
## 707 disgusting -3
## 708 disheartened -2
## 709 dishonest -2
## 710 disillusioned -2
## 711 disinclined -2
## 712 disjointed -2
## 713 dislike -2
## 714 dismal -2
## 715 dismayed -2
## 716 disorder -2
## 717 disorganized -2
## 718 disoriented -2
## 719 disparage -2
## 720 disparaged -2
## 721 disparages -2
## 722 disparaging -2
## 723 displeased -2
## 724 dispute -2
## 725 disputed -2
## 726 disputes -2
## 727 disputing -2
## 728 disqualified -2
## 729 disquiet -2
## 730 disregard -2
## 731 disregarded -2
## 732 disregarding -2
## 733 disregards -2
## 734 disrespect -2
## 735 disrespected -2
## 736 disruption -2
## 737 disruptions -2
## 738 disruptive -2
## 739 dissatisfied -2
## 740 distort -2
## 741 distorted -2
## 742 distorting -2
## 743 distorts -2
## 744 distract -2
## 745 distracted -2
## 746 distraction -2
## 747 distracts -2
## 748 distress -2
## 749 distressed -2
## 750 distresses -2
## 751 distressing -2
## 752 distrust -3
## 753 distrustful -3
## 754 disturb -2
## 755 disturbed -2
## 756 disturbing -2
## 757 disturbs -2
## 758 dithering -2
## 759 dizzy -1
## 760 dodging -2
## 761 dodgy -2
## 762 does not work -3
## 763 dolorous -2
## 764 dont like -2
## 765 doom -2
## 766 doomed -2
## 767 doubt -1
## 768 doubted -1
## 769 doubtful -1
## 770 doubting -1
## 771 doubts -1
## 772 douche -3
## 773 douchebag -3
## 774 downcast -2
## 775 downhearted -2
## 776 downside -2
## 777 drag -1
## 778 dragged -1
## 779 drags -1
## 780 drained -2
## 781 dread -2
## 782 dreaded -2
## 783 dreadful -3
## 784 dreading -2
## 785 dream 1
## 786 dreams 1
## 787 dreary -2
## 788 droopy -2
## 789 drop -1
## 790 drown -2
## 791 drowned -2
## 792 drowns -2
## 793 drunk -2
## 794 dubious -2
## 795 dud -2
## 796 dull -2
## 797 dumb -3
## 798 dumbass -3
## 799 dump -1
## 800 dumped -2
## 801 dumps -1
## 802 dupe -2
## 803 duped -2
## 804 dysfunction -2
## 805 eager 2
## 806 earnest 2
## 807 ease 2
## 808 easy 1
## 809 ecstatic 4
## 810 eerie -2
## 811 eery -2
## 812 effective 2
## 813 effectively 2
## 814 elated 3
## 815 elation 3
## 816 elegant 2
## 817 elegantly 2
## 818 embarrass -2
## 819 embarrassed -2
## 820 embarrasses -2
## 821 embarrassing -2
## 822 embarrassment -2
## 823 embittered -2
## 824 embrace 1
## 825 emergency -2
## 826 empathetic 2
## 827 emptiness -1
## 828 empty -1
## 829 enchanted 2
## 830 encourage 2
## 831 encouraged 2
## 832 encouragement 2
## 833 encourages 2
## 834 endorse 2
## 835 endorsed 2
## 836 endorsement 2
## 837 endorses 2
## 838 enemies -2
## 839 enemy -2
## 840 energetic 2
## 841 engage 1
## 842 engages 1
## 843 engrossed 1
## 844 enjoy 2
## 845 enjoying 2
## 846 enjoys 2
## 847 enlighten 2
## 848 enlightened 2
## 849 enlightening 2
## 850 enlightens 2
## 851 ennui -2
## 852 enrage -2
## 853 enraged -2
## 854 enrages -2
## 855 enraging -2
## 856 enrapture 3
## 857 enslave -2
## 858 enslaved -2
## 859 enslaves -2
## 860 ensure 1
## 861 ensuring 1
## 862 enterprising 1
## 863 entertaining 2
## 864 enthral 3
## 865 enthusiastic 3
## 866 entitled 1
## 867 entrusted 2
## 868 envies -1
## 869 envious -2
## 870 envy -1
## 871 envying -1
## 872 erroneous -2
## 873 error -2
## 874 errors -2
## 875 escape -1
## 876 escapes -1
## 877 escaping -1
## 878 esteemed 2
## 879 ethical 2
## 880 euphoria 3
## 881 euphoric 4
## 882 eviction -1
## 883 evil -3
## 884 exaggerate -2
## 885 exaggerated -2
## 886 exaggerates -2
## 887 exaggerating -2
## 888 exasperated 2
## 889 excellence 3
## 890 excellent 3
## 891 excite 3
## 892 excited 3
## 893 excitement 3
## 894 exciting 3
## 895 exclude -1
## 896 excluded -2
## 897 exclusion -1
## 898 exclusive 2
## 899 excuse -1
## 900 exempt -1
## 901 exhausted -2
## 902 exhilarated 3
## 903 exhilarates 3
## 904 exhilarating 3
## 905 exonerate 2
## 906 exonerated 2
## 907 exonerates 2
## 908 exonerating 2
## 909 expand 1
## 910 expands 1
## 911 expel -2
## 912 expelled -2
## 913 expelling -2
## 914 expels -2
## 915 exploit -2
## 916 exploited -2
## 917 exploiting -2
## 918 exploits -2
## 919 exploration 1
## 920 explorations 1
## 921 expose -1
## 922 exposed -1
## 923 exposes -1
## 924 exposing -1
## 925 extend 1
## 926 extends 1
## 927 exuberant 4
## 928 exultant 3
## 929 exultantly 3
## 930 fabulous 4
## 931 fad -2
## 932 fag -3
## 933 faggot -3
## 934 faggots -3
## 935 fail -2
## 936 failed -2
## 937 failing -2
## 938 fails -2
## 939 failure -2
## 940 failures -2
## 941 fainthearted -2
## 942 fair 2
## 943 faith 1
## 944 faithful 3
## 945 fake -3
## 946 fakes -3
## 947 faking -3
## 948 fallen -2
## 949 falling -1
## 950 falsified -3
## 951 falsify -3
## 952 fame 1
## 953 fan 3
## 954 fantastic 4
## 955 farce -1
## 956 fascinate 3
## 957 fascinated 3
## 958 fascinates 3
## 959 fascinating 3
## 960 fascist -2
## 961 fascists -2
## 962 fatalities -3
## 963 fatality -3
## 964 fatigue -2
## 965 fatigued -2
## 966 fatigues -2
## 967 fatiguing -2
## 968 favor 2
## 969 favored 2
## 970 favorite 2
## 971 favorited 2
## 972 favorites 2
## 973 favors 2
## 974 fear -2
## 975 fearful -2
## 976 fearing -2
## 977 fearless 2
## 978 fearsome -2
## 979 fed up -3
## 980 feeble -2
## 981 feeling 1
## 982 felonies -3
## 983 felony -3
## 984 fervent 2
## 985 fervid 2
## 986 festive 2
## 987 fiasco -3
## 988 fidgety -2
## 989 fight -1
## 990 fine 2
## 991 fire -2
## 992 fired -2
## 993 firing -2
## 994 fit 1
## 995 fitness 1
## 996 flagship 2
## 997 flees -1
## 998 flop -2
## 999 flops -2
## 1000 flu -2
## 1001 flustered -2
## 1002 focused 2
## 1003 fond 2
## 1004 fondness 2
## 1005 fool -2
## 1006 foolish -2
## 1007 fools -2
## 1008 forced -1
## 1009 foreclosure -2
## 1010 foreclosures -2
## 1011 forget -1
## 1012 forgetful -2
## 1013 forgive 1
## 1014 forgiving 1
## 1015 forgotten -1
## 1016 fortunate 2
## 1017 frantic -1
## 1018 fraud -4
## 1019 frauds -4
## 1020 fraudster -4
## 1021 fraudsters -4
## 1022 fraudulence -4
## 1023 fraudulent -4
## 1024 free 1
## 1025 freedom 2
## 1026 frenzy -3
## 1027 fresh 1
## 1028 friendly 2
## 1029 fright -2
## 1030 frightened -2
## 1031 frightening -3
## 1032 frikin -2
## 1033 frisky 2
## 1034 frowning -1
## 1035 frustrate -2
## 1036 frustrated -2
## 1037 frustrates -2
## 1038 frustrating -2
## 1039 frustration -2
## 1040 ftw 3
## 1041 fuck -4
## 1042 fucked -4
## 1043 fucker -4
## 1044 fuckers -4
## 1045 fuckface -4
## 1046 fuckhead -4
## 1047 fucking -4
## 1048 fucktard -4
## 1049 fud -3
## 1050 fuked -4
## 1051 fuking -4
## 1052 fulfill 2
## 1053 fulfilled 2
## 1054 fulfills 2
## 1055 fuming -2
## 1056 fun 4
## 1057 funeral -1
## 1058 funerals -1
## 1059 funky 2
## 1060 funnier 4
## 1061 funny 4
## 1062 furious -3
## 1063 futile 2
## 1064 gag -2
## 1065 gagged -2
## 1066 gain 2
## 1067 gained 2
## 1068 gaining 2
## 1069 gains 2
## 1070 gallant 3
## 1071 gallantly 3
## 1072 gallantry 3
## 1073 generous 2
## 1074 genial 3
## 1075 ghost -1
## 1076 giddy -2
## 1077 gift 2
## 1078 glad 3
## 1079 glamorous 3
## 1080 glamourous 3
## 1081 glee 3
## 1082 gleeful 3
## 1083 gloom -1
## 1084 gloomy -2
## 1085 glorious 2
## 1086 glory 2
## 1087 glum -2
## 1088 god 1
## 1089 goddamn -3
## 1090 godsend 4
## 1091 good 3
## 1092 goodness 3
## 1093 grace 1
## 1094 gracious 3
## 1095 grand 3
## 1096 grant 1
## 1097 granted 1
## 1098 granting 1
## 1099 grants 1
## 1100 grateful 3
## 1101 gratification 2
## 1102 grave -2
## 1103 gray -1
## 1104 great 3
## 1105 greater 3
## 1106 greatest 3
## 1107 greed -3
## 1108 greedy -2
## 1109 green wash -3
## 1110 green washing -3
## 1111 greenwash -3
## 1112 greenwasher -3
## 1113 greenwashers -3
## 1114 greenwashing -3
## 1115 greet 1
## 1116 greeted 1
## 1117 greeting 1
## 1118 greetings 2
## 1119 greets 1
## 1120 grey -1
## 1121 grief -2
## 1122 grieved -2
## 1123 gross -2
## 1124 growing 1
## 1125 growth 2
## 1126 guarantee 1
## 1127 guilt -3
## 1128 guilty -3
## 1129 gullibility -2
## 1130 gullible -2
## 1131 gun -1
## 1132 ha 2
## 1133 hacked -1
## 1134 haha 3
## 1135 hahaha 3
## 1136 hahahah 3
## 1137 hail 2
## 1138 hailed 2
## 1139 hapless -2
## 1140 haplessness -2
## 1141 happiness 3
## 1142 happy 3
## 1143 hard -1
## 1144 hardier 2
## 1145 hardship -2
## 1146 hardy 2
## 1147 harm -2
## 1148 harmed -2
## 1149 harmful -2
## 1150 harming -2
## 1151 harms -2
## 1152 harried -2
## 1153 harsh -2
## 1154 harsher -2
## 1155 harshest -2
## 1156 hate -3
## 1157 hated -3
## 1158 haters -3
## 1159 hates -3
## 1160 hating -3
## 1161 haunt -1
## 1162 haunted -2
## 1163 haunting 1
## 1164 haunts -1
## 1165 havoc -2
## 1166 healthy 2
## 1167 heartbreaking -3
## 1168 heartbroken -3
## 1169 heartfelt 3
## 1170 heaven 2
## 1171 heavenly 4
## 1172 heavyhearted -2
## 1173 hell -4
## 1174 help 2
## 1175 helpful 2
## 1176 helping 2
## 1177 helpless -2
## 1178 helps 2
## 1179 hero 2
## 1180 heroes 2
## 1181 heroic 3
## 1182 hesitant -2
## 1183 hesitate -2
## 1184 hid -1
## 1185 hide -1
## 1186 hides -1
## 1187 hiding -1
## 1188 highlight 2
## 1189 hilarious 2
## 1190 hindrance -2
## 1191 hoax -2
## 1192 homesick -2
## 1193 honest 2
## 1194 honor 2
## 1195 honored 2
## 1196 honoring 2
## 1197 honour 2
## 1198 honoured 2
## 1199 honouring 2
## 1200 hooligan -2
## 1201 hooliganism -2
## 1202 hooligans -2
## 1203 hope 2
## 1204 hopeful 2
## 1205 hopefully 2
## 1206 hopeless -2
## 1207 hopelessness -2
## 1208 hopes 2
## 1209 hoping 2
## 1210 horrendous -3
## 1211 horrible -3
## 1212 horrific -3
## 1213 horrified -3
## 1214 hostile -2
## 1215 huckster -2
## 1216 hug 2
## 1217 huge 1
## 1218 hugs 2
## 1219 humerous 3
## 1220 humiliated -3
## 1221 humiliation -3
## 1222 humor 2
## 1223 humorous 2
## 1224 humour 2
## 1225 humourous 2
## 1226 hunger -2
## 1227 hurrah 5
## 1228 hurt -2
## 1229 hurting -2
## 1230 hurts -2
## 1231 hypocritical -2
## 1232 hysteria -3
## 1233 hysterical -3
## 1234 hysterics -3
## 1235 idiot -3
## 1236 idiotic -3
## 1237 ignorance -2
## 1238 ignorant -2
## 1239 ignore -1
## 1240 ignored -2
## 1241 ignores -1
## 1242 ill -2
## 1243 illegal -3
## 1244 illiteracy -2
## 1245 illness -2
## 1246 illnesses -2
## 1247 imbecile -3
## 1248 immobilized -1
## 1249 immortal 2
## 1250 immune 1
## 1251 impatient -2
## 1252 imperfect -2
## 1253 importance 2
## 1254 important 2
## 1255 impose -1
## 1256 imposed -1
## 1257 imposes -1
## 1258 imposing -1
## 1259 impotent -2
## 1260 impress 3
## 1261 impressed 3
## 1262 impresses 3
## 1263 impressive 3
## 1264 imprisoned -2
## 1265 improve 2
## 1266 improved 2
## 1267 improvement 2
## 1268 improves 2
## 1269 improving 2
## 1270 inability -2
## 1271 inaction -2
## 1272 inadequate -2
## 1273 incapable -2
## 1274 incapacitated -2
## 1275 incensed -2
## 1276 incompetence -2
## 1277 incompetent -2
## 1278 inconsiderate -2
## 1279 inconvenience -2
## 1280 inconvenient -2
## 1281 increase 1
## 1282 increased 1
## 1283 indecisive -2
## 1284 indestructible 2
## 1285 indifference -2
## 1286 indifferent -2
## 1287 indignant -2
## 1288 indignation -2
## 1289 indoctrinate -2
## 1290 indoctrinated -2
## 1291 indoctrinates -2
## 1292 indoctrinating -2
## 1293 ineffective -2
## 1294 ineffectively -2
## 1295 infatuated 2
## 1296 infatuation 2
## 1297 infected -2
## 1298 inferior -2
## 1299 inflamed -2
## 1300 influential 2
## 1301 infringement -2
## 1302 infuriate -2
## 1303 infuriated -2
## 1304 infuriates -2
## 1305 infuriating -2
## 1306 inhibit -1
## 1307 injured -2
## 1308 injury -2
## 1309 injustice -2
## 1310 innovate 1
## 1311 innovates 1
## 1312 innovation 1
## 1313 innovative 2
## 1314 inquisition -2
## 1315 inquisitive 2
## 1316 insane -2
## 1317 insanity -2
## 1318 insecure -2
## 1319 insensitive -2
## 1320 insensitivity -2
## 1321 insignificant -2
## 1322 insipid -2
## 1323 inspiration 2
## 1324 inspirational 2
## 1325 inspire 2
## 1326 inspired 2
## 1327 inspires 2
## 1328 inspiring 3
## 1329 insult -2
## 1330 insulted -2
## 1331 insulting -2
## 1332 insults -2
## 1333 intact 2
## 1334 integrity 2
## 1335 intelligent 2
## 1336 intense 1
## 1337 interest 1
## 1338 interested 2
## 1339 interesting 2
## 1340 interests 1
## 1341 interrogated -2
## 1342 interrupt -2
## 1343 interrupted -2
## 1344 interrupting -2
## 1345 interruption -2
## 1346 interrupts -2
## 1347 intimidate -2
## 1348 intimidated -2
## 1349 intimidates -2
## 1350 intimidating -2
## 1351 intimidation -2
## 1352 intricate 2
## 1353 intrigues 1
## 1354 invincible 2
## 1355 invite 1
## 1356 inviting 1
## 1357 invulnerable 2
## 1358 irate -3
## 1359 ironic -1
## 1360 irony -1
## 1361 irrational -1
## 1362 irresistible 2
## 1363 irresolute -2
## 1364 irresponsible 2
## 1365 irreversible -1
## 1366 irritate -3
## 1367 irritated -3
## 1368 irritating -3
## 1369 isolated -1
## 1370 itchy -2
## 1371 jackass -4
## 1372 jackasses -4
## 1373 jailed -2
## 1374 jaunty 2
## 1375 jealous -2
## 1376 jeopardy -2
## 1377 jerk -3
## 1378 jesus 1
## 1379 jewel 1
## 1380 jewels 1
## 1381 jocular 2
## 1382 join 1
## 1383 joke 2
## 1384 jokes 2
## 1385 jolly 2
## 1386 jovial 2
## 1387 joy 3
## 1388 joyful 3
## 1389 joyfully 3
## 1390 joyless -2
## 1391 joyous 3
## 1392 jubilant 3
## 1393 jumpy -1
## 1394 justice 2
## 1395 justifiably 2
## 1396 justified 2
## 1397 keen 1
## 1398 kill -3
## 1399 killed -3
## 1400 killing -3
## 1401 kills -3
## 1402 kind 2
## 1403 kinder 2
## 1404 kiss 2
## 1405 kudos 3
## 1406 lack -2
## 1407 lackadaisical -2
## 1408 lag -1
## 1409 lagged -2
## 1410 lagging -2
## 1411 lags -2
## 1412 lame -2
## 1413 landmark 2
## 1414 laugh 1
## 1415 laughed 1
## 1416 laughing 1
## 1417 laughs 1
## 1418 laughting 1
## 1419 launched 1
## 1420 lawl 3
## 1421 lawsuit -2
## 1422 lawsuits -2
## 1423 lazy -1
## 1424 leak -1
## 1425 leaked -1
## 1426 leave -1
## 1427 legal 1
## 1428 legally 1
## 1429 lenient 1
## 1430 lethargic -2
## 1431 lethargy -2
## 1432 liar -3
## 1433 liars -3
## 1434 libelous -2
## 1435 lied -2
## 1436 lifesaver 4
## 1437 lighthearted 1
## 1438 like 2
## 1439 liked 2
## 1440 likes 2
## 1441 limitation -1
## 1442 limited -1
## 1443 limits -1
## 1444 litigation -1
## 1445 litigious -2
## 1446 lively 2
## 1447 livid -2
## 1448 lmao 4
## 1449 lmfao 4
## 1450 loathe -3
## 1451 loathed -3
## 1452 loathes -3
## 1453 loathing -3
## 1454 lobby -2
## 1455 lobbying -2
## 1456 lol 3
## 1457 lonely -2
## 1458 lonesome -2
## 1459 longing -1
## 1460 loom -1
## 1461 loomed -1
## 1462 looming -1
## 1463 looms -1
## 1464 loose -3
## 1465 looses -3
## 1466 loser -3
## 1467 losing -3
## 1468 loss -3
## 1469 lost -3
## 1470 lovable 3
## 1471 love 3
## 1472 loved 3
## 1473 lovelies 3
## 1474 lovely 3
## 1475 loving 2
## 1476 lowest -1
## 1477 loyal 3
## 1478 loyalty 3
## 1479 luck 3
## 1480 luckily 3
## 1481 lucky 3
## 1482 lugubrious -2
## 1483 lunatic -3
## 1484 lunatics -3
## 1485 lurk -1
## 1486 lurking -1
## 1487 lurks -1
## 1488 mad -3
## 1489 maddening -3
## 1490 made-up -1
## 1491 madly -3
## 1492 madness -3
## 1493 mandatory -1
## 1494 manipulated -1
## 1495 manipulating -1
## 1496 manipulation -1
## 1497 marvel 3
## 1498 marvelous 3
## 1499 marvels 3
## 1500 masterpiece 4
## 1501 masterpieces 4
## 1502 matter 1
## 1503 matters 1
## 1504 mature 2
## 1505 meaningful 2
## 1506 meaningless -2
## 1507 medal 3
## 1508 mediocrity -3
## 1509 meditative 1
## 1510 melancholy -2
## 1511 menace -2
## 1512 menaced -2
## 1513 mercy 2
## 1514 merry 3
## 1515 mess -2
## 1516 messed -2
## 1517 messing up -2
## 1518 methodical 2
## 1519 mindless -2
## 1520 miracle 4
## 1521 mirth 3
## 1522 mirthful 3
## 1523 mirthfully 3
## 1524 misbehave -2
## 1525 misbehaved -2
## 1526 misbehaves -2
## 1527 misbehaving -2
## 1528 mischief -1
## 1529 mischiefs -1
## 1530 miserable -3
## 1531 misery -2
## 1532 misgiving -2
## 1533 misinformation -2
## 1534 misinformed -2
## 1535 misinterpreted -2
## 1536 misleading -3
## 1537 misread -1
## 1538 misreporting -2
## 1539 misrepresentation -2
## 1540 miss -2
## 1541 missed -2
## 1542 missing -2
## 1543 mistake -2
## 1544 mistaken -2
## 1545 mistakes -2
## 1546 mistaking -2
## 1547 misunderstand -2
## 1548 misunderstanding -2
## 1549 misunderstands -2
## 1550 misunderstood -2
## 1551 moan -2
## 1552 moaned -2
## 1553 moaning -2
## 1554 moans -2
## 1555 mock -2
## 1556 mocked -2
## 1557 mocking -2
## 1558 mocks -2
## 1559 mongering -2
## 1560 monopolize -2
## 1561 monopolized -2
## 1562 monopolizes -2
## 1563 monopolizing -2
## 1564 moody -1
## 1565 mope -1
## 1566 moping -1
## 1567 moron -3
## 1568 motherfucker -5
## 1569 motherfucking -5
## 1570 motivate 1
## 1571 motivated 2
## 1572 motivating 2
## 1573 motivation 1
## 1574 mourn -2
## 1575 mourned -2
## 1576 mournful -2
## 1577 mourning -2
## 1578 mourns -2
## 1579 mumpish -2
## 1580 murder -2
## 1581 murderer -2
## 1582 murdering -3
## 1583 murderous -3
## 1584 murders -2
## 1585 myth -1
## 1586 n00b -2
## 1587 naive -2
## 1588 nasty -3
## 1589 natural 1
## 1590 na\xefve -2
## 1591 needy -2
## 1592 negative -2
## 1593 negativity -2
## 1594 neglect -2
## 1595 neglected -2
## 1596 neglecting -2
## 1597 neglects -2
## 1598 nerves -1
## 1599 nervous -2
## 1600 nervously -2
## 1601 nice 3
## 1602 nifty 2
## 1603 niggas -5
## 1604 nigger -5
## 1605 no -1
## 1606 no fun -3
## 1607 noble 2
## 1608 noisy -1
## 1609 nonsense -2
## 1610 noob -2
## 1611 nosey -2
## 1612 not good -2
## 1613 not working -3
## 1614 notorious -2
## 1615 novel 2
## 1616 numb -1
## 1617 nuts -3
## 1618 obliterate -2
## 1619 obliterated -2
## 1620 obnoxious -3
## 1621 obscene -2
## 1622 obsessed 2
## 1623 obsolete -2
## 1624 obstacle -2
## 1625 obstacles -2
## 1626 obstinate -2
## 1627 odd -2
## 1628 offend -2
## 1629 offended -2
## 1630 offender -2
## 1631 offending -2
## 1632 offends -2
## 1633 offline -1
## 1634 oks 2
## 1635 ominous 3
## 1636 once-in-a-lifetime 3
## 1637 opportunities 2
## 1638 opportunity 2
## 1639 oppressed -2
## 1640 oppressive -2
## 1641 optimism 2
## 1642 optimistic 2
## 1643 optionless -2
## 1644 outcry -2
## 1645 outmaneuvered -2
## 1646 outrage -3
## 1647 outraged -3
## 1648 outreach 2
## 1649 outstanding 5
## 1650 overjoyed 4
## 1651 overload -1
## 1652 overlooked -1
## 1653 overreact -2
## 1654 overreacted -2
## 1655 overreaction -2
## 1656 overreacts -2
## 1657 oversell -2
## 1658 overselling -2
## 1659 oversells -2
## 1660 oversimplification -2
## 1661 oversimplified -2
## 1662 oversimplifies -2
## 1663 oversimplify -2
## 1664 overstatement -2
## 1665 overstatements -2
## 1666 overweight -1
## 1667 oxymoron -1
## 1668 pain -2
## 1669 pained -2
## 1670 panic -3
## 1671 panicked -3
## 1672 panics -3
## 1673 paradise 3
## 1674 paradox -1
## 1675 pardon 2
## 1676 pardoned 2
## 1677 pardoning 2
## 1678 pardons 2
## 1679 parley -1
## 1680 passionate 2
## 1681 passive -1
## 1682 passively -1
## 1683 pathetic -2
## 1684 pay -1
## 1685 peace 2
## 1686 peaceful 2
## 1687 peacefully 2
## 1688 penalty -2
## 1689 pensive -1
## 1690 perfect 3
## 1691 perfected 2
## 1692 perfectly 3
## 1693 perfects 2
## 1694 peril -2
## 1695 perjury -3
## 1696 perpetrator -2
## 1697 perpetrators -2
## 1698 perplexed -2
## 1699 persecute -2
## 1700 persecuted -2
## 1701 persecutes -2
## 1702 persecuting -2
## 1703 perturbed -2
## 1704 pesky -2
## 1705 pessimism -2
## 1706 pessimistic -2
## 1707 petrified -2
## 1708 phobic -2
## 1709 picturesque 2
## 1710 pileup -1
## 1711 pique -2
## 1712 piqued -2
## 1713 piss -4
## 1714 pissed -4
## 1715 pissing -3
## 1716 piteous -2
## 1717 pitied -1
## 1718 pity -2
## 1719 playful 2
## 1720 pleasant 3
## 1721 please 1
## 1722 pleased 3
## 1723 pleasure 3
## 1724 poised -2
## 1725 poison -2
## 1726 poisoned -2
## 1727 poisons -2
## 1728 pollute -2
## 1729 polluted -2
## 1730 polluter -2
## 1731 polluters -2
## 1732 pollutes -2
## 1733 poor -2
## 1734 poorer -2
## 1735 poorest -2
## 1736 popular 3
## 1737 positive 2
## 1738 positively 2
## 1739 possessive -2
## 1740 postpone -1
## 1741 postponed -1
## 1742 postpones -1
## 1743 postponing -1
## 1744 poverty -1
## 1745 powerful 2
## 1746 powerless -2
## 1747 praise 3
## 1748 praised 3
## 1749 praises 3
## 1750 praising 3
## 1751 pray 1
## 1752 praying 1
## 1753 prays 1
## 1754 prblm -2
## 1755 prblms -2
## 1756 prepared 1
## 1757 pressure -1
## 1758 pressured -2
## 1759 pretend -1
## 1760 pretending -1
## 1761 pretends -1
## 1762 pretty 1
## 1763 prevent -1
## 1764 prevented -1
## 1765 preventing -1
## 1766 prevents -1
## 1767 prick -5
## 1768 prison -2
## 1769 prisoner -2
## 1770 prisoners -2
## 1771 privileged 2
## 1772 proactive 2
## 1773 problem -2
## 1774 problems -2
## 1775 profiteer -2
## 1776 progress 2
## 1777 prominent 2
## 1778 promise 1
## 1779 promised 1
## 1780 promises 1
## 1781 promote 1
## 1782 promoted 1
## 1783 promotes 1
## 1784 promoting 1
## 1785 propaganda -2
## 1786 prosecute -1
## 1787 prosecuted -2
## 1788 prosecutes -1
## 1789 prosecution -1
## 1790 prospect 1
## 1791 prospects 1
## 1792 prosperous 3
## 1793 protect 1
## 1794 protected 1
## 1795 protects 1
## 1796 protest -2
## 1797 protesters -2
## 1798 protesting -2
## 1799 protests -2
## 1800 proud 2
## 1801 proudly 2
## 1802 provoke -1
## 1803 provoked -1
## 1804 provokes -1
## 1805 provoking -1
## 1806 pseudoscience -3
## 1807 punish -2
## 1808 punished -2
## 1809 punishes -2
## 1810 punitive -2
## 1811 pushy -1
## 1812 puzzled -2
## 1813 quaking -2
## 1814 questionable -2
## 1815 questioned -1
## 1816 questioning -1
## 1817 racism -3
## 1818 racist -3
## 1819 racists -3
## 1820 rage -2
## 1821 rageful -2
## 1822 rainy -1
## 1823 rant -3
## 1824 ranter -3
## 1825 ranters -3
## 1826 rants -3
## 1827 rape -4
## 1828 rapist -4
## 1829 rapture 2
## 1830 raptured 2
## 1831 raptures 2
## 1832 rapturous 4
## 1833 rash -2
## 1834 ratified 2
## 1835 reach 1
## 1836 reached 1
## 1837 reaches 1
## 1838 reaching 1
## 1839 reassure 1
## 1840 reassured 1
## 1841 reassures 1
## 1842 reassuring 2
## 1843 rebellion -2
## 1844 recession -2
## 1845 reckless -2
## 1846 recommend 2
## 1847 recommended 2
## 1848 recommends 2
## 1849 redeemed 2
## 1850 refuse -2
## 1851 refused -2
## 1852 refusing -2
## 1853 regret -2
## 1854 regretful -2
## 1855 regrets -2
## 1856 regretted -2
## 1857 regretting -2
## 1858 reject -1
## 1859 rejected -1
## 1860 rejecting -1
## 1861 rejects -1
## 1862 rejoice 4
## 1863 rejoiced 4
## 1864 rejoices 4
## 1865 rejoicing 4
## 1866 relaxed 2
## 1867 relentless -1
## 1868 reliant 2
## 1869 relieve 1
## 1870 relieved 2
## 1871 relieves 1
## 1872 relieving 2
## 1873 relishing 2
## 1874 remarkable 2
## 1875 remorse -2
## 1876 repulse -1
## 1877 repulsed -2
## 1878 rescue 2
## 1879 rescued 2
## 1880 rescues 2
## 1881 resentful -2
## 1882 resign -1
## 1883 resigned -1
## 1884 resigning -1
## 1885 resigns -1
## 1886 resolute 2
## 1887 resolve 2
## 1888 resolved 2
## 1889 resolves 2
## 1890 resolving 2
## 1891 respected 2
## 1892 responsible 2
## 1893 responsive 2
## 1894 restful 2
## 1895 restless -2
## 1896 restore 1
## 1897 restored 1
## 1898 restores 1
## 1899 restoring 1
## 1900 restrict -2
## 1901 restricted -2
## 1902 restricting -2
## 1903 restriction -2
## 1904 restricts -2
## 1905 retained -1
## 1906 retard -2
## 1907 retarded -2
## 1908 retreat -1
## 1909 revenge -2
## 1910 revengeful -2
## 1911 revered 2
## 1912 revive 2
## 1913 revives 2
## 1914 reward 2
## 1915 rewarded 2
## 1916 rewarding 2
## 1917 rewards 2
## 1918 rich 2
## 1919 ridiculous -3
## 1920 rig -1
## 1921 rigged -1
## 1922 right direction 3
## 1923 rigorous 3
## 1924 rigorously 3
## 1925 riot -2
## 1926 riots -2
## 1927 risk -2
## 1928 risks -2
## 1929 rob -2
## 1930 robber -2
## 1931 robed -2
## 1932 robing -2
## 1933 robs -2
## 1934 robust 2
## 1935 rofl 4
## 1936 roflcopter 4
## 1937 roflmao 4
## 1938 romance 2
## 1939 rotfl 4
## 1940 rotflmfao 4
## 1941 rotflol 4
## 1942 ruin -2
## 1943 ruined -2
## 1944 ruining -2
## 1945 ruins -2
## 1946 sabotage -2
## 1947 sad -2
## 1948 sadden -2
## 1949 saddened -2
## 1950 sadly -2
## 1951 safe 1
## 1952 safely 1
## 1953 safety 1
## 1954 salient 1
## 1955 sappy -1
## 1956 sarcastic -2
## 1957 satisfied 2
## 1958 save 2
## 1959 saved 2
## 1960 scam -2
## 1961 scams -2
## 1962 scandal -3
## 1963 scandalous -3
## 1964 scandals -3
## 1965 scapegoat -2
## 1966 scapegoats -2
## 1967 scare -2
## 1968 scared -2
## 1969 scary -2
## 1970 sceptical -2
## 1971 scold -2
## 1972 scoop 3
## 1973 scorn -2
## 1974 scornful -2
## 1975 scream -2
## 1976 screamed -2
## 1977 screaming -2
## 1978 screams -2
## 1979 screwed -2
## 1980 screwed up -3
## 1981 scumbag -4
## 1982 secure 2
## 1983 secured 2
## 1984 secures 2
## 1985 sedition -2
## 1986 seditious -2
## 1987 seduced -1
## 1988 self-confident 2
## 1989 self-deluded -2
## 1990 selfish -3
## 1991 selfishness -3
## 1992 sentence -2
## 1993 sentenced -2
## 1994 sentences -2
## 1995 sentencing -2
## 1996 serene 2
## 1997 severe -2
## 1998 sexy 3
## 1999 shaky -2
## 2000 shame -2
## 2001 shamed -2
## 2002 shameful -2
## 2003 share 1
## 2004 shared 1
## 2005 shares 1
## 2006 shattered -2
## 2007 shit -4
## 2008 shithead -4
## 2009 shitty -3
## 2010 shock -2
## 2011 shocked -2
## 2012 shocking -2
## 2013 shocks -2
## 2014 shoot -1
## 2015 short-sighted -2
## 2016 short-sightedness -2
## 2017 shortage -2
## 2018 shortages -2
## 2019 shrew -4
## 2020 shy -1
## 2021 sick -2
## 2022 sigh -2
## 2023 significance 1
## 2024 significant 1
## 2025 silencing -1
## 2026 silly -1
## 2027 sincere 2
## 2028 sincerely 2
## 2029 sincerest 2
## 2030 sincerity 2
## 2031 sinful -3
## 2032 singleminded -2
## 2033 skeptic -2
## 2034 skeptical -2
## 2035 skepticism -2
## 2036 skeptics -2
## 2037 slam -2
## 2038 slash -2
## 2039 slashed -2
## 2040 slashes -2
## 2041 slashing -2
## 2042 slavery -3
## 2043 sleeplessness -2
## 2044 slick 2
## 2045 slicker 2
## 2046 slickest 2
## 2047 sluggish -2
## 2048 slut -5
## 2049 smart 1
## 2050 smarter 2
## 2051 smartest 2
## 2052 smear -2
## 2053 smile 2
## 2054 smiled 2
## 2055 smiles 2
## 2056 smiling 2
## 2057 smog -2
## 2058 sneaky -1
## 2059 snub -2
## 2060 snubbed -2
## 2061 snubbing -2
## 2062 snubs -2
## 2063 sobering 1
## 2064 solemn -1
## 2065 solid 2
## 2066 solidarity 2
## 2067 solution 1
## 2068 solutions 1
## 2069 solve 1
## 2070 solved 1
## 2071 solves 1
## 2072 solving 1
## 2073 somber -2
## 2074 some kind 0
## 2075 son-of-a-bitch -5
## 2076 soothe 3
## 2077 soothed 3
## 2078 soothing 3
## 2079 sophisticated 2
## 2080 sore -1
## 2081 sorrow -2
## 2082 sorrowful -2
## 2083 sorry -1
## 2084 spam -2
## 2085 spammer -3
## 2086 spammers -3
## 2087 spamming -2
## 2088 spark 1
## 2089 sparkle 3
## 2090 sparkles 3
## 2091 sparkling 3
## 2092 speculative -2
## 2093 spirit 1
## 2094 spirited 2
## 2095 spiritless -2
## 2096 spiteful -2
## 2097 splendid 3
## 2098 sprightly 2
## 2099 squelched -1
## 2100 stab -2
## 2101 stabbed -2
## 2102 stable 2
## 2103 stabs -2
## 2104 stall -2
## 2105 stalled -2
## 2106 stalling -2
## 2107 stamina 2
## 2108 stampede -2
## 2109 startled -2
## 2110 starve -2
## 2111 starved -2
## 2112 starves -2
## 2113 starving -2
## 2114 steadfast 2
## 2115 steal -2
## 2116 steals -2
## 2117 stereotype -2
## 2118 stereotyped -2
## 2119 stifled -1
## 2120 stimulate 1
## 2121 stimulated 1
## 2122 stimulates 1
## 2123 stimulating 2
## 2124 stingy -2
## 2125 stolen -2
## 2126 stop -1
## 2127 stopped -1
## 2128 stopping -1
## 2129 stops -1
## 2130 stout 2
## 2131 straight 1
## 2132 strange -1
## 2133 strangely -1
## 2134 strangled -2
## 2135 strength 2
## 2136 strengthen 2
## 2137 strengthened 2
## 2138 strengthening 2
## 2139 strengthens 2
## 2140 stressed -2
## 2141 stressor -2
## 2142 stressors -2
## 2143 stricken -2
## 2144 strike -1
## 2145 strikers -2
## 2146 strikes -1
## 2147 strong 2
## 2148 stronger 2
## 2149 strongest 2
## 2150 struck -1
## 2151 struggle -2
## 2152 struggled -2
## 2153 struggles -2
## 2154 struggling -2
## 2155 stubborn -2
## 2156 stuck -2
## 2157 stunned -2
## 2158 stunning 4
## 2159 stupid -2
## 2160 stupidly -2
## 2161 suave 2
## 2162 substantial 1
## 2163 substantially 1
## 2164 subversive -2
## 2165 success 2
## 2166 successful 3
## 2167 suck -3
## 2168 sucks -3
## 2169 suffer -2
## 2170 suffering -2
## 2171 suffers -2
## 2172 suicidal -2
## 2173 suicide -2
## 2174 suing -2
## 2175 sulking -2
## 2176 sulky -2
## 2177 sullen -2
## 2178 sunshine 2
## 2179 super 3
## 2180 superb 5
## 2181 superior 2
## 2182 support 2
## 2183 supported 2
## 2184 supporter 1
## 2185 supporters 1
## 2186 supporting 1
## 2187 supportive 2
## 2188 supports 2
## 2189 survived 2
## 2190 surviving 2
## 2191 survivor 2
## 2192 suspect -1
## 2193 suspected -1
## 2194 suspecting -1
## 2195 suspects -1
## 2196 suspend -1
## 2197 suspended -1
## 2198 suspicious -2
## 2199 swear -2
## 2200 swearing -2
## 2201 swears -2
## 2202 sweet 2
## 2203 swift 2
## 2204 swiftly 2
## 2205 swindle -3
## 2206 swindles -3
## 2207 swindling -3
## 2208 sympathetic 2
## 2209 sympathy 2
## 2210 tard -2
## 2211 tears -2
## 2212 tender 2
## 2213 tense -2
## 2214 tension -1
## 2215 terrible -3
## 2216 terribly -3
## 2217 terrific 4
## 2218 terrified -3
## 2219 terror -3
## 2220 terrorize -3
## 2221 terrorized -3
## 2222 terrorizes -3
## 2223 thank 2
## 2224 thankful 2
## 2225 thanks 2
## 2226 thorny -2
## 2227 thoughtful 2
## 2228 thoughtless -2
## 2229 threat -2
## 2230 threaten -2
## 2231 threatened -2
## 2232 threatening -2
## 2233 threatens -2
## 2234 threats -2
## 2235 thrilled 5
## 2236 thwart -2
## 2237 thwarted -2
## 2238 thwarting -2
## 2239 thwarts -2
## 2240 timid -2
## 2241 timorous -2
## 2242 tired -2
## 2243 tits -2
## 2244 tolerant 2
## 2245 toothless -2
## 2246 top 2
## 2247 tops 2
## 2248 torn -2
## 2249 torture -4
## 2250 tortured -4
## 2251 tortures -4
## 2252 torturing -4
## 2253 totalitarian -2
## 2254 totalitarianism -2
## 2255 tout -2
## 2256 touted -2
## 2257 touting -2
## 2258 touts -2
## 2259 tragedy -2
## 2260 tragic -2
## 2261 tranquil 2
## 2262 trap -1
## 2263 trapped -2
## 2264 trauma -3
## 2265 traumatic -3
## 2266 travesty -2
## 2267 treason -3
## 2268 treasonous -3
## 2269 treasure 2
## 2270 treasures 2
## 2271 trembling -2
## 2272 tremulous -2
## 2273 tricked -2
## 2274 trickery -2
## 2275 triumph 4
## 2276 triumphant 4
## 2277 trouble -2
## 2278 troubled -2
## 2279 troubles -2
## 2280 TRUE 2
## 2281 trust 1
## 2282 trusted 2
## 2283 tumor -2
## 2284 twat -5
## 2285 ugly -3
## 2286 unacceptable -2
## 2287 unappreciated -2
## 2288 unapproved -2
## 2289 unaware -2
## 2290 unbelievable -1
## 2291 unbelieving -1
## 2292 unbiased 2
## 2293 uncertain -1
## 2294 unclear -1
## 2295 uncomfortable -2
## 2296 unconcerned -2
## 2297 unconfirmed -1
## 2298 unconvinced -1
## 2299 uncredited -1
## 2300 undecided -1
## 2301 underestimate -1
## 2302 underestimated -1
## 2303 underestimates -1
## 2304 underestimating -1
## 2305 undermine -2
## 2306 undermined -2
## 2307 undermines -2
## 2308 undermining -2
## 2309 undeserving -2
## 2310 undesirable -2
## 2311 uneasy -2
## 2312 unemployment -2
## 2313 unequal -1
## 2314 unequaled 2
## 2315 unethical -2
## 2316 unfair -2
## 2317 unfocused -2
## 2318 unfulfilled -2
## 2319 unhappy -2
## 2320 unhealthy -2
## 2321 unified 1
## 2322 unimpressed -2
## 2323 unintelligent -2
## 2324 united 1
## 2325 unjust -2
## 2326 unlovable -2
## 2327 unloved -2
## 2328 unmatched 1
## 2329 unmotivated -2
## 2330 unprofessional -2
## 2331 unresearched -2
## 2332 unsatisfied -2
## 2333 unsecured -2
## 2334 unsettled -1
## 2335 unsophisticated -2
## 2336 unstable -2
## 2337 unstoppable 2
## 2338 unsupported -2
## 2339 unsure -1
## 2340 untarnished 2
## 2341 unwanted -2
## 2342 unworthy -2
## 2343 upset -2
## 2344 upsets -2
## 2345 upsetting -2
## 2346 uptight -2
## 2347 urgent -1
## 2348 useful 2
## 2349 usefulness 2
## 2350 useless -2
## 2351 uselessness -2
## 2352 vague -2
## 2353 validate 1
## 2354 validated 1
## 2355 validates 1
## 2356 validating 1
## 2357 verdict -1
## 2358 verdicts -1
## 2359 vested 1
## 2360 vexation -2
## 2361 vexing -2
## 2362 vibrant 3
## 2363 vicious -2
## 2364 victim -3
## 2365 victimize -3
## 2366 victimized -3
## 2367 victimizes -3
## 2368 victimizing -3
## 2369 victims -3
## 2370 vigilant 3
## 2371 vile -3
## 2372 vindicate 2
## 2373 vindicated 2
## 2374 vindicates 2
## 2375 vindicating 2
## 2376 violate -2
## 2377 violated -2
## 2378 violates -2
## 2379 violating -2
## 2380 violence -3
## 2381 violent -3
## 2382 virtuous 2
## 2383 virulent -2
## 2384 vision 1
## 2385 visionary 3
## 2386 visioning 1
## 2387 visions 1
## 2388 vitality 3
## 2389 vitamin 1
## 2390 vitriolic -3
## 2391 vivacious 3
## 2392 vociferous -1
## 2393 vulnerability -2
## 2394 vulnerable -2
## 2395 walkout -2
## 2396 walkouts -2
## 2397 wanker -3
## 2398 want 1
## 2399 war -2
## 2400 warfare -2
## 2401 warm 1
## 2402 warmth 2
## 2403 warn -2
## 2404 warned -2
## 2405 warning -3
## 2406 warnings -3
## 2407 warns -2
## 2408 waste -1
## 2409 wasted -2
## 2410 wasting -2
## 2411 wavering -1
## 2412 weak -2
## 2413 weakness -2
## 2414 wealth 3
## 2415 wealthy 2
## 2416 weary -2
## 2417 weep -2
## 2418 weeping -2
## 2419 weird -2
## 2420 welcome 2
## 2421 welcomed 2
## 2422 welcomes 2
## 2423 whimsical 1
## 2424 whitewash -3
## 2425 whore -4
## 2426 wicked -2
## 2427 widowed -1
## 2428 willingness 2
## 2429 win 4
## 2430 winner 4
## 2431 winning 4
## 2432 wins 4
## 2433 winwin 3
## 2434 wish 1
## 2435 wishes 1
## 2436 wishing 1
## 2437 withdrawal -3
## 2438 woebegone -2
## 2439 woeful -3
## 2440 won 3
## 2441 wonderful 4
## 2442 woo 3
## 2443 woohoo 3
## 2444 wooo 4
## 2445 woow 4
## 2446 worn -1
## 2447 worried -3
## 2448 worry -3
## 2449 worrying -3
## 2450 worse -3
## 2451 worsen -3
## 2452 worsened -3
## 2453 worsening -3
## 2454 worsens -3
## 2455 worshiped 3
## 2456 worst -3
## 2457 worth 2
## 2458 worthless -2
## 2459 worthy 2
## 2460 wow 4
## 2461 wowow 4
## 2462 wowww 4
## 2463 wrathful -3
## 2464 wreck -2
## 2465 wrong -2
## 2466 wronged -2
## 2467 wtf -4
## 2468 yeah 1
## 2469 yearning 1
## 2470 yeees 2
## 2471 yes 1
## 2472 youthful 2
## 2473 yucky -2
## 2474 yummy 3
## 2475 zealot -2
## 2476 zealots -2
## 2477 zealous 2
Lets look at bing
bing <- get_sentiments("bing")
bing
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # ℹ 6,776 more rows
And lastly nrc
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # ℹ 13,862 more rows
These libraries were created either using crowdsourcing or cloud computing/AI like Amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing.
Lets look at the words with a joy score from NRC
library(gutenbergr)
library(dplyr)
library(stringr)
darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
## https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
tidy_books <- darwin %>%
group_by(gutenberg_id) %>%
dplyr::mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 629,573 × 4
## gutenberg_id linenumber chapter word
## <int> <int> <int> <chr>
## 1 944 1 0 the
## 2 944 1 0 voyage
## 3 944 1 0 of
## 4 944 1 0 the
## 5 944 1 0 beagle
## 6 944 1 0 by
## 7 944 2 0 charles
## 8 944 2 0 darwin
## 9 944 8 0 about
## 10 944 8 0 the
## # ℹ 629,563 more rows
Lets add the book name instead of GID
colnames(tidy_books)[1] <- "book"
tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
tidy_books
## # A tibble: 629,573 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # ℹ 629,563 more rows
Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "The Voyage of the Beagle") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 277 × 2
## word n
## <chr> <int>
## 1 found 301
## 2 good 161
## 3 remarkable 114
## 4 green 95
## 5 kind 92
## 6 tree 86
## 7 present 85
## 8 food 78
## 9 beautiful 61
## 10 elevation 60
## # ℹ 267 more rows
We can alos examine how sentiment changes throughout a work.
library(tidyr)
Charles_Darwin_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
Now lets plot it
library(ggplot2)
ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Lets compare the three sentiment directions
There are several options for sentiment lexicons, you might want some more info on which is appropriate for your purpose. Here we will use all 3 of our dictationaries and examine how the sentiment changes across the arc of TVOTB.
library(tidyr)
voyage <- tidy_books %>%
filter(book == "The Voyage of the Beagle")
voyage
## # A tibble: 208,118 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # ℹ 208,108 more rows
Lets again use integar division (‘%/%’) to define larger sections of text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’, and ‘mutate()’, to find the net sentiment in each of these sections of text.
affin <- voyage %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
voyage %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
voyage %>%
inner_join(get_sentiments("nrc")%>%
filter(sentiment %in% c("positive", "negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1154 of `x` matches multiple rows in `y`.
## ℹ Row 4245 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
I DO NOT KNOW WHY “POSITIVE” IS NOT FOUND?!?!?!
We can now estimate the net sentiment (positive - negative) in each chunk of the novel for each lexion (dictionary). Lets bind them all together and visualize with ggplot
bind_rows(afinn, bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1, scales = "free_y")
## Warning: Removed 2477 rows containing missing values (`position_stack()`).
Lets look at the counts based on each dictionary
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 2,421 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 great positive 964
## 2 like positive 705
## 3 well positive 701
## 4 good positive 384
## 5 doubt negative 306
## 6 respect positive 266
## 7 wild negative 266
## 8 bright positive 252
## 9 remarkable positive 247
## 10 savages negative 217
## # ℹ 2,411 more rows
This can be shown visually, and we can pipe straight into ggplot2
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scale = "free_y") +
labs(x = "Contribution to Sentiment", y = NULL)
Lets spot an anomoly in the dataset.
custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)
custom_stop_words
## # A tibble: 1,153 × 2
## word lexicon
## <chr> <chr>
## 1 wild custom
## 2 dark custom
## 3 great custom
## 4 like custom
## 5 a SMART
## 6 a's SMART
## 7 able SMART
## 8 about SMART
## 9 above SMART
## 10 according SMART
## # ℹ 1,143 more rows
Word Clouds!
We can see that tidy text mining and sentiment analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques.
Lets use the wordcloud package!
library(wordcloud)
##
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
##
## textplot
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`
## Warning in wordcloud(word, n, max.words = 100): male could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): female could not be fit on
## page. It will not be plotted.
Lets also look at a comparison.cloud(), which may require turning the dataframe into a matrix.
We can change to matrix ising the acast() function.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tigerstats':
##
## tips
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining with `by = join_by(word)`
Looking at units beyond words
Lots of useful work can be done by tokening at the word level, but sometimes its nice to look at different units of text. For example, we can look beyoind just unigrams.
Ex.. I am not having a good day.
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
dplyr::summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
dplyr::summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
dplyr::mutate(ratio = negativewords/words) %>%
filter(chapter !=0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining with `by = join_by(word)`
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 3 × 5
## book chapter negativewords words ratio
## <chr> <int> <int> <int> <dbl>
## 1 The Descent of Man, and Selection in Relat… 20 4 87 0.0460
## 2 The Expression of the Emotions in Man and … 10 249 4220 0.0590
## 3 The Voyage of the Beagle 10 375 11202 0.0335
So far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.
Lets look at some methods of tidytext for calculating and visualizing word relationships.
library(dplyr)
library(tidytext)
darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
## https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(darwin_books)[1] <- "book"
darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
darwin_bigrams <- darwin_books %>%
unnest_tokens(bigram, text, token = "ngrams", n=2)
darwin_bigrams
## # A tibble: 580,636 × 2
## book bigram
## <chr> <chr>
## 1 The Voyage of the Beagle the voyage
## 2 The Voyage of the Beagle voyage of
## 3 The Voyage of the Beagle of the
## 4 The Voyage of the Beagle the beagle
## 5 The Voyage of the Beagle beagle by
## 6 The Voyage of the Beagle charles darwin
## 7 The Voyage of the Beagle <NA>
## 8 The Voyage of the Beagle <NA>
## 9 The Voyage of the Beagle <NA>
## 10 The Voyage of the Beagle <NA>
## # ℹ 580,626 more rows
This data is still tidytext format, and is structured as one-token-per-row. Each token is a bigram.
Counting and filtering n-gram
#darwin_bigrams %>%
# count(bigram, sort = TRUE)
Most of the common bigrams are stop-words. This can be a good time to use tidyr’s separate command which splits a column into multiple based on a delimiter. This will let us make a column for word one and word two.
library(tidyr)
bigrams_separated <- darwin_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
New bigram counts
bigram_counts <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigram_counts
## # A tibble: 79,272 × 2
## book bigram
## <chr> <chr>
## 1 The Voyage of the Beagle charles darwin
## 2 The Voyage of the Beagle NA NA
## 3 The Voyage of the Beagle NA NA
## 4 The Voyage of the Beagle NA NA
## 5 The Voyage of the Beagle NA NA
## 6 The Voyage of the Beagle NA NA
## 7 The Voyage of the Beagle online edition
## 8 The Voyage of the Beagle NA NA
## 9 The Voyage of the Beagle degree symbol
## 10 The Voyage of the Beagle degs italics
## # ℹ 79,262 more rows
We may also be interested in trigrams, which are 3 word combos
trigrams <- darwin_books %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
trigrams
## # A tibble: 17,771 × 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 7965
## 2 tierra del fuego 84
## 3 secondary sexual characters 79
## 4 captain fitz roy 45
## 5 de la physionomie 30
## 6 domestication vol ii 26
## 7 vol ii pp 22
## 8 vertebrates vol iii 21
## 9 proc zoolog soc 18
## 10 proc zool soc 17
## # ℹ 17,761 more rows
Lets analyze some bigrams
bigrams_filtered %>%
filter(word2 == "selection") %>%
count(book, word1, sort = TRUE)
## # A tibble: 24 × 3
## book word1 n
## <chr> <chr> <int>
## 1 The Descent of Man, and Selection in Relation to Sex sexual 254
## 2 The Descent of Man, and Selection in Relation to Sex natural 156
## 3 The Descent of Man, and Selection in Relation to Sex unconscious 6
## 4 The Descent of Man, and Selection in Relation to Sex continued 5
## 5 The Expression of the Emotions in Man and Animals natural 4
## 6 The Descent of Man, and Selection in Relation to Sex artificial 2
## 7 The Descent of Man, and Selection in Relation to Sex ordinary 2
## 8 The Descent of Man, and Selection in Relation to Sex careful 1
## 9 The Descent of Man, and Selection in Relation to Sex consequent 1
## 10 The Descent of Man, and Selection in Relation to Sex la 1
## # ℹ 14 more rows
Lets again look at tf-idf across bigrams across Darwins works.
bigram_tf_idf <- bigram_counts %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 51,213 × 6
## book bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 The Expression of the Emotions in Man and… nerve… 47 0.00350 1.10 0.00384
## 2 The Descent of Man, and Selection in Rela… sexua… 138 0.00315 1.10 0.00346
## 3 The Expression of the Emotions in Man and… la ph… 35 0.00260 1.10 0.00286
## 4 The Voyage of the Beagle bueno… 54 0.00245 1.10 0.00269
## 5 The Voyage of the Beagle capta… 53 0.00240 1.10 0.00264
## 6 The Descent of Man, and Selection in Rela… secon… 101 0.00231 1.10 0.00254
## 7 The Voyage of the Beagle fitz … 50 0.00227 1.10 0.00249
## 8 The Expression of the Emotions in Man and… muscl… 30 0.00223 1.10 0.00245
## 9 The Expression of the Emotions in Man and… orbic… 29 0.00216 1.10 0.00237
## 10 The Descent of Man, and Selection in Rela… sexua… 254 0.00580 0.405 0.00235
## # ℹ 51,203 more rows
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
group_by(book) %>%
slice_max(tf_idf, n = 10) %>%
ungroup() %>%
mutate(bigram = reorder(bigram, tf_idf)) %>%
ggplot(aes(tf_idf, bigram, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf_idf of bigrams", y = NULL)
Using bigrams to provide context in sentiment analysis
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 740 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not be 90
## 2 not only 89
## 3 not have 80
## 4 not a 78
## 5 not to 74
## 6 not the 63
## 7 not at 56
## 8 not been 54
## 9 not appear 51
## 10 not know 47
## # ℹ 730 more rows
By doing sentiment analysis on bigrams, we can examine how often sentiment associated words are preceded bby a modifier like “not” or other negating words.
AFINN <- get_sentiments("afinn")
AFINN
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # ℹ 2,467 more rows
We can examine the most frequent words that were preceded by “not” and associated with sentiment.
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, value, sort = TRUE)
not_words
## # A tibble: 97 × 3
## word2 value n
## <chr> <dbl> <int>
## 1 like 2 10
## 2 doubt -1 9
## 3 wish 1 7
## 4 admit -1 5
## 5 easy 1 5
## 6 pretend -1 5
## 7 reach 1 5
## 8 difficult -1 4
## 9 extend 1 4
## 10 help 2 4
## # ℹ 87 more rows
Lets visualize
library(ggplot2)
not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment value * number of occurences", y = "words preceded by \"not\"")
negation_words <- c("not", "no", "never", "non", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
negated_words
## # A tibble: 182 × 4
## word1 word2 value n
## <chr> <chr> <dbl> <int>
## 1 no doubt -1 173
## 2 no great 3 14
## 3 not like 2 10
## 4 not doubt -1 9
## 5 not wish 1 7
## 6 without doubt -1 7
## 7 not admit -1 5
## 8 not easy 1 5
## 9 not pretend -1 5
## 10 not reach 1 5
## # ℹ 172 more rows
Lets visualize the negated words
negated_words %>%
mutate(contribution = n * value,
word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>%
group_by(word1) %>%
slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
ggplot(aes(word2, contribution, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ word1, scales = "free") +
scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
xlab("Words preceded by negation term") +
ylab("Sentiment value * # of occurences") +
coord_flip()
Lets visualize a network of bigrams with ggraph
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:mosaic':
##
## compare
## The following objects are masked from 'package:lubridate':
##
## %--%, union
## The following object is masked from 'package:plotly':
##
## groups
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
bigram_graph
## IGRAPH aba6dda DN-- 158 106 --
## + attr: name (v/c), n (e/n)
## + edges from aba6dda (vertex names):
## [1] NA ->NA sexual ->selection vol ->ii
## [4] natural ->selection lower ->animals sexual ->differences
## [7] breeding ->season secondary ->sexual south ->america
## [10] sexual ->characters del ->fuego tierra ->del
## [13] vol ->iii de ->la bright ->colours
## [16] sexual ->difference distinct ->species tail ->feathers
## [19] closely ->allied vocal ->organs zoological->gardens
## [22] natural ->history buenos ->ayres captain ->fitz
## + ... omitted several edges
library(ggraph)
set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
We can also add directionality to this netwoek
set.seed(1234)
a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
A central question in text mining is how to quantify what a document is about. We can do this by looking at words that make up the documents, and measuring term frequency.
There are a lot of words that may not be important, these are the stop words.
One way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly use words and increases the weight for words that are not used very much.
Term frequency in Darwin’s works
library(dplyr)
library(tidytext)
book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
## https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(book_words)[1] <- "book"
book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
Now lets dissect
book_words <- book_words %>%
unnest_tokens(word, text) %>%
count(book, word, sort = TRUE)
book_words
## # A tibble: 35,579 × 3
## book word n
## <chr> <chr> <int>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490
## 2 The Voyage of the Beagle the 16930
## 3 The Descent of Man, and Selection in Relation to Sex of 16762
## 4 The Voyage of the Beagle of 9438
## 5 The Descent of Man, and Selection in Relation to Sex in 8882
## 6 The Expression of the Emotions in Man and Animals the 8045
## 7 The Descent of Man, and Selection in Relation to Sex and 7854
## 8 The Descent of Man, and Selection in Relation to Sex to 5901
## 9 The Voyage of the Beagle and 5768
## 10 The Voyage of the Beagle a 5328
## # ℹ 35,569 more rows
book_words$n <- as.numeric(book_words$n)
total_words <- book_words %>%
group_by(book) %>%
dplyr::summarize(total = sum(n))
book_words
## # A tibble: 35,579 × 3
## book word n
## <chr> <chr> <dbl>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490
## 2 The Voyage of the Beagle the 16930
## 3 The Descent of Man, and Selection in Relation to Sex of 16762
## 4 The Voyage of the Beagle of 9438
## 5 The Descent of Man, and Selection in Relation to Sex in 8882
## 6 The Expression of the Emotions in Man and Animals the 8045
## 7 The Descent of Man, and Selection in Relation to Sex and 7854
## 8 The Descent of Man, and Selection in Relation to Sex to 5901
## 9 The Voyage of the Beagle and 5768
## 10 The Voyage of the Beagle a 5328
## # ℹ 35,569 more rows
book_words <- dplyr::left_join(book_words, total_words)
## Joining with `by = join_by(book)`
book_words
## # A tibble: 35,579 × 4
## book word n total
## <chr> <chr> <dbl> <dbl>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490 311041
## 2 The Voyage of the Beagle the 16930 208118
## 3 The Descent of Man, and Selection in Relation to Sex of 16762 311041
## 4 The Voyage of the Beagle of 9438 208118
## 5 The Descent of Man, and Selection in Relation to Sex in 8882 311041
## 6 The Expression of the Emotions in Man and Animals the 8045 110414
## 7 The Descent of Man, and Selection in Relation to Sex and 7854 311041
## 8 The Descent of Man, and Selection in Relation to Sex to 5901 311041
## 9 The Voyage of the Beagle and 5768 208118
## 10 The Voyage of the Beagle a 5328 208118
## # ℹ 35,569 more rows
You can see that the usual suspects are the most common words, but don’t tell us anything about what the book topic is.
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 370 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 3 rows containing missing values (`geom_bar()`).
Zipf’s Law
The frequency that a word appears is inversely proportional to its rank when predicting a topic.
Lets apply Zipf’s law to Darwin’s work
freq_by_rank <- book_words %>%
group_by(book) %>%
dplyr::mutate(rank = row_number(),
'term frequency' = n/total) %>%
ungroup()
freq_by_rank
## # A tibble: 35,579 × 6
## book word n total rank `term frequency`
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 The Descent of Man, and Selection … the 25490 311041 1 0.0820
## 2 The Voyage of the Beagle the 16930 208118 1 0.0813
## 3 The Descent of Man, and Selection … of 16762 311041 2 0.0539
## 4 The Voyage of the Beagle of 9438 208118 2 0.0453
## 5 The Descent of Man, and Selection … in 8882 311041 3 0.0286
## 6 The Expression of the Emotions in … the 8045 110414 1 0.0729
## 7 The Descent of Man, and Selection … and 7854 311041 4 0.0253
## 8 The Descent of Man, and Selection … to 5901 311041 5 0.0190
## 9 The Voyage of the Beagle and 5768 208118 3 0.0277
## 10 The Voyage of the Beagle a 5328 208118 4 0.0256
## # ℹ 35,569 more rows
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
Lets use TF-IDF to find words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents.
book_tf_idf <- book_words %>%
bind_tf_idf(word, book, n)
book_tf_idf
## # A tibble: 35,579 × 7
## book word n total tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 The Descent of Man, and Selection in … the 25490 311041 0.0820 0 0
## 2 The Voyage of the Beagle the 16930 208118 0.0813 0 0
## 3 The Descent of Man, and Selection in … of 16762 311041 0.0539 0 0
## 4 The Voyage of the Beagle of 9438 208118 0.0453 0 0
## 5 The Descent of Man, and Selection in … in 8882 311041 0.0286 0 0
## 6 The Expression of the Emotions in Man… the 8045 110414 0.0729 0 0
## 7 The Descent of Man, and Selection in … and 7854 311041 0.0253 0 0
## 8 The Descent of Man, and Selection in … to 5901 311041 0.0190 0 0
## 9 The Voyage of the Beagle and 5768 208118 0.0277 0 0
## 10 The Voyage of the Beagle a 5328 208118 0.0256 0 0
## # ℹ 35,569 more rows
Lets look at some terms with high tf-idf in Darwin’s works
book_tf_idf %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 35,579 × 6
## book word n tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 The Expression of the Emotions in Man and … tears 126 1.14e-3 1.10 1.25e-3
## 2 The Expression of the Emotions in Man and … blush 114 1.03e-3 1.10 1.13e-3
## 3 The Descent of Man, and Selection in Relat… sexu… 745 2.40e-3 0.405 9.71e-4
## 4 The Descent of Man, and Selection in Relat… sele… 621 2.00e-3 0.405 8.10e-4
## 5 The Voyage of the Beagle sea 351 1.69e-3 0.405 6.84e-4
## 6 The Voyage of the Beagle degs 117 5.62e-4 1.10 6.18e-4
## 7 The Expression of the Emotions in Man and … eyeb… 149 1.35e-3 0.405 5.47e-4
## 8 The Voyage of the Beagle isla… 271 1.30e-3 0.405 5.28e-4
## 9 The Descent of Man, and Selection in Relat… shewn 143 4.60e-4 1.10 5.05e-4
## 10 The Expression of the Emotions in Man and … frown 46 4.17e-4 1.10 4.58e-4
## # ℹ 35,569 more rows
Lets look at a visualization for these high tf-idf words
book_tf_idf %>%
group_by(book) %>%
slice_max(tf_idf, n = 15) %>%
ungroup() %>%
ggplot(aes(tf_idf, reorder(word, tf_idf), fill = book)) +
geom_col(show.legend = FALSE) + facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)