Tatum Hasen
11-17-2022

## Data Visualization

### GG Plot

#### Barplots

Now let's take a look at some ggplot2 barplots.
We’ll start by making a dataframe based on the tooth data.
df<- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))
df
## dose len
## 1 D0.5 4.2
## 2 D1 10.0
## 3 D2 29.5
And now let's make a second dataframe.
df2<- data.frame(supp=rep(c("VC", "OJ"), each = 3),
dose = rep(c("D0.5", "D1", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
## supp dose len
## 1 VC D0.5 6.8
## 2 VC D1 15.0
## 3 VC D2 33.0
## 4 OJ D0.5 4.2
## 5 OJ D1 10.0
## 6 OJ D2 29.5
Let's load up ggplot2.
library(ggplot2)
Let's set our theme parameters for ggplot2.
theme_set(
theme_classic() +
theme(legend.position = "top")
)
Let's start with some basic barplots using the tooth data.
f <- ggplot(df, aes(x = dose, y = len))
f + geom_col()
Now let's change the fill and add labels above the bars.
f + geom_col(fill = "darkblue") +
geom_text(aes(label = len), vjust = -0.3)
Now let's add the labels inside the bars.
f + geom_col(fill = "darkblue") +
geom_text(aes(label = len), vjust = 1.6, color = "white")
Now let's change the barplot colors by group.
f + geom_col(aes(color = dose), fill = "white") +
  scale_color_manual(values = c("blue", "gold", "red"))
The colored outlines alone are hard to see, so let's change the fill instead.
f + geom_col(aes(fill = dose)) +
scale_fill_manual(values = c("blue", "gold", "red"))
OK, how do we do this with multiple groups?
ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(color = supp, fill = supp), position = position_stack()) +
scale_color_manual(values = c("blue", "red")) +
scale_fill_manual(values = c("blue", "red"))
p <- ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) +
scale_color_manual(values = c("blue", "red")) +
scale_fill_manual(values = c("blue", "red"))
p
Now let's add those labels to the dodged barplot.
p + geom_text(
aes(label = len, group = supp),
position = position_dodge(0.8),
vjust = -0.3, size = 3.5
)
Now what if we want to add labels to our stacked barplots? For this we need dplyr.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df2 <- df2 %>%
group_by(dose) %>%
arrange(dose, desc(supp)) %>%
mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups: dose [3]
## supp dose len lab_ypos
## <chr> <chr> <dbl> <dbl>
## 1 VC D0.5 6.8 3.4
## 2 OJ D0.5 4.2 8.9
## 3 VC D1 15 7.5
## 4 OJ D1 10 20
## 5 VC D2 33 16.5
## 6 OJ D2 29.5 47.8
Now let's recreate our stacked graphs with the labels placed at those positions.
ggplot(df2, aes(x = dose, y = len)) +
geom_col(aes(fill = supp), width = 0.7) +
geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
scale_color_manual(values = c("blue", "red")) +
scale_fill_manual(values = c("blue", "red"))
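As an aside, recent ggplot2 versions (2.2 and later) can place stacked labels for you with position_stack(vjust = 0.5), so the manual cumsum step above is optional. A minimal sketch:
ggplot(df2, aes(x = dose, y = len, fill = supp)) +
geom_col(width = 0.7) +
geom_text(aes(label = len), position = position_stack(vjust = 0.5), color = "white") +
scale_fill_manual(values = c("blue", "red"))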
#### Boxplots

Let's look at some boxplots.
data("ToothGrowth")
Let's change the dose to a factor, and look at the top of the dataframe.
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
head(ToothGrowth, 4)
## len supp dose
## 1 4.2 VC 0.5
## 2 11.5 VC 0.5
## 3 7.3 VC 0.5
## 4 5.8 VC 0.5
Let's load ggplot2.
library(ggplot2)
Let's set the theme for our plots to black-and-white.
theme_set(
theme_bw() +
theme(legend.position = "top")
)
Let's start with a very basic boxplot of dose vs. length.
tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()
Now let's look at a notched boxplot with a point for the mean.
tg + geom_boxplot(notch = TRUE, fill = "lightgrey") +
stat_summary(fun = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")
We can also change which levels are included on the scale, and their order.
tg + geom_boxplot() +
scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (stat_boxplot).
Let's put our x-axis in descending order.
tg + geom_boxplot() +
scale_x_discrete(limits = c("2", "1", "0.5"))
We can also change boxplot colors by groups
tg + geom_boxplot(aes(color = dose)) +
scale_color_manual(values = c("indianred", "blue1", "green2"))
What if we want to display our dataset by OJ vs VC?
tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
scale_fill_manual(values = c("#999999", "#E69F00"))
tg2
We can also arrange this as two plots with facet_wrap
tg2 + facet_wrap(~supp)
#### Histograms

Now let's move on to histograms. First we'll simulate a weight dataset.
set.seed(1234)
wdata <- data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 50), rnorm(200, 58))
)
head(wdata, 4)
## sex weight
## 1 F 48.79293
## 2 F 50.27743
## 3 F 51.08444
## 4 F 47.65430
Now let's load dplyr and compute the mean weight of each group.
library(dplyr)
mu <- wdata %>%
group_by(sex) %>%
summarise(grp.mean = mean(weight))
Now let's set the plotting theme.
theme_set(
theme_classic() +
theme(legend.position = "bottom")
)
Now let's create a ggplot object.
a <- ggplot(wdata, aes(x = weight))
a + geom_histogram(bins = 30, color = "black", fill = "grey") +
geom_vline(aes(xintercept = mean(weight)),
linetype = "dashed", size = 0.6)
Now let's change the color by group.
a + geom_histogram(aes(color = sex), fill = "white", position = "identity") +
scale_color_manual(values = c("#00AFBB", "#E7B800"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
a + geom_histogram(aes(color = sex, fill = sex), position = "identity") +
scale_color_manual(values = c("#00AFBB", "#E7B800")) +
scale_fill_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
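Those messages are a hint to pick a binwidth deliberately rather than accept the default 30 bins. For example (0.5 is an arbitrary choice for this weight data):
a + geom_histogram(aes(color = sex), fill = "white", position = "identity", binwidth = 0.5) +
scale_color_manual(values = c("#00AFBB", "#E7B800"))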
What if we want to combine density plots and histograms?
a + geom_histogram(aes(y = stat(density)),
color = "black", fill = "white") +
geom_density(alpha = 0.2, fill = "#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
a + geom_histogram(aes(y = stat(density), color = sex),
fill = "white", position = "identity") +
geom_density(aes(color = sex), size = 1) +
scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#### Dotplots

First let's load the required packages.
library(ggplot2)
Let's set our theme.
theme_set(
theme_dark() +
theme(legend.position = "top")
)
First let's initiate a ggplot object called tg.
data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
Let's create a dotplot with a summary statistic.
tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "lightgrey") +
stat_summary(fun = mean, geom = "point", color = "red")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
Let's add a boxplot and a dotplot together.
tg + geom_boxplot(width = 0.5) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
tg + geom_violin(trim = FALSE) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") +
stat_summary(fun = mean, geom = "point", color = "red")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
Let's create a dotplot with multiple groups.
tg + geom_boxplot(width = 0.5) +
geom_dotplot(aes(fill = supp), binaxis = "y", stackdir = "center") +
scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
geom_dotplot(aes(fill = supp, color = supp), binaxis = "y", stackdir = "center",
dotsize = 0.8, position = position_dodge(0.8)) +
scale_fill_manual(values = c("#00AFBB", "#E7B800")) +
scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
#### Line Plots

Now let's change it up and look at some line plots.
We'll start by making a custom dataframe modeled on the tooth data, so we can clearly see the lines we're modifying.
df<- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))
Now let's create a second dataframe for plotting by groups.
df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
dose = rep(c("D0.5", "D1", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
## supp dose len
## 1 VC D0.5 6.8
## 2 VC D1 15.0
## 3 VC D2 33.0
## 4 OJ D0.5 4.2
## 5 OJ D1 10.0
## 6 OJ D2 29.5
Now let's again load ggplot2 and set a theme.
library(ggplot2)
theme_set(
theme_gray() +
theme(legend.position = "right")
)
Now let's do some basic line plots. First we will build a function to display all the different line types.
generateRLineTypes <- function(){
oldPar <- par()
par(font = 2, mar = c(0,0,0,0))
plot(1, pch = "", ylim = c(0,6), xlim = c(0,0.7), axes = FALSE, xlab = "", ylab = "")
for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty = i, lwd = 3)
text(rep(0.1, 7), 0:6, labels = c("0.'Blank'", "1.'Solid'", "2.'dashed'", "3.'dotted'",
"4.'dotdash'", "5.'longdash'", "6.'twodash'"))
par(mar=oldPar$mar, font=oldPar$font)
}
generateRLineTypes()
Now let's build a basic line plot.
p <- ggplot(data = df, aes(x = dose, y = len, group = 1))
p + geom_line() + geom_point()
Now let's modify the line type and color.
p + geom_line(linetype = "dashed", color = "steelblue") +
geom_point(color = "steelblue")
Now let's try a step graph, which shows a threshold-type progression.
p + geom_step() + geom_point()
Now let's move on to making multiple groups. First we'll create our ggplot object.
p <- ggplot(df2, aes(x=dose, y=len, group = supp))
Now let's change line types and point shapes by group.
p + geom_line(aes(linetype = supp, color = supp)) +
geom_point(aes(shape = supp, color = supp)) +
scale_color_manual(values = c("red", "blue"))
Now let's look at line plots with a numeric x-axis.
df3 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
dose = rep(c("0.5", "1", "2"), 2),
len=c(6.8, 15, 33, 4.2, 10, 29.5))
df3
## supp dose len
## 1 VC 0.5 6.8
## 2 VC 1 15.0
## 3 VC 2 33.0
## 4 OJ 0.5 4.2
## 5 OJ 1 10.0
## 6 OJ 2 29.5
Now let's plot with the x-axis treated as a continuous variable.
df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group=supp, color=supp)) +
geom_line() + geom_point()
Now let's look at a line graph with dates on the x-axis. We'll use the built-in economics time series for this example.
head(economics)
## # A tibble: 6 × 6
## date pce pop psavert uempmed unemploy
## <date> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1967-07-01 507. 198712 12.6 4.5 2944
## 2 1967-08-01 510. 198911 12.6 4.7 2945
## 3 1967-09-01 516. 199113 11.9 4.6 2958
## 4 1967-10-01 512. 199311 12.9 4.9 3143
## 5 1967-11-01 517. 199498 12.8 4.7 3066
## 6 1967-12-01 525. 199657 11.8 4.8 3018
ggplot(data = economics, aes(x=date, y = pop)) +
geom_line()
Now let's subset the data.
ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x=date, y=pop)) + geom_line()
We can also map the line size to another variable, for instance the unemployment rate.
ggplot(data = economics, aes(x=date, y=pop)) +
geom_line(aes(size = unemploy/pop))
We can also plot multi-time series data
ggplot(economics, aes(x=date)) +
geom_line(aes(y=psavert), color = "darkred") +
geom_line(aes(y=uempmed), color = "steelblue", linetype = "twodash")
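A sketch of an alternative: reshaping to long form with tidyr gives ggplot a single series column, so it can draw a legend automatically.
library(tidyr)
economics %>%
pivot_longer(c(psavert, uempmed), names_to = "series", values_to = "value") %>%
ggplot(aes(x = date, y = value, color = series)) +
geom_line()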
Lastly, let's make this into an area plot.
ggplot(economics, aes(x=date)) +
geom_area(aes(y=psavert), fill = "#999999",
color = "#999999", alpha = 0.5) +
geom_area(aes(y=uempmed), fill = "#E69F00",
color = "#E69F00", alpha = 0.5)
#### Ridgeline Plots

First let's load the required packages.
library(ggplot2)
library(ggridges)
# BiocManager::install("ggridges")
Now let's load the sample data.
?airquality
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()
air
## Picking joint bandwidth of 2.65
Now let's add some pizzazz to our graph.
library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
geom_density_ridges_gradient() +
scale_fill_viridis(option = "C", name = "Temp")
## Picking joint bandwidth of 2.65
The last thing we will do is create a facet plot for all our measurements.
library(tidyr)
airquality %>%
gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
ggplot() + aes(value, Month, group = Month) +
geom_density_ridges() +
facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values (stat_density_ridges).
#### Density Plots

A density plot is a nice alternative to a histogram.
set.seed(1234)
wdata <- data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200,55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
group_by(sex) %>%
summarise(grp.mean = mean(weight))
Now let's load the graphing package and set a theme.
library(ggplot2)
theme_set(
theme_classic() +
theme(legend.position = "right")
)
Now let's do the basic plot function. First we will create a ggplot object.
d <- ggplot(wdata, aes(x = weight))
Now let's do a basic density plot.
d + geom_density() +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
Now let's change the y-axis to count instead of density.
d + geom_density(aes(y = stat(count)), fill = "lightgray") +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")
Now let's color the density curves by group.
d + geom_density(aes(color = sex)) +
scale_color_manual(values = c("darkgray", "gold"))
Lastly, let's fill the density plots and add group mean lines.
d + geom_density(aes(fill = sex), alpha = 0.4) +
geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
scale_color_manual(values = c("grey", "gold")) +
scale_fill_manual(values = c("grey", "gold"))
## Exploratory Data Analysis

### Outlier Detection

#### Missing Values
If you encounter an unusual value in your dataset and simply want to move on to the rest of your analysis, you have two options:
Drop the entire row with the strange values:
library(dplyr)
library(ggplot2)
diamonds <- diamonds
diamonds2 <- diamonds %>%
filter(between(y, 3, 20))
In this instance, y is the width of the diamond, so anything under 3 mm or above 20 mm is excluded.
I don't recommend this option: just because there is one bad measurement doesn't mean all the measurements in that row are bad.
Instead, I recommend replacing the unusual values with missing values:
diamonds3 <- diamonds %>%
mutate(y = ifelse(y < 3 | y > 20, NA, y))
Like R, ggplot2 subscribes to the idea that missing values shouldn’t pass silently into the night.
ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
geom_point()
## Warning: Removed 9 rows containing missing values (geom_point).
If you want to suppress that warning you can use na.rm = TRUE
ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
geom_point(na.rm = TRUE)
Other times you want to understand what makes observations with missing values different from observations with recorded values. For example, in the nycflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled and non-cancelled flights.
library(nycflights13)
nycflights13::flights %>%
mutate(
cancelled = is.na(dep_time),
sched_hour = sched_dep_time %/% 100,
sched_min = sched_dep_time %% 100,
sched_dep_time = sched_hour + sched_min / 60
) %>%
ggplot(mapping = aes(sched_dep_time)) +
geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1/4)
#### Identifying Outliers

What if we want to know what our outliers are?
First we need to load the required libraries.
library(outliers)
library(ggplot2)
library("readxl")
And load a dataset with outliers.
Air_data <- read_xlsx("~/Desktop/classroom/starter/AirQualityUCI.xlsx")
Let's create a function using the Grubbs test to identify all outliers. The Grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution.
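Before building the wrapper, a minimal single-test sketch on simulated data (one planted outlier) shows what grubbs.test() returns:
set.seed(42)
x <- c(rnorm(30), 8) # 30 standard-normal values plus one planted outlier
grubbs.test(x)
With that in mind, here is the wrapper function: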
grubbs.flag <- function(x) {
# let's create an empty outliers variable; we'll add to it
# as we identify outliers
outliers <- NULL
# we'll create a variable called test holding the values we are testing
test <- x
# now using the outliers package, use grubbs.test to find outliers in our variable
grubbs.result <- grubbs.test(test)
# let's get the p-value of the test
pv <- grubbs.result$p.value
# now let's keep looping while the test flags a significant outlier
while(pv < 0.05) {
# while the p-value is below 0.05, add the flagged value to our outliers vector
outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ") [[1]][3]))
# now we want to remove those outliers from our test variable
test <- x[!x %in% outliers]
# and run the grubbs test again without the outliers
grubbs.result <- grubbs.test(test)
# and save the new p values
pv <- grubbs.result$p.value
}
return(data.frame(x = x, Outliers = (x %in% outliers)))
}
identified_outliers <- grubbs.flag(Air_data$AH)
Now we can create a histogram showing where the outliers were
ggplot(grubbs.flag(Air_data$AH), aes(x = x, color = Outliers, fill = Outliers)) +
geom_histogram(bins = 30) +
theme_bw()
library(ggplot2)
ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
It's hard to see the differences in distribution because the counts differ so much.
ggplot(diamonds) +
geom_bar(mapping = aes(x = cut))
To make the comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one.
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), binwidth = 500)
It appears that fair diamonds (the lowest cut quality) have the highest average price. But maybe that’s because the frequency polygons are a little hard to interpret.
Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution: it shows the median, a box spanning the interquartile range, whiskers extending to the most extreme points within 1.5 times the IQR, and individual points beyond that.
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()
We see much less information about the distribution, but the boxplots are much more compact, so we can more easily compare them. It supports the counterintuitive finding that better quality diamonds are cheaper on average!
Let's look at some car data.
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
geom_boxplot()
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))
If you have long variable names, you can switch the axes by flipping the plot 90 degrees.
ggplot(data = mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
coord_flip()
To visualize the correlation between two continuous variables, we can use a scatterplot.
ggplot(data = diamonds) +
geom_point(mapping =aes(x = carat, y = price))
Scatterplots become less useful as the size of your dataset grows, because points begin to overplot. We can fix this using the alpha aesthetic.
ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)
First let's load the required libraries.
library(RCurl)
##
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
##
## complete
library(dplyr)
Now let's get our data.
site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
College_Data <- read.csv(site)
First let's use the str() function, which shows the structure of the object.
str(College_Data)
## 'data.frame': 1948 obs. of 9 variables:
## $ date : chr "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
## $ state : chr "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ county : chr "Madison" "Montgomery" "Limestone" "Lee" ...
## $ city : chr "Huntsville" "Montgomery" "Athens" "Auburn" ...
## $ ipeds_id : chr "100654" "100724" "100812" "100858" ...
## $ college : chr "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
## $ cases : int 41 2 45 2742 220 4 263 137 49 76 ...
## $ cases_2021: int NA NA 10 567 80 NA 49 53 10 35 ...
## $ notes : chr "" "" "" "" ...
What if we want to arrange our dataset alphabetically by college?
alphabetical <- College_Data %>%
arrange(college)
The glimpse() function is another way to preview data.
glimpse(College_Data)
## Rows: 1,948
## Columns: 9
## $ date <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…
We can also subset with select()
College_Cases <- select(College_Data, college, cases)
We can also filter our subset with the filter function.
Louisiana_Cases <- filter(College_Data, state == "Louisiana")
Let's filter for a smaller set of states.
South_Cases <- filter(College_Data, state %in% c("Louisiana", "Texas", "Arkansas", "Mississippi"))
Let's look at some time series data.
First we’ll load the required libraries.
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
##
## viridis_pal
Now let's load some data.
state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
State_Data <- read.csv(state_site)
Let's create a grouped data frame using the state column.
state_cases <- group_by(State_Data, state)
class(state_cases)
## [1] "grouped_df" "tbl_df" "tbl" "data.frame"
How many measurements were made by state? This gives us an idea of when states started reporting.
Days_since_first_reported <- tally(state_cases)
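Since the data frame is grouped by state, summaries are now computed per group. For example, a sketch of pulling each state's latest cumulative case count (the numbers depend on when the file was downloaded):
state_cases %>%
summarise(latest_cases = max(cases)) %>%
head()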
Let's visualize some data.
options(repr.plot.width = 6, repr.plot.height = 6)
class(College_Data)
## [1] "data.frame"
head(College_Data)
## date state county city ipeds_id
## 1 2021-05-26 Alabama Madison Huntsville 100654
## 2 2021-05-26 Alabama Montgomery Montgomery 100724
## 3 2021-05-26 Alabama Limestone Athens 100812
## 4 2021-05-26 Alabama Lee Auburn 100858
## 5 2021-05-26 Alabama Montgomery Montgomery 100830
## 6 2021-05-26 Alabama Walker Jasper 102429
## college cases cases_2021 notes
## 1 Alabama A&M University 41 NA
## 2 Alabama State University 2 NA
## 3 Athens State University 45 10
## 4 Auburn University 2742 567
## 5 Auburn University at Montgomery 220 80
## 6 Bevill State Community College 4 NA
summary(College_Data)
## date state county city
## Length:1948 Length:1948 Length:1948 Length:1948
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## ipeds_id college cases cases_2021
## Length:1948 Length:1948 Min. : 0.0 Min. : 0.0
## Class :character Class :character 1st Qu.: 32.0 1st Qu.: 23.0
## Mode :character Mode :character Median : 114.5 Median : 65.0
## Mean : 363.5 Mean : 168.1
## 3rd Qu.: 303.0 3rd Qu.: 159.0
## Max. :9914.0 Max. :3158.0
## NA's :337
## notes
## Length:1948
## Class :character
## Mode :character
##
##
##
##
Now let's take a look at a different dataset.
iris <- as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
Let's start by creating a scatterplot of the college data.
ggplot(data = College_Data, aes(x = cases, y = cases_2021)) +
geom_point() +
theme_minimal()
## Warning: Removed 337 rows containing missing values (geom_point).
Now let's do the iris data.
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
geom_point() +
theme_minimal()
Now let's color-code our college data by state.
ggplot(data = College_Data, aes(x = cases, y = cases_2021, color = state)) +
geom_point() +
theme_minimal()
## Warning: Removed 337 rows containing missing values (geom_point).
Let's color-code the iris data by species.
ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
geom_point() +
theme_minimal()
Let's run a simple histogram of our Louisiana case data.
hist(Louisiana_Cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total Cases", ylab = "Frequency", main = "Total College Covid-19 Infections (Louisiana)")
Let's run a simple histogram for the iris data.
hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal Width", ylab = "Frequency", main = "Iris Sepal Width")
Now let's build the same histogram in ggplot, colored by county.
histogram_college <- ggplot(data = Louisiana_Cases, aes(x = cases))
histogram_college + geom_histogram(binwidth = 100, color = "black", aes(fill = county)) + xlab("Cases") + ylab("Frequency") + ggtitle("Histogram of Covid 19 Cases in Louisiana")
Let's create a ggplot histogram for the iris data.
histogram_iris <- ggplot(data = iris, aes(x = Sepal.Width))
histogram_iris + geom_histogram(binwidth = 0.2, color = "black", aes(fill = Species)) + xlab("Sepal Width") + ylab("Frequency") + ggtitle("Histogram | Iris Sepal Width by Species")
Maybe a density plot makes more sense for our college data.
ggplot(South_Cases) +
geom_density(aes(x = cases, fill = state), alpha = 0.25)
Let's do it with the iris data.
ggplot(iris) +
geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)
Now let's try violin plots, starting with the iris data.
ggplot(data = iris, aes(x = Species, y = Sepal.Length, color = Species)) +
geom_violin() +
theme_classic()
Now let's try the southern states data.
ggplot(data = South_Cases, aes(x = state, y = cases, color = state)) +
geom_violin() +
theme_grey() +
theme(legend.position = "none")
Now let's take a look at residual plots. A residual plot displays the residuals on the vertical axis and the fitted values (or the independent variable) on the horizontal axis. If the points in a residual plot are dispersed in a random manner around the horizontal axis, a linear regression is appropriate. If they are not randomly dispersed, a nonlinear model is more appropriate.
Let's start with the iris data.
ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
geom_point(aes(x = .fitted, y = .resid))
Now let's look at the southern states' cases.
ggplot(lm(cases ~ cases_2021, data = South_Cases)) +
geom_point(aes(x = .fitted, y = .resid))
A linear model is not a good call for the state cases.
Now let's do some correlations.
library(readr)
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
obesity <- read_csv("~/Desktop/classroom/starter/Obesity_insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Let's look at the structure of the dataset.
str(obesity)
## spec_tbl_df [1,338 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:1338] 19 18 28 33 32 31 46 37 37 60 ...
## $ sex : chr [1:1338] "female" "male" "male" "male" ...
## $ bmi : num [1:1338] 27.9 33.8 33 22.7 28.9 ...
## $ children: num [1:1338] 0 1 3 0 0 0 1 3 2 0 ...
## $ smoker : chr [1:1338] "yes" "no" "no" "no" ...
## $ region : chr [1:1338] "southwest" "southeast" "southeast" "northwest" ...
## $ charges : num [1:1338] 16885 1726 4449 21984 3867 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. bmi = col_double(),
## .. children = col_double(),
## .. smoker = col_character(),
## .. region = col_character(),
## .. charges = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Let's look at the class of the object.
class(obesity)
## [1] "spec_tbl_df" "tbl_df" "tbl" "data.frame"
And get a summary of the distribution of the variables.
summary(obesity)
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charges
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
Now let's look at the distribution of insurance charges.
hist(obesity$charges)
We can also get an idea of the distribution using a boxplot.
boxplot(obesity$charges)
boxplot(obesity$bmi)
Now let's look at correlation. The cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or two data frames. The cov() command, on the other hand, examines the covariance. The cor.test() command carries out a test of the significance of the correlation.
cor(obesity$charges, obesity$bmi)
## [1] 0.198341
By default cor() uses the Pearson correlation; you can use Spearman's rho or Kendall's tau by specifying the method argument.
cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397
The correlation coefficient measures the strength of association on a scale from -1 to 1.
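As a sketch of the cor.test() command mentioned above, testing whether the charges-bmi correlation is significant:
cor.test(obesity$charges, obesity$bmi)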
Now let's look at the Tietjen-Moore test. This is used for univariate datasets: it detects a specified number k of outliers in a dataset assumed to come from a normal distribution.
TietjenMoore <- function(dataSeries, k)
{
n = length(dataSeries)
## compute the absolute residuals
r = abs(dataSeries - mean(dataSeries))
##sort data according to size of residual
df = data.frame(dataSeries, r)
dfs = df[order(df$r),]
## create a subset of the data without the largest values
klarge = c((n-k+1):n)
subdataSeries = dfs$dataSeries[-klarge]
## compute the sums of squares
ksub = (subdataSeries - mean(subdataSeries))**2
all = (df$dataSeries - mean(df$dataSeries))**2
## compute the test statistic
sum(ksub)/sum(all)
}
This function computes the absolute residuals, sorts the data according to the size of the residuals, and then computes the test statistic as a ratio of sums of squares. Next we need the critical value, which we obtain by simulation.
FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.05){
ek <- TietjenMoore(dataSeries, k)
#compute critical values based on simulation
test = c(1:10000)
for (i in 1:10000) {
dataSeriesdataSeries = rnorm(length(dataSeries))
test[i] = TietjenMoore(dataSeriesdataSeries, k)}
Talpha = quantile(test, alpha)
list(T = ek, Talpha = Talpha)
}
This function computes the critical values based on simulated data. Now let's demonstrate these functions on the obesity dataset.
The critical region for the Tietjen-Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and computing the Tietjen-Moore test statistic. Typically, 10,000 random samples are used. The value of the Tietjen-Moore statistic obtained from the data is compared to this reference distribution. The value of the test statistic is between 0 and 1. If there are no outliers in the data, the test statistic is close to 1. If there are outliers, the test statistic will be closer to 0. Thus, the test is always a lower, one-tailed test regardless of which test statistic is used, Lk or Ek.
First, we will look at charges.
boxplot(obesity$charges)
FindOutliersTietjenMooreTest(obesity$charges, 50)
Let's check out bmi.
boxplot(obesity$bmi)
FindOutliersTietjenMooreTest(obesity$bmi, 7)
#### Probability Plots
library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:scales':
##
## rescale
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:ggplot2':
##
## stat
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
## http://homerhanumat.github.io/tigerstats
We will use the normal probability functions, starting with dnorm, the density function of the normal distribution. Using the density, it is possible to determine the relative likelihood of values. For example, you may wonder "what is the likelihood that a person has a BMI of exactly ___?" In this case, you would need to retrieve the density of the BMI distribution at that value. We can model the BMI distribution as a normal distribution with the sample mean and standard deviation:
bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)
Let's create a plot of our normal distribution.
bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)
ggplot(bmi.df, aes(x = bmi, y = Density)) +
geom_point()
This gives us the density at every observed value.
Now let's use the pnorm function, which gives the cumulative probability of each value (the chance of a BMI at or below it).
bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Probability" = bmi.dist)
ggplot(bmi.df, aes(x = bmi, y = Probability)) +
geom_point()
What if we want to find the probability of the bmi being greater than 40 in our distribution?
pp_greater <- function(x) {
paste(round(100 * pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}
pp_greater(40)
## [1] "6.29 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.06287869
What about the probability that a bmi is less than 40 in our population?
pp_less <- function(x) {
paste(round(100 * (1 - pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)), 2), "%")
}
pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.9371213
What if we want to find the area in between?
pnormGC(c(20,40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)
## [1] 0.8969428
What if we want to know the quantiles? Let's use the qnorm function. We need to assume a normal distribution for this.
What bmi represents the lowest 1% of the population?
qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769
What if you want a random sampling of values within your distribution?
subset <- rnorm(50, mean = 30.66339, sd = 6.09818)
hist(subset)
subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)
hist(subset2)
#### Shapiro-Wilk Test
So now that we know how to generate a normal distribution, how do we tell if our samples came from one?
shapiro.test(obesity$charges[1:5])
##
## Shapiro-Wilk normality test
##
## data: obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695
You can see here that with a small sample size we fail to reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size.
shapiro.test(obesity$charges[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16
Now let's check out age.
shapiro.test(obesity$age[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16
And lastly, bmi
shapiro.test(obesity$bmi[1:1000])
##
## Shapiro-Wilk normality test
##
## data: obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426
#### Time Series Data

First let's load our packages.
library(readr)
library(readxl)
Air_data <- read_xlsx("~/Desktop/classroom/starter/AirQualityUCI.xlsx")
The variables (matching the column names shown by str() below):

- Date: date of measurement
- Time: time of measurement
- CO(GT): average hourly CO concentration
- PT08.S1(CO): tin oxide hourly average sensor response
- NMHC(GT): average hourly non-methane hydrocarbon concentration
- C6H6(GT): average hourly benzene concentration
- PT08.S2(NMHC): titania hourly average sensor response
- NOx(GT): average hourly NOx concentration
- NO2(GT): average hourly NO2 concentration
- T: temperature
- RH: relative humidity
- AH: absolute humidity
str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
## $ Date : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
## $ Time : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
## $ CO(GT) : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
## $ PT08.S1(CO) : num [1:9357] 1360 1292 1402 1376 1272 ...
## $ NMHC(GT) : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
## $ C6H6(GT) : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
## $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
## $ NOx(GT) : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
## $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
## $ NO2(GT) : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
## $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
## $ PT08.S5(O3) : num [1:9357] 1268 972 1074 1203 1110 ...
## $ T : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
## $ RH : num [1:9357] 48.9 47.7 54 60 59.6 ...
## $ AH : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
library(ggplot2)
Let's get rid of the date in the Time column.
Air_data$Time <- as_hms(Air_data$Time)
glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)` <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)` <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)` <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)` <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)` <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)` <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)` <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)` <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)` <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")
Notice we have an outlier in our data.
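Those points near -200 are not real measurements: this dataset codes missing values as -200. A sketch of recoding the two humidity columns to NA before analysis (the code below keeps using the raw data, so this is just an aside):
Air_data_clean <- Air_data %>%
mutate(RH = ifelse(RH == -200, NA, RH),
AH = ifelse(AH == -200, NA, AH))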
t.test(Air_data$RH, Air_data$AH)
##
## Welch Two Sample t-test
##
## data: Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 45.01707 47.62536
## sample estimates:
## mean of x mean of y
## 39.483611 -6.837604
First we'll look at the unnest_tokens function.
Let's start by looking at an Emily Dickinson passage.
text <- c("Because I could not stop from Death =",
"He kindly stopped for me -",
"The Carriage held but just Ourselves -",
"and Immortality")
text
## [1] "Because I could not stop from Death ="
## [2] "He kindly stopped for me -"
## [3] "The Carriage held but just Ourselves -"
## [4] "and Immortality"
This is a typical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.
library(dplyr)
text_df <- tibble(line = 1:4, text = text)
text_df
## # A tibble: 4 × 2
## line text
## <int> <chr>
## 1 1 Because I could not stop for Death -
## 2 2 He kindly stopped for me -
## 3 3 The Carriage held but just Ourselves -
## 4 4 and Immortality
Reminder: a tibble is a modern class of data frame within R, available in the dplyr and tibble packages. It has a convenient print method, does not convert strings to factors, and does not use row names. Tibbles are great for use with tidy tools.
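A quick toy illustration of the tibble print method:
tibble(x = 1:3, txt = c("a", "b", "c"))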
Next we will use the unnest_tokens function.
Its first argument is the name of the output column that the text will be unnested into (here, word), and the second is the input column the text comes from.
library(tidytext)
text_df %>%
unnest_tokens(word, text)
## # A tibble: 20 × 2
## line word
## <int> <chr>
## 1 1 because
## 2 1 i
## 3 1 could
## 4 1 not
## 5 1 stop
## 6 1 for
## 7 1 death
## 8 2 he
## 9 2 kindly
## 10 2 stopped
## 11 2 for
## 12 2 me
## 13 3 the
## 14 3 carriage
## 15 3 held
## 16 3 but
## 17 3 just
## 18 3 ourselves
## 19 4 and
## 20 4 immortality
Let's use the janeaustenr package to analyze some Jane Austen texts. There are 6 books in this package.
library(janeaustenr)
library(dplyr)
library(stringr)
original_books <- austen_books() %>%
group_by(book) %>%
mutate(linenumber = row_number(),
chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
ignore_case = TRUE)))) %>%
ungroup()
original_books
## # A tibble: 73,422 × 4
## text book linenumber chapter
## <chr> <fct> <int> <int>
## 1 "SENSE AND SENSIBILITY" Sense & Sensibility 1 0
## 2 "" Sense & Sensibility 2 0
## 3 "by Jane Austen" Sense & Sensibility 3 0
## 4 "" Sense & Sensibility 4 0
## 5 "(1811)" Sense & Sensibility 5 0
## 6 "" Sense & Sensibility 6 0
## 7 "" Sense & Sensibility 7 0
## 8 "" Sense & Sensibility 8 0
## 9 "" Sense & Sensibility 9 0
## 10 "CHAPTER 1" Sense & Sensibility 10 1
## # … with 73,412 more rows
To work with this as a tidy dataset, we need to restructure it in the one-token-per-row format, which as we saw earlier is done with the unnest_tokens() function.
library(tidytext)
tidy_books <- original_books %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 725,055 × 4
## book linenumber chapter word
## <fct> <int> <int> <chr>
## 1 Sense & Sensibility 1 0 sense
## 2 Sense & Sensibility 1 0 and
## 3 Sense & Sensibility 1 0 sensibility
## 4 Sense & Sensibility 3 0 by
## 5 Sense & Sensibility 3 0 jane
## 6 Sense & Sensibility 3 0 austen
## 7 Sense & Sensibility 5 0 1811
## 8 Sense & Sensibility 10 1 chapter
## 9 Sense & Sensibility 10 1 1
## 10 Sense & Sensibility 13 1 the
## # … with 725,045 more rows
This function uses the tokenizers package to separate each line of text in the original dataframe into tokens. The default tokenizing is for words, but other options include characters, n-grams, sentences, lines, or paragraphs.
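For example, a sketch of tokenizing the same poem into bigrams instead of single words:
text_df %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)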
Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr. Often in text analysis we will want to remove stop words: words that are not useful for an analysis, such as "the", "of", "to", and "and".
We can remove stop words (kept in the tidytext dataset ‘stop_words’) with an anti_join()
data(stop_words)
tidy_books <- tidy_books %>%
anti_join(stop_words)
## Joining, by = "word"
The stop words dataset in the tidytext package contains stop words from three lexicons. We can use them all together, as we have here, or use filter() to select only one set of stop words if that's more appropriate for your analysis.
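For example, a sketch of keeping only the snowball lexicon:
snowball_stops <- stop_words %>%
filter(lexicon == "snowball")
Back to our tidied books, let's count the most common words.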
tidy_books %>%
count(word, sort = TRUE)
## # A tibble: 13,914 × 2
## word n
## <chr> <int>
## 1 miss 1855
## 2 time 1337
## 3 fanny 862
## 4 dear 822
## 5 lady 817
## 6 sir 806
## 7 day 797
## 8 emma 787
## 9 sister 727
## 10 house 699
## # … with 13,904 more rows
Because we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most common words.
library(ggplot2)
tidy_books %>%
count(word, sort = TRUE) %>%
filter(n > 600) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(y = NULL, x = "word count")
#### The gutenbergr package
This package provides access to the public domain works from Project Gutenberg (www.gutenberg.org). It includes tools both for downloading books and for searching a complete dataset of Project Gutenberg metadata to find works of interest. We will mostly use the function gutenberg_download().
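For example, a sketch of searching the bundled metadata for an author with gutenberg_works(), which filters to cataloged, downloadable works:
library(gutenbergr)
gutenberg_works(author == "Darwin, Charles")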
#### Word frequencies
Let's look at some biology texts, starting with Darwin.
- The Voyage of the Beagle - 944
- On the origin of species by the means of natural selection - 1228
- The expression of emotions in man and animals - 1227
- The descent of man, and selection in relation to sex - 2300
We can access these works using gutenberg_download() and the Project Gutenberg ID numbers.
library(gutenbergr)
darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
Let's break these into tokens.
tidy_darwin <- darwin %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
Let's check out what the most common Darwin words are.
tidy_darwin %>%
count(word, sort = TRUE)
## # A tibble: 23,630 × 2
## word n
## <chr> <int>
## 1 species 2998
## 2 male 1672
## 3 males 1337
## 4 animals 1310
## 5 birds 1292
## 6 female 1197
## 7 sexes 1095
## 8 females 1038
## 9 selection 1038
## 10 sexual 801
## # … with 23,620 more rows
Now let's get some work from Thomas Hunt Morgan, who won a Nobel Prize for discoveries concerning the role the chromosome plays in heredity.
- Regeneration - 57198
- The genetic and operative evidence relating to secondary sexual characteristics - 57460
- Evolution and Adaptation - 63540
morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
Let's tokenize THM.
tidy_morgan <- morgan %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
What are THM’s most common words?
tidy_morgan %>%
count(word, sort = TRUE)
## # A tibble: 13,855 × 2
## word n
## <chr> <int>
## 1 species 869
## 2 regeneration 814
## 3 piece 702
## 4 cut 669
## 5 male 668
## 6 forms 631
## 7 selection 604
## 8 cells 576
## 9 found 552
## 10 development 546
## # … with 13,845 more rows
Lastly, let's look at Thomas Henry Huxley.
- Evidence as to man's place in nature - 2931
- On the reception of the Origin of Species - 2089
- Evolution and Ethics, and other essays - 2940
- Science and Culture, and other essays - 52344
huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
tidy_huxley <- huxley %>%
unnest_tokens(word, text) %>%
anti_join(stop_words)
## Joining, by = "word"
tidy_huxley %>%
count(word, sort = TRUE)
## # A tibble: 16,090 × 2
## word n
## <chr> <int>
## 1 species 339
## 2 nature 331
## 3 time 287
## 4 life 286
## 5 existence 255
## 6 knowledge 238
## 7 animals 227
## 8 natural 223
## 9 animal 216
## 10 science 207
## # … with 16,080 more rows
Now let's calculate the frequency of each word across the works of Darwin, Morgan, and Huxley by binding the data frames together.
library(tidyr)
frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
mutate(tidy_darwin, author = "Charles Darwin"),
mutate(tidy_huxley, author = "Thomas Henry Huxley")) %>%
mutate(word = str_extract(word, "[a-z']+")) %>%
count(author, word) %>%
group_by(author) %>%
mutate(proportion = n/ sum(n)) %>%
select(-n) %>%
pivot_wider(names_from = author, values_from = proportion) %>%
pivot_longer(`Thomas Hunt Morgan`:`Charles Darwin`, names_to = "author", values_to = "proportion")
frequency
## # A tibble: 95,895 × 3
## word author proportion
## <chr> <chr> <dbl>
## 1 a Thomas Hunt Morgan 0.00206
## 2 a Thomas Henry Huxley 0.0000856
## 3 a Charles Darwin 0.000141
## 4 ab Thomas Hunt Morgan 0.000165
## 5 ab Thomas Henry Huxley 0.0000978
## 6 ab Charles Darwin 0.00000642
## 7 abaiss Thomas Hunt Morgan NA
## 8 abaiss Thomas Henry Huxley NA
## 9 abaiss Charles Darwin 0.00000642
## 10 abandon Thomas Hunt Morgan 0.00000752
## # … with 95,885 more rows
Now we need to reshape the table so that each author has its own column.
frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportion)
frequency2
## # A tibble: 31,965 × 4
## word `Thomas Hunt Morgan` `Thomas Henry Huxley` `Charles Darwin`
## <chr> <dbl> <dbl> <dbl>
## 1 a 0.00206 0.0000856 0.000141
## 2 ab 0.000165 0.0000978 0.00000642
## 3 abaiss NA NA 0.00000642
## 4 abandon 0.00000752 0.0000122 0.00000321
## 5 abandoned 0.0000150 0.0000245 0.00000321
## 6 abashed NA NA 0.00000321
## 7 abatement NA 0.0000245 0.00000321
## 8 abbot NA 0.0000245 0.00000321
## 9 abbott NA NA 0.00000642
## 10 abbreviated NA NA 0.0000128
## # … with 31,955 more rows
Now let's plot.
library(scales)
ggplot(frequency2, aes(x = `Charles Darwin`, y = `Thomas Hunt Morgan`, color = abs(`Charles Darwin` - `Thomas Hunt Morgan`))) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001),
low = "darkslategray4", high = "gray75") +
theme(legend.position = "none") +
labs(y = "Thomas Hunt Morgan", x = "Charles Darwin")
## Warning: Removed 24513 rows containing missing values (geom_point).
## Warning: Removed 24514 rows containing missing values (geom_text).
#### The Sentiments datasets
There are a variety of methods and dictionaries for evaluating the opinion or emotion of text. Three general-purpose lexicons are AFINN, bing, and nrc:

- AFINN assigns each word a score between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment.
- bing categorizes words in a binary fashion into positive or negative.
- nrc categorizes words as positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust.
The function get_sentiments() lets us retrieve a specific sentiment lexicon with the measure for each word.
library(tidytext)
library(textdata)
afinn <- get_sentiments("afinn")
afinn
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
Lets look at bing
bing <- get_sentiments("bing")
bing
## # A tibble: 6,786 × 2
## word sentiment
## <chr> <chr>
## 1 2-faces negative
## 2 abnormal negative
## 3 abolish negative
## 4 abominable negative
## 5 abominably negative
## 6 abominate negative
## 7 abomination negative
## 8 abort negative
## 9 aborted negative
## 10 aborts negative
## # … with 6,776 more rows
And lastly, nrc
nrc <- get_sentiments("nrc")
nrc
## # A tibble: 13,872 × 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,862 more rows
These lexicons were created either via crowdsourcing (for example, Amazon Mechanical Turk) or by the labor of one of the authors, and then validated with crowdsourcing.
Let's look at the words with a joy sentiment from NRC.
library(gutenbergr)
library(dplyr)
library(stringr)
darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
tidy_books <- darwin %>%
group_by(gutenberg_id) %>%
mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
ungroup() %>%
unnest_tokens(word, text)
tidy_books
## # A tibble: 786,575 × 4
## gutenberg_id linenumber chapter word
## <int> <int> <int> <chr>
## 1 944 1 0 the
## 2 944 1 0 voyage
## 3 944 1 0 of
## 4 944 1 0 the
## 5 944 1 0 beagle
## 6 944 1 0 by
## 7 944 2 0 charles
## 8 944 2 0 darwin
## 9 944 8 0 about
## 10 944 8 0 the
## # … with 786,565 more rows
Let's add the book names instead of the Gutenberg IDs.
colnames(tidy_books)[1] <- "book"
tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
tidy_books
## # A tibble: 786,575 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # … with 786,565 more rows
Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First let's use NRC.
nrc_joy <- get_sentiments("nrc") %>%
filter(sentiment == "joy")
tidy_books %>%
filter(book == "The Voyage of the Beagle") %>%
inner_join(nrc_joy) %>%
count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 277 × 2
## word n
## <chr> <int>
## 1 found 301
## 2 good 161
## 3 remarkable 114
## 4 green 95
## 5 kind 92
## 6 tree 86
## 7 present 85
## 8 food 78
## 9 beautiful 61
## 10 elevation 60
## # … with 267 more rows
We can also examine how sentiment changes throughout a work.
library(tidyr)
charles_darwin_sentiment <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(book, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
Now lets plot it.
library(ggplot2)
ggplot(charles_darwin_sentiment, aes(index, sentiment, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free_x")
Lets compare the three sentiment dictionaries
There are several options for sentiment lexicons, so you might want some more info on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of The Voyage of the Beagle.
library(tidyr)
voyage <- tidy_books %>%
filter(book == "The Voyage of the Beagle")
voyage
## # A tibble: 208,118 × 4
## book linenumber chapter word
## <chr> <int> <int> <chr>
## 1 The Voyage of the Beagle 1 0 the
## 2 The Voyage of the Beagle 1 0 voyage
## 3 The Voyage of the Beagle 1 0 of
## 4 The Voyage of the Beagle 1 0 the
## 5 The Voyage of the Beagle 1 0 beagle
## 6 The Voyage of the Beagle 1 0 by
## 7 The Voyage of the Beagle 2 0 charles
## 8 The Voyage of the Beagle 2 0 darwin
## 9 The Voyage of the Beagle 8 0 about
## 10 The Voyage of the Beagle 8 0 the
## # … with 208,108 more rows
Lets again use integer division (%/%) to define larger sections of the text that span multiple lines, and we can use the same pattern with count(), pivot_wider(), and mutate() to find the net sentiment in each of these sections of text.
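As a quick sanity check of how %/% buckets the lines (chunk numbering starts at 0):
173 %/% 80
## [1] 2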
affin <- voyage %>%
inner_join(get_sentiments("afinn")) %>%
group_by(index = linenumber %/% 80) %>%
summarise(sentiment = sum(value)) %>%
mutate(method = "AFINN")
## Joining, by = "word"
bing_and_nrc <- bind_rows(
voyage %>%
inner_join(get_sentiments("bing")) %>%
mutate(method = "Bing et al."),
voyage %>%
inner_join(get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative"))
) %>%
mutate(method = "NRC")) %>%
count(method, index = linenumber %/% 80, sentiment) %>%
pivot_wider(names_from = sentiment,
values_from = n,
values_fill = 0) %>%
mutate(sentiment = positive - negative)
## Joining, by = "word"
## Joining, by = "word"
We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexicon (dictionary). Lets bind them all together and visualize with ggplot.
bind_rows(affin, bing_and_nrc) %>%
ggplot(aes(index, sentiment, fill = method)) +
geom_col(show.legend = FALSE) +
facet_wrap(~method, ncol = 1)
Lets look at the counts based on each dictionary
get_sentiments("nrc") %>%
filter(sentiment %in% c("positive", "negative")) %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 3316
## 2 positive 2308
get_sentiments("bing") %>%
count(sentiment)
## # A tibble: 2 × 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
bing_word_counts
## # A tibble: 2,492 × 3
## word sentiment n
## <chr> <chr> <int>
## 1 great positive 1226
## 2 well positive 855
## 3 like positive 813
## 4 good positive 487
## 5 doubt negative 414
## 6 wild negative 317
## 7 respect positive 310
## 8 remarkable positive 295
## 9 important positive 281
## 10 bright positive 258
## # … with 2,482 more rows
This can be shown visually, and we can pipe straight into ggplot2.
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to Sentiment", y = NULL)
Lets spot an anomaly in the dataset: words like “wild” and “great” carry sentiment in the lexicon, but Darwin mostly uses them descriptively, so lets add them to a custom stop-word list.
custom_stop_words <- bind_rows(tibble(word = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)
custom_stop_words
## # A tibble: 1,153 × 2
## word lexicon
## <chr> <chr>
## 1 wild custom
## 2 dark custom
## 3 great custom
## 4 like custom
## 5 a SMART
## 6 a's SMART
## 7 able SMART
## 8 about SMART
## 9 above SMART
## 10 according SMART
## # … with 1,143 more rows
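The custom list isn’t applied above, but it would be used exactly like stop_words; a quick sketch:
tidy_books %>%
anti_join(custom_stop_words, by = "word") %>%
count(word, sort = TRUE)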
Word Clouds!
We can see that tidy text mining and sentiment analysis work well with ggplot2, but having our data in tidy format also opens up other nice graphing techniques.
Lets use the wordcloud package
library(wordcloud)
## Loading required package: RColorBrewer
tidy_books %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 100))
## Joining, by = "word"
Lets also look at comparison.cloud(), which requires turning the dataframe into a matrix.
We can do that with the acast() function from reshape2.
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tigerstats':
##
## tips
## The following object is masked from 'package:tidyr':
##
## smiths
tidy_books %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
acast(word ~ sentiment, value.var = "n", fill = 0) %>%
comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining, by = "word"
Looking at units beyond words
Lots of useful work can be done by tokenizing at the word level, but sometimes it is nice to look at different units of text, beyond just unigrams.
For example, in the sentence “I am not having a good day”, word-level tokenization would count “good” as positive and miss the negation.
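If we want to work at the sentence level, unnest_tokens() can tokenize by sentences instead of single words; a quick sketch on the raw darwin download:
darwin %>%
unnest_tokens(sentence, text, token = "sentences") %>%
head()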
bingnegative <- get_sentiments("bing") %>%
filter(sentiment == "negative")
wordcounts <- tidy_books %>%
group_by(book, chapter) %>%
summarize(words = n())
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
tidy_books %>%
semi_join(bingnegative) %>%
group_by(book, chapter) %>%
summarize(negativewords = n()) %>%
left_join(wordcounts, by = c("book", "chapter")) %>%
mutate(ratio = negativewords/words) %>%
filter(chapter !=0) %>%
slice_max(ratio, n = 1) %>%
ungroup()
## Joining, by = "word"
## `summarise()` has grouped output by 'book'. You can override using the
## `.groups` argument.
## # A tibble: 4 × 5
## book chapter negat…¹ words ratio
## <chr> <int> <int> <int> <dbl>
## 1 On the Origin of Species By Means of Natural Sel… 3 5 86 0.0581
## 2 The Descent of Man, and Selection in Relation to… 20 4 87 0.0460
## 3 The Expression of the Emotions in Man and Animals 10 249 4220 0.0590
## 4 The Voyage of the Beagle 10 375 11202 0.0335
## # … with abbreviated variable name ¹negativewords
So far we’ve only looked at single words, but many interesting (and more accurate) analyses are based on the relationships between words.
Lets look at some of tidytext’s methods for calculating and visualizing word relationships.
library(dplyr)
library(tidytext)
library(gutenbergr)
darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
colnames(darwin_books)[1] <- "book"
darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
darwin_bigrams <- darwin_books %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
darwin_bigrams
## # A tibble: 724,531 × 2
## book bigram
## <chr> <chr>
## 1 The Voyage of the Beagle the voyage
## 2 The Voyage of the Beagle voyage of
## 3 The Voyage of the Beagle of the
## 4 The Voyage of the Beagle the beagle
## 5 The Voyage of the Beagle beagle by
## 6 The Voyage of the Beagle charles darwin
## 7 The Voyage of the Beagle <NA>
## 8 The Voyage of the Beagle <NA>
## 9 The Voyage of the Beagle <NA>
## 10 The Voyage of the Beagle <NA>
## # … with 724,521 more rows
This data is still in tidy text format, structured as one-token-per-row; each token is a bigram.
Counting and filtering n-grams
darwin_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 238,516 × 2
## bigram n
## <chr> <int>
## 1 of the 11297
## 2 <NA> 8947
## 3 in the 5257
## 4 on the 4093
## 5 to the 2849
## 6 the same 2048
## 7 that the 1947
## 8 it is 1830
## 9 of a 1610
## 10 and the 1590
## # … with 238,506 more rows
Most of the common bigrams are stop-words. This is a good time to use tidyr’s separate() command, which splits a column into multiple columns based on a delimiter. This will let us make a column for word one and word two.
library(tidyr)
bigrams_separated <- darwin_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
Now lets unite the filtered words back into bigrams.
bigram_counts <- bigrams_filtered %>%
unite(bigram, word1, word2, sep = " ")
bigram_counts
## # A tibble: 94,896 × 2
## book bigram
## <chr> <chr>
## 1 The Voyage of the Beagle charles darwin
## 2 The Voyage of the Beagle NA NA
## 3 The Voyage of the Beagle NA NA
## 4 The Voyage of the Beagle NA NA
## 5 The Voyage of the Beagle NA NA
## 6 The Voyage of the Beagle NA NA
## 7 The Voyage of the Beagle online edition
## 8 The Voyage of the Beagle NA NA
## 9 The Voyage of the Beagle degree symbol
## 10 The Voyage of the Beagle degs italics
## # … with 94,886 more rows
We may also be interested in trigrams, which are three-word combinations.
trigrams <- darwin_books %>%
unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
trigrams
## # A tibble: 19,971 × 4
## word1 word2 word3 n
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 9884
## 2 tierra del fuego 92
## 3 secondary sexual characters 91
## 4 captain fitz roy 45
## 5 closely allied species 30
## 6 de la physionomie 30
## 7 domestication vol ii 26
## 8 vol ii pp 22
## 9 vertebrates vol iii 21
## 10 proc zoolog soc 18
## # … with 19,961 more rows
Lets analyze some bigrams
bigrams_filtered %>%
filter(word2 == "selection") %>%
count(book, word1, sort = TRUE)
## # A tibble: 39 × 3
## book word1 n
## <chr> <chr> <int>
## 1 The Descent of Man, and Selection in Relation to Sex sexual 254
## 2 On the Origin of Species By Means of Natural Selection natural 250
## 3 The Descent of Man, and Selection in Relation to Sex natural 156
## 4 On the Origin of Species By Means of Natural Selection sexual 18
## 5 On the Origin of Species By Means of Natural Selection continued 6
## 6 The Descent of Man, and Selection in Relation to Sex unconscious 6
## 7 On the Origin of Species By Means of Natural Selection methodical 5
## 8 The Descent of Man, and Selection in Relation to Sex continued 5
## 9 On the Origin of Species By Means of Natural Selection unconscious 4
## 10 The Expression of the Emotions in Man and Animals natural 4
## # … with 29 more rows
Lets again look at tf-idf, this time for bigrams across Darwin’s works.
bigram_tf_idf <- bigram_counts %>%
count(book, bigram) %>%
bind_tf_idf(bigram, book, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 60,595 × 6
## book bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 The Expression of the Emotions in Man and… nerve… 47 0.00350 1.39 0.00485
## 2 On the Origin of Species By Means of Natu… natur… 250 0.0160 0.288 0.00460
## 3 The Expression of the Emotions in Man and… la ph… 35 0.00260 1.39 0.00361
## 4 The Voyage of the Beagle bueno… 54 0.00245 1.39 0.00339
## 5 The Voyage of the Beagle capta… 53 0.00240 1.39 0.00333
## 6 On the Origin of Species By Means of Natu… glaci… 36 0.00230 1.39 0.00319
## 7 The Voyage of the Beagle fitz … 50 0.00227 1.39 0.00314
## 8 The Expression of the Emotions in Man and… muscl… 30 0.00223 1.39 0.00310
## 9 The Expression of the Emotions in Man and… orbic… 29 0.00216 1.39 0.00299
## 10 The Expression of the Emotions in Man and… dr du… 26 0.00194 1.39 0.00268
## # … with 60,585 more rows
library(ggplot2)
bigram_tf_idf %>%
arrange(desc(tf_idf)) %>%
group_by(book) %>%
slice_max(tf_idf, n = 10) %>%
ungroup() %>%
mutate(bigram = reorder(bigram, tf_idf)) %>%
ggplot(aes(tf_idf, bigram, fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2) +
labs(x = "tf-idf of bigrams", y = NULL)
Using bigrams to provide context in sentiment analysis
bigrams_separated %>%
filter(word1 == "not") %>%
count(word1, word2, sort = TRUE)
## # A tibble: 867 × 3
## word1 word2 n
## <chr> <chr> <int>
## 1 not be 128
## 2 not have 104
## 3 not only 103
## 4 not a 100
## 5 not to 98
## 6 not been 89
## 7 not the 82
## 8 not at 70
## 9 not know 60
## 10 not so 58
## # … with 857 more rows
By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.
library(textdata)
AFINN <- get_sentiments("afinn")
AFINN
## # A tibble: 2,477 × 2
## word value
## <chr> <dbl>
## 1 abandon -2
## 2 abandoned -2
## 3 abandons -2
## 4 abducted -2
## 5 abduction -2
## 6 abductions -2
## 7 abhor -3
## 8 abhorred -3
## 9 abhorrent -3
## 10 abhors -3
## # … with 2,467 more rows
We can examine the most frequent words that were preceded by “not” and associated with a sentiment value.
not_words <- bigrams_separated %>%
filter(word1 == "not") %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word2, value, sort = TRUE)
not_words
## # A tibble: 114 × 3
## word2 value n
## <chr> <dbl> <int>
## 1 doubt -1 25
## 2 like 2 11
## 3 pretend -1 9
## 4 wish 1 8
## 5 admit -1 7
## 6 difficult -1 5
## 7 easy 1 5
## 8 reach 1 5
## 9 extend 1 4
## 10 forget -1 4
## # … with 104 more rows
Lets visualize
library(ggplot2)
not_words %>%
mutate(contribution = n * value) %>%
arrange(desc(abs(contribution))) %>%
head(20) %>%
mutate(word2 = reorder(word2, contribution)) %>%
ggplot(aes(n * value, word2, fill = n * value > 0)) +
geom_col(show.legend = FALSE) +
labs(x = "Sentiment Value * number of occurences", y = "Words preceded by \"not\" ")
negation_words <- c("not", "no", "never", "non", "without")
negated_words <- bigrams_separated %>%
filter(word1 %in% negation_words) %>%
inner_join(AFINN, by = c(word2 = "word")) %>%
count(word1, word2, value, sort = TRUE)
negated_words
## # A tibble: 208 × 4
## word1 word2 value n
## <chr> <chr> <dbl> <int>
## 1 no doubt -1 210
## 2 not doubt -1 25
## 3 no great 3 19
## 4 not like 2 11
## 5 not pretend -1 9
## 6 not wish 1 8
## 7 without doubt -1 8
## 8 not admit -1 7
## 9 no greater 3 6
## 10 not difficult -1 5
## # … with 198 more rows
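We could also visualize these the way we did the “not” words, faceted by negation word; a sketch using tidytext’s reorder_within() helper to sort the bars within each facet:
library(tidytext)
negated_words %>%
mutate(contribution = n * value) %>%
group_by(word1) %>%
slice_max(abs(contribution), n = 10) %>%
ungroup() %>%
mutate(word2 = reorder_within(word2, contribution, word1)) %>%
ggplot(aes(contribution, word2, fill = contribution > 0)) +
geom_col(show.legend = FALSE) +
facet_wrap(~word1, scales = "free_y") +
scale_y_reordered() +
labs(x = "Sentiment Value * number of occurrences", y = NULL)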
A central question in text mining is how to quantify what a document is about. We can do this by looking at the words that make up the document and measuring term frequency.
However, many of the most frequent words are not important ones; these are stop words.
One way to remedy this is to weight each word by its inverse document frequency, which decreases the weight of commonly used words and increases the weight of words that are rarely used in a collection of documents.
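Before we compute this on real books, here is a toy version of the two quantities (just a sketch; the bind_tf_idf() function below does this for us). tf is a word’s share of its document, and idf(word) = ln(number of documents / number of documents containing the word).
library(dplyr)
toy <- tibble(doc = c("A", "A", "B"),
word = c("the", "whale", "the"),
n = c(10, 3, 8))
n_docs <- n_distinct(toy$doc)
toy %>%
group_by(doc) %>%
mutate(tf = n / sum(n)) %>%
ungroup() %>%
group_by(word) %>%
mutate(idf = log(n_docs / n_distinct(doc)), tf_idf = tf * idf) %>%
ungroup()
“the” appears in both documents, so its idf (and therefore tf-idf) is 0, while “whale” appears only in document A, so its idf is ln(2/1) ≈ 0.69.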
Term frequency in Darwin’s works
library(dplyr)
library(tidytext)
book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")
colnames(book_words)[1] <- "book"
book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"
Now lets dissect it into individual words and count them.
book_words <- book_words %>%
unnest_tokens(word, text) %>%
count(book, word, sort = TRUE)
book_words
## # A tibble: 43,024 × 3
## book word n
## <chr> <chr> <int>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490
## 2 The Voyage of the Beagle the 16930
## 3 The Descent of Man, and Selection in Relation to Sex of 16762
## 4 On the Origin of Species By Means of Natural Selection the 10301
## 5 The Voyage of the Beagle of 9438
## 6 The Descent of Man, and Selection in Relation to Sex in 8882
## 7 The Expression of the Emotions in Man and Animals the 8045
## 8 On the Origin of Species By Means of Natural Selection of 7864
## 9 The Descent of Man, and Selection in Relation to Sex and 7854
## 10 The Descent of Man, and Selection in Relation to Sex to 5901
## # … with 43,014 more rows
book_words$n <- as.numeric(book_words$n) # convert the integer counts to doubles
total_words <- book_words %>%
group_by(book) %>%
summarize(total = sum(n))
book_words
## # A tibble: 43,024 × 3
## book word n
## <chr> <chr> <dbl>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490
## 2 The Voyage of the Beagle the 16930
## 3 The Descent of Man, and Selection in Relation to Sex of 16762
## 4 On the Origin of Species By Means of Natural Selection the 10301
## 5 The Voyage of the Beagle of 9438
## 6 The Descent of Man, and Selection in Relation to Sex in 8882
## 7 The Expression of the Emotions in Man and Animals the 8045
## 8 On the Origin of Species By Means of Natural Selection of 7864
## 9 The Descent of Man, and Selection in Relation to Sex and 7854
## 10 The Descent of Man, and Selection in Relation to Sex to 5901
## # … with 43,014 more rows
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
book_words
## # A tibble: 43,024 × 4
## book word n total
## <chr> <chr> <dbl> <dbl>
## 1 The Descent of Man, and Selection in Relation to Sex the 25490 311041
## 2 The Voyage of the Beagle the 16930 208118
## 3 The Descent of Man, and Selection in Relation to Sex of 16762 311041
## 4 On the Origin of Species By Means of Natural Selection the 10301 157002
## 5 The Voyage of the Beagle of 9438 208118
## 6 The Descent of Man, and Selection in Relation to Sex in 8882 311041
## 7 The Expression of the Emotions in Man and Animals the 8045 110414
## 8 On the Origin of Species By Means of Natural Selection of 7864 157002
## 9 The Descent of Man, and Selection in Relation to Sex and 7854 311041
## 10 The Descent of Man, and Selection in Relation to Sex to 5901 311041
## # … with 43,014 more rows
You can see that the usual suspects are the most common words, but they don’t tell us anything about the topic of the book.
library(ggplot2)
ggplot(book_words, aes(n/total, fill = book)) +
geom_histogram(show.legend = FALSE) +
xlim(NA, 0.0009) +
facet_wrap(~book, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 515 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).
Zipf’s Law
The frequency that a word appears is inversely proportional to its rank in a frequency table.
Lets apply Zipf’s law to Darwin’s work.
freq_by_rank <- book_words %>%
group_by(book) %>%
mutate(rank = row_number(),
`term frequency` = n/total) %>%
ungroup()
freq_by_rank
## # A tibble: 43,024 × 6
## book word n total rank term …¹
## <chr> <chr> <dbl> <dbl> <int> <dbl>
## 1 The Descent of Man, and Selection in Relati… the 25490 311041 1 0.0820
## 2 The Voyage of the Beagle the 16930 208118 1 0.0813
## 3 The Descent of Man, and Selection in Relati… of 16762 311041 2 0.0539
## 4 On the Origin of Species By Means of Natura… the 10301 157002 1 0.0656
## 5 The Voyage of the Beagle of 9438 208118 2 0.0453
## 6 The Descent of Man, and Selection in Relati… in 8882 311041 3 0.0286
## 7 The Expression of the Emotions in Man and A… the 8045 110414 1 0.0729
## 8 On the Origin of Species By Means of Natura… of 7864 157002 2 0.0501
## 9 The Descent of Man, and Selection in Relati… and 7854 311041 4 0.0253
## 10 The Descent of Man, and Selection in Relati… to 5901 311041 5 0.0190
## # … with 43,014 more rows, and abbreviated variable name ¹`term frequency`
freq_by_rank %>%
ggplot(aes(rank, `term frequency`, color = book)) +
geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
scale_x_log10() +
scale_y_log10()
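Zipf’s law predicts a slope close to -1 on this log-log plot. We can check by fitting a linear model to the middle portion of the rank range (a quick sketch):
rank_subset <- freq_by_rank %>%
filter(rank < 500, rank > 10)
lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)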
Lets use tf-idf to find characteristic words for each document, decreasing the weight of commonly used words and increasing the weight of words that are not used very much in the collection of documents.
book_tf_idf <- book_words %>%
bind_tf_idf(word, book, n)
book_tf_idf
## # A tibble: 43,024 × 7
## book word n total tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 The Descent of Man, and Selection in … the 25490 311041 0.0820 0 0
## 2 The Voyage of the Beagle the 16930 208118 0.0813 0 0
## 3 The Descent of Man, and Selection in … of 16762 311041 0.0539 0 0
## 4 On the Origin of Species By Means of … the 10301 157002 0.0656 0 0
## 5 The Voyage of the Beagle of 9438 208118 0.0453 0 0
## 6 The Descent of Man, and Selection in … in 8882 311041 0.0286 0 0
## 7 The Expression of the Emotions in Man… the 8045 110414 0.0729 0 0
## 8 On the Origin of Species By Means of … of 7864 157002 0.0501 0 0
## 9 The Descent of Man, and Selection in … and 7854 311041 0.0253 0 0
## 10 The Descent of Man, and Selection in … to 5901 311041 0.0190 0 0
## # … with 43,014 more rows
Lets look at terms with high tf-idf in Darwin’s works
book_tf_idf %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 43,024 × 6
## book word n tf idf tf_idf
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 The Expression of the Emotions in Man and … tears 126 1.14e-3 1.39 1.58e-3
## 2 The Expression of the Emotions in Man and … blush 114 1.03e-3 1.39 1.43e-3
## 3 The Expression of the Emotions in Man and … eyeb… 149 1.35e-3 0.693 9.35e-4
## 4 The Voyage of the Beagle degs 117 5.62e-4 1.39 7.79e-4
## 5 On the Origin of Species By Means of Natur… sele… 412 2.62e-3 0.288 7.55e-4
## 6 The Descent of Man, and Selection in Relat… sexu… 745 2.40e-3 0.288 6.89e-4
## 7 The Descent of Man, and Selection in Relat… shewn 143 4.60e-4 1.39 6.37e-4
## 8 On the Origin of Species By Means of Natur… hybr… 133 8.47e-4 0.693 5.87e-4
## 9 The Expression of the Emotions in Man and … frown 46 4.17e-4 1.39 5.78e-4
## 10 The Descent of Man, and Selection in Relat… sele… 621 2.00e-3 0.288 5.74e-4
## # … with 43,014 more rows
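As with the bigrams earlier, we could visualize the top tf-idf words for each book; a sketch using forcats to order the bars:
library(forcats)
library(ggplot2)
book_tf_idf %>%
group_by(book) %>%
slice_max(tf_idf, n = 10) %>%
ungroup() %>%
ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
geom_col(show.legend = FALSE) +
facet_wrap(~book, ncol = 2, scales = "free") +
labs(x = "tf-idf", y = NULL)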