ggplot2

Barplots

Section 1a - let's look at bar plots in ggplot2

We'll start by making a data frame based on the ToothGrowth data:

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

Now let's make a second data frame:

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Let's load ggplot2:

library(ggplot2)

Let's set a default theme for ggplot2:

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

Let's start with some basic bar plots using the tooth data:

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now let's change the fill and add labels above the bars:

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = -0.3)

Now let's add labels inside the bars:

f + geom_col(fill = "darkblue") + 
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now let's change the bar outline colors by group:

f + geom_col(aes(color = dose), fill = "white") +
  scale_color_manual(values = c("blue", "gold", "red"))

This is hard to see, so let's change the fill instead:

f + geom_col(aes(fill = dose)) +
  scale_fill_manual(values = c("blue", "gold", "red"))

Now let's do this with multiple groups:

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_stack()) +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7)+
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))
p

Now let's add labels to the dodged bar plot:

p + geom_text(
  aes(label = len, group = supp),
  position = position_dodge(0.8),
  vjust = -0.3, size = 3.5
)

Now what if we want to add labels to our stacked bar plots? For this we need dplyr to compute the label positions:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  dplyr::mutate(lab_ypos = cumsum(len) - 0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now let's recreate our stacked graph with the labels:

ggplot(df2, aes(x=dose, y=len)) +
  geom_col(aes(fill=supp), width = 0.7) +
  geom_text(aes(y=lab_ypos, label=len, group=supp), color="white") +
  scale_color_manual(values = c("blue","gold"))+
  scale_fill_manual(values = c("blue", "gold"))
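As an aside, if you would rather not compute label positions by hand, recent ggplot2 versions can center stacked labels for you with position_stack(vjust = 0.5). A minimal sketch of the same plot:

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(fill = supp), width = 0.7) +
  geom_text(aes(label = len, group = supp),
            position = position_stack(vjust = 0.5), color = "white") +
  scale_fill_manual(values = c("blue", "gold"))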

Boxplots

Section 1b - let's look at some boxplots

data("ToothGrowth")

Let's change the dose to a factor and look at the top of the data frame:

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Let's load ggplot2:

library(ggplot2)

Let's set the theme for our plots to theme_bw:

theme_set(
  theme_bw() + 
    theme(legend.position = "top")
)

Let's start with a very basic boxplot of dose vs. length:

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now let's look at a notched boxplot with a point for the mean:

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") +
  stat_summary(fun = mean, geom = "point", shape = 18, size = 2.5, color = "indianred")

We can also change which levels are included on the x axis and their order:

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (stat_boxplot).

Let's put our x axis in descending order:

tg + geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

We can also change boxplot colors by group:

tg + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = c("indianred", "blue1", "green2"))

What if we want to display our data split by supplement (OJ vs. vitamin C)?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) + 
  scale_fill_manual(values = c("#999999", "#E69F00"))

tg2

We can also arrange this as two plots with facet_wrap:

tg2 + facet_wrap(~supp)

Histograms

Section 1c - let's look at histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F","M"), each = 200)),
  weight = c(rnorm(200, 56), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now let's load ggplot2 and set a theme:

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now let's create a ggplot object and a basic histogram with a line at the mean:

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") +
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)

Now let's color the histogram by group:

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") + scale_color_manual(values = c("#00AFBB", "#E7B800"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") + scale_color_manual(values = c("#00AFBB", "#E7B800")) + scale_fill_manual(values = c("indianred", "lightblue"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

What if we want to combine density plots and histograms?

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") + geom_density(alpha = 0.2, fill = "#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex), fill = "white", position = "identity") + 
  geom_density(aes(color = sex), size = 1)+
  scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
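The `stat_bin()` messages above are just reminders that we accepted the default 30 bins. As a sketch, supplying a binwidth explicitly silences them (0.5 is an arbitrary choice for this weight data):

a + geom_histogram(aes(color = sex, fill = sex), binwidth = 0.5, position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("indianred", "lightblue"))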

Dotplots

Section 1d - let's look at dotplots

Let's load the required packages:

library(ggplot2)

Let's set our theme:

theme_set(
  theme_dark() +
    theme(legend.position = "top")
)

First let's initiate a ggplot object called tg:

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x=dose, y=len))

Let's create a dotplot with a summary statistic:

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") + 
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing missing values (geom_segment).

Let's add a boxplot and dotplot together:

tg + geom_boxplot(width=0.5) + 
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Now let's do a violin plot:

tg+geom_violin(trim=FALSE) +
  geom_dotplot(binaxis = "y", stackdir="center", fill = "#999999")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Note that on its own, stat_summary() just returns a layer description (printed below); it only has an effect when added to a plot with +:

stat_summary(fun=mean, fun.args = list(mult=1))
## geom_pointrange: na.rm = FALSE, orientation = NA
## stat_summary: fun.data = NULL, fun = function (x, ...) 
## UseMethod("mean"), fun.max = NULL, fun.min = NULL, fun.args = list(mult = 1), na.rm = FALSE, orientation = NA
## position_identity

Now let's make a dotplot with multiple groups:

tg + geom_boxplot(width = 0.5)+
  geom_dotplot(aes(fill = supp), binaxis = "y", stackdir = "center") +
  scale_fill_manual(values = c("indianred", "lightblue1"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

tg + geom_boxplot(width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill=supp, color=supp), binaxis = "y", stackdir = "center",
               dotsize = 0.8, position = position_dodge(0.8))+
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Line Plots

Section 1e - let's look at some line plots

We'll start by making a custom data frame, similar to the ToothGrowth dataset, so we can see the lines we're modifying.

df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))

Now let's create a second data frame for plotting by groups:

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3), dose = rep(c("D0.5", "D1", "D2"), 2), len = c(0.8,15,33,4.2,10,29.5))

df2
##   supp dose  len
## 1   VC D0.5  0.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now let's again load ggplot2 and set a theme:

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
)

Now let's do some basic line plots. First we will build a function to display all the different line types:

generateRLineTypes <- function(){
  oldPar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim = c(0,6), xlim=c(0,0.7), axes = FALSE, xlab = "", ylab="")
  for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty=i, lwd = 3)
  text(rep(0.1, 7), 0:6, labels = c("0. 'Blank'", "1. 'solid'", "2. 'dashed'", "3. 'dotted'", "4. 'dotdash'", "5. 'longdash'", "6. 'twodash'"))
  par(mar=oldPar$mar, font=oldPar$font)
}

generateRLineTypes()

Now let's build a basic line plot:

p <- ggplot(data=df, aes(x=dose, y=len, group=1))
p + geom_line() + geom_point()

Now let's modify the line type and color:

p + geom_line(linetype="dashed", color="steelblue") +
  geom_point(color="steelblue")

Now let's try a step graph, which shows a threshold-type progression:

p + geom_step() + geom_point()

Now let's move on to multiple groups. First we will create our ggplot object:

p <- ggplot(df2, aes(x=dose, y=len, group=supp))

Now let's change line types and point shapes by group:

p + geom_line(aes(linetype= supp, color=supp)) +
  geom_point(aes(shape=supp, color=supp)) +
  scale_color_manual(values = c("red", "blue"))

Now let's look at line plots with a numeric x axis:

df3 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                  dose=rep(c("0.5", "1", "2"), 2),
                  len=c(6.8,15,33,4.2,10, 29.5))

df3
##   supp dose  len
## 1   VC  0.5  6.8
## 2   VC    1 15.0
## 3   VC    2 33.0
## 4   OJ  0.5  4.2
## 5   OJ    1 10.0
## 6   OJ    2 29.5

Now let's plot with dose treated as a continuous variable:

df3$dose <-as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group=supp, color=supp)) +
  geom_line() + geom_point()

Now let's look at a line graph with dates on the x axis. We will use the built-in economics time series for this:

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data=economics, aes(x=date, y=pop)) +
  geom_line()

Now let's subset this data:

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data=ss, aes(x=date, y=pop)) + geom_line()

We can also change the line size (e.g., by the unemployment rate):

ggplot(data=economics, aes(x=date, y=pop)) +
  geom_line(aes(size=unemploy/pop))

We can also plot multiple time series:

ggplot(economics, aes(x=date))+
  geom_line(aes(y=psavert), color="darkred")+
  geom_line(aes(y=uempmed), color = "steelblue", linetype="twodash")
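Plotting each series with its own geom_line() call works, but it produces no legend. A common alternative, sketched here with tidyr (not loaded above), is to reshape to long format so the series name can be mapped to color:

library(tidyr)

economics %>%
  pivot_longer(c(psavert, uempmed), names_to = "series", values_to = "value") %>%
  ggplot(aes(x = date, y = value, color = series)) +
  geom_line()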

Lastly, let's make this into an area plot:

ggplot(economics, aes(x=date))+
  geom_area(aes(y=psavert), fill = "#999999",
            color="#999999", alpha = 0.5) +
  geom_area(aes(y=uempmed), fill = "#E69F00",
            color = "#E69F00", alpha = 0.5)

Density Plots

Section 1f - let's look at density plots

A density plot is a nice alternative to a histogram.

set.seed(1234)

wdata = data.frame(
  sex=factor(rep(c("F", "M"), each = 200)),
  weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata%>%
  group_by(sex)%>%
  summarise(grp.mean=mean(weight))

Now let's load the graphing package and set a theme:

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position ="right")
  
)

Now for the basic plot. First we will create a ggplot object:

d <- ggplot(wdata, aes(x=weight))

Now let's do a basic density plot:

d + geom_density() +
  geom_vline(aes(xintercept=mean(weight)), linetype= "dashed")

Now let's change the y axis to count instead of density:

d + geom_density(aes(y=stat(count)), fill = "lightgray")+
  geom_vline(aes(xintercept=mean(weight)), linetype ="dashed")

d + geom_density(aes(color=sex)) +
  scale_color_manual(values = c("darkgray", "gold"))

Lastly, let's fill the density plots:

d +geom_density(aes(fill=sex), alpha=0.4)+
  geom_vline(aes(xintercept=grp.mean, color=sex), data =mu, linetype="dashed") +
  scale_color_manual(values=c("grey","gold")) +
  scale_fill_manual(values = c("grey","gold"))

Plotly

Plotly Line Plots

Section 2a - let's look at line plots in plotly. First let's load the required packages:

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Let's start with a scatter plot of the Orange dataset:

Orange <- as.data.frame(Orange)
plot_ly(data = Orange, x = ~age, y= ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now let's add some more information:

plot_ly(data = Orange, x=~age, y=~circumference,
        color=~Tree, size= ~age,
        text=~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference)
        )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now let's create a random distribution and add it to our data frame:

trace_1 <-rnorm(35, mean=120, sd=10)
new_data <- data.frame(Orange, trace_1)

We will use the random numbers as a line on the graph:

plot_ly(data=new_data, x=~age, y=~circumference, color=~Tree, size=~age,
        text=~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y=~trace_1, mode='lines')%>%
  add_trace(y=~circumference, mode='markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now let's create a graph with the option of showing the data as markers or lines, and add labels:

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~circumference,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference))%>%
  add_trace(y = ~circumference, mode = 'markers')%>%
  layout(
    title = "plot or orange data with switchable trace",
    updatemenus = list(
      list(
        type = 'dropdown',
        y = 0.8,
        buttons = list(
          list(method = 'restyle',
               args = list('mode', 'markers'),
               label = "Marker"
               ),
          list(method = "restyle",
               args = list('mode', 'lines'),
               labels = "Lines"
               )
        )
      )
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

3D plotly

Section 2b - let's look at 3D plots in plotly. First let's load our required packages:

library(plotly)

Now let's create a random 3D matrix:

d <- data.frame(
  x = seq(1, 10, by = 0.5),
  y = seq(1, 10, by = 0.5)
)
z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now let's plot our 3D data:

plot_ly(d, x=~x, y=~y, z=~z)%>%
  add_surface()

Let's add topographic contours:

plot_ly(d, x=~x, y=~y, z=~z)%>%
  add_surface(
    contours=list(
      z=list(
        show=TRUE,
        usecolormap=TRUE,
        highlightcolor="FF0000",
        project=list(z=TRUE)
      )
    )
  )

Now let's look at a 3D scatter plot:

plot_ly(longley, x=~GNP, y=~Population, z=~Employed, marker=list(color=~GNP))%>%
  add_markers()
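The 3D axis titles default to the column names. If you want to set them yourself, plotly's layout() accepts a scene list; a minimal sketch for the same longley plot (the titles shown are just illustrative):

plot_ly(longley, x=~GNP, y=~Population, z=~Employed, marker=list(color=~GNP))%>%
  add_markers()%>%
  layout(scene = list(
    xaxis = list(title = "GNP"),
    yaxis = list(title = "Population"),
    zaxis = list(title = "Employed")
  ))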

Other Graphing Techniques

Error Bars

Section 3a - let's look at error bars. Let's load libraries:

library(ggplot2)
library(dplyr)
BiocManager::install("plotrix", update=FALSE)
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## '?repositories' for details
## 
## replacement repositories:
##     CRAN: https://cloud.r-project.org
## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.1 (2022-06-23)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'plotrix'
theme_set(
  theme_classic() +
    theme(legend.position='top')
)

Let's again use the tooth data for this exercise:

df <- ToothGrowth
df$dose <-as.factor(df$dose)

Now let's use dplyr to summarize the data:

library(plotrix)
library(dplyr)

(Note: the standard error has to be computed before len is overwritten with its mean.)

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = std.error(len, na.rm = TRUE),
    len = mean(len)
  )

df.summary
## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1
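If you prefer not to depend on plotrix, the standard error is just sd / sqrt(n). A dplyr-only sketch of the same summary (df.summary_alt is just an illustrative name):

df.summary_alt <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = sd / sqrt(n()),   # standard error of the mean
    len = mean(len)
  )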

Some key functions:

1. geom_crossbar(): hollow bars with the middle indicated by a horizontal line
2. geom_errorbar(): error bars
3. geom_errorbarh(): horizontal error bars
4. geom_linerange(): an interval represented by a vertical line
5. geom_pointrange(): an interval represented by a vertical line, with a point in the middle

Let's create a ggplot:

tg <- ggplot(
  df.summary,
  aes(x=dose, y=len, ymin=len-sd, ymax = len+sd)
)

Now let's look at the most basic error bars:

tg+geom_pointrange()

tg+geom_errorbar(width=0.2)+
  geom_point(size=1.5)

Now let's create horizontal error bars by swapping the axes:

ggplot(df.summary, aes(x=len, y=dose, xmin=len-sd, xmax=len+sd)) +
  geom_point()+
  geom_errorbarh(height=0.2)

Now let's add jitter points to our data:

ggplot(df, aes(dose, len))+
  geom_jitter(position=position_jitter(0.2), color="darkgray")+
  geom_pointrange(aes(ymin=len-sd, ymax=len+sd), data=df.summary)

Error bars on a violin plot:

library(ggplot2)


ggplot(df, aes(dose, len)) +
  geom_violin(color="darkgray", trim=FALSE)+
  geom_pointrange(aes(ymin=len-sd, ymax=len+sd), data=df.summary)

The same thing with a line graph:

ggplot(df.summary, aes(dose, len))+
  geom_line(aes(group=1)) + #always specify this when you have 1 line
  geom_errorbar(aes(ymin=len-stderr, ymax=len+stderr), width =0.2)+
  geom_point(size=2)

Now let's make a bar graph with half error bars:

ggplot(df.summary, aes(dose,len)) +
  geom_col(fill="lightgrey", color="black")+
  geom_errorbar(aes(ymin=len, ymax = len+stderr), width=0.2)

By not specifying ymin=len-stderr, we basically cut the error bar in half.

Add jitter points to line plots:

ggplot(df, aes(dose, len)) +
  geom_jitter(position=position_jitter(0.2), color = "darkgrey")+
  geom_line(aes(group=1), data=df.summary)+
geom_errorbar(aes(ymin=len-stderr, ymax=len+stderr), 
              data = df.summary, width=0.2)+
  geom_point(data=df.summary, size = 0.2)

What about adding jitter points to a bar plot?

ggplot(df, aes(dose, len))+
  geom_col(data=df.summary, fill=NA, color="black")+
  geom_jitter(position=position_jitter(0.2), color="darkgrey")+
  geom_errorbar(aes(ymin=len - stderr, ymax=len+stderr),
                data=df.summary, width=0.2)

What if we wanted to have our error bars per group (e.g., OJ vs. VC)?

df.summary2<-df %>%
  group_by(dose,supp)%>%
  summarise(
    sd=sd(len),
    stderr=std.error(len),
    len=mean(len)
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have a mean and error for each dose and supp combination.

ggplot(df.summary2, aes(dose, len)) +
  geom_pointrange(
    aes(ymin=len-stderr, ymax=len+stderr, color = supp),
    position = position_dodge(0.3)
  )+
  scale_color_manual(values=c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len))+
  geom_line(aes(linetype=supp, group=supp))+ 
  geom_point()+
  geom_errorbar(aes(ymin=len-stderr, ymax=len+stderr, group=supp), width=0.2)

And the same with a bar plot:

ggplot(df.summary2, aes(dose, len))+
  geom_col(aes(fill=supp), position=position_dodge(0.8), width=0.7)+
  geom_errorbar(
    aes(ymin=len-sd, ymax=len+sd, group=supp),
    width=0.2, position=position_dodge(0.8)
  )+
  scale_fill_manual(values=c("indianred", "lightblue"))

Now let's add some jitter points:

ggplot(df,aes(dose, len, color = supp))+
  geom_jitter(position=position_dodge(0.2))+
  geom_line(aes(group=supp), data=df.summary2)+
  geom_point()+
  geom_errorbar(aes(ymin=len-stderr, ymax=len+stderr, group=supp), data=df.summary2, width=0.2)

ggplot(df, aes(dose, len, color = supp))+
  geom_col(data=df.summary2, position=position_dodge(0.8), width=0.7, fill="white")+
  geom_jitter(
    position=position_jitterdodge(jitter.width=0.2, dodge.width=0.8)
  )+
  geom_errorbar(
    aes(ymin=len-stderr, ymax=len+stderr), data=df.summary2, width=0.2, position=position_dodge(0.8)) +
  scale_color_manual(values=c("indianred", "lightblue"))+
  theme(legend.position="top")

ECDF

Section 3b - ECDF

Now let's do an empirical cumulative distribution function (ECDF). For any given value, this reports the percentage of individuals above or below that threshold.

set.seed(1234)
wdata=data.frame(
  sex=factor(rep(c("F","M"), each = 200)),
  weight=c(rnorm(200,50), rnorm(200,58))
)

Now let's look at our data frame:

head(wdata, 5)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430
## 5   F 50.42912

Now let's load our plotting package:

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now let's create our ECDF plot:

ggplot(wdata,aes(x=weight))+
  stat_ecdf(aes(color=sex, linetype=sex),
            geom="step", size=1.5)+
  scale_color_manual(values=c("#00AFBB", "#E7B900"))+
  labs(y="weight")

QQ plots

Section 3c - QQ plots

Now let's take a look at Q-Q plots. These are used to determine whether the data follow a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now let's randomly generate some data:

wdata=data.frame(
  sex=factor(rep(c("F","M"), each=200)),
  weight=c(rnorm(200,55),rnorm(200,58))
)

Let's set our theme for graphing with ggplot:

library(ggplot2)
theme_set(
  theme_classic()+
    theme(legend.position="top")
)

Create a Q-Q plot of the weights:

ggplot(wdata, aes(sample=weight))+
  stat_qq(aes(color=sex))+
  scale_color_manual(values=c("#0073C2FF", "#FC4E07"))+
  labs(y="weight")

library(ggpubr)

ggqqplot(wdata, x="weight",
         color="sex",
         palettes=c("#0073C2FF", "#FC4E07"),
         ggtheme=theme_pubclean())

Now, what would a non-normal distribution look like?

library(mnonr)
data2 <- mnonr::mnonr(n=1000, p=2, ms=3, mk=61, Sigma = matrix(c(1,0.5,0.5,1), 2, 2), initial = NULL)

data2 <- as.data.frame(data2)

Now let's plot the non-normal data:

library(ggplot2)

ggplot(data2,aes(sample=V1))+
  stat_qq()

ggqqplot(data2, x = "V1",
        palette = "#0073C2FF",
        ggtheme = theme_pubclean())

Facet Plots

Section 3d - Facet Plots

Let's look at how to put multiple plots together into a single figure:

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw()+
    theme(legend.position = "top")
)

First let's create a nice boxplot:

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Let's create the plot object:

p <- ggplot(df, aes(x=dose, y=len))+
  geom_boxplot(aes(fill=supp), position = position_dodge(0.9))+
  scale_fill_manual(values=c("#00AFBB", "#E7B800"))

p

Now let's look at the ggplot facet functions:

p +facet_grid(rows=vars(supp))

Now let's facet on multiple variables:

p+facet_grid(rows=vars(dose), cols=vars(supp))

p

Now let's look at the facet_wrap function (allows facets to be placed side by side):

p + facet_wrap(vars(dose), ncol=2)

Combine multiple plots using ggarrange():

Start by making some basic plots. First define a color palette and data:

my3cols <- c("#E7B800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now let's make some basic plots, starting with a boxplot:

p <- ggplot(ToothGrowth,aes(x=dose, y=len))
bxp <- p+geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = my3cols)

Dot plot:

dp <- p +geom_dotplot(aes(color=dose, fill=dose), binaxis = 'y', stackdir = 'center') +
  scale_color_manual(values=my3cols)+
  scale_fill_manual(values=my3cols)

Line plot:

lp <- ggplot(economics, aes(x=date, y=psavert)) +
  geom_line(color="indianred")

Now we can make the figure:

figure <- ggarrange(bxp, dp, lp, labels=c("A","B","C"), ncol=2, nrow=2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
figure

This looks great, but we can make it look better

figure2 <- ggarrange(
  lp,
  ggarrange(bxp, dp, ncol=2, labels = c("B","C")),
  nrow=2,
  labels="A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
figure2

Fixing two legends that are the same:

ggarrange(
  bxp, dp,labels=c("A","B"),
  common.legend=TRUE, legend="bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.

Lastly, we should export the plot:

ggexport(figure2, filename="facetfigure.pdf")
## file saved to facetfigure.pdf

Exporting multiple plots to a PDF:

ggexport(bxp, dp, lp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## file saved to multi.pdf

Export to PDF with multiple pages and multiple columns:

ggexport(bxp, dp, lp, bxp, filename="test2.pdf", nrow=2, ncol=1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with `binwidth`.
## file saved to test2.pdf
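For a single ggplot (rather than a ggarrange figure), ggsave() is a lighter-weight way to export; a sketch (the file name is just illustrative):

ggsave("boxplot.pdf", plot = bxp, width = 6, height = 4)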

Heatmaps

Section 3E - Heatmaps

Let's get started:

library(heatmap3)

Getting data:

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now let's generate a heatmap:

heatmap(data2)

heatmap(data2, Rowv=NA, Colv=NA)

Now let's play with the colors:

rc<-rainbow(nrow(data2), start=0, end=0.3)
cc<-rainbow(ncol(data2), start=0, end=0.3)

Now let's apply our color selection:

heatmap(data2, ColSideColors=cc)

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
        col=colorRampPalette(brewer.pal(8, "PiYG"))
        (25))

There is more that we can customize:

library(gplots)

### may need to open a new plotting window to see this; the default margins are too wide

heatmap.2(data2, ColSideColors=cc,
          col=colorRampPalette(brewer.pal(8, "PiYG"))(25))
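Part of why the default heatmap.2() output feels cramped is that it draws trace lines and a density plot in the color key by default. As a sketch, these can be turned off and the margins adjusted (the values below are just illustrative):

heatmap.2(data2, ColSideColors=cc,
          col=colorRampPalette(brewer.pal(8, "PiYG"))(25),
          trace="none", density.info="none",
          margins=c(5, 8))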

Outlier Detection

Missing Values

Section 4A - Missing Values

Missing Values: If you encounter an unusual value in your dataset, and simply want to move on to the rest of the analysis, you have two options:

  1. drop the entire row with the strange values:
library(dplyr)
library(ggplot2)

diamonds<-diamonds
diamonds2<-diamonds%>%
  filter(between(y,3,20))

In this instance y is the width of the diamond, so anything < 3 mm or > 20 mm is excluded. This is not recommended; instead, the second option is to replace the unusual values with missing values (NA):

diamonds3<-diamonds%>%
  dplyr::mutate(y=ifelse(y<3|y>20, NA,y))

Like base R, ggplot2 doesn't silently drop missing values; it warns unless you set na.rm = TRUE:

ggplot(data=diamonds3, mapping = aes(x=x,y=y))+
  geom_point(na.rm=TRUE)

Other times you want to understand what makes observations with missing values different from observations with recorded values. For example, in the nycflights13 dataset, missing values in the dep_time variable indicate that the flight was cancelled, so you may want to compare the scheduled departure times for cancelled and non-cancelled flights.

install.packages("nycflights13")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.2'
## (as 'lib' is unspecified)
library(nycflights13)

nycflights13::flights%>%
  dplyr::mutate(
    cancelled=is.na(dep_time),
    sched_hour=sched_dep_time %/% 100,
    sched_min=sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  )%>%
  ggplot(mapping=aes(sched_dep_time))+
  geom_freqpoly(mapping=aes(color=cancelled), binwidth=1/4)

Identifying Outliers

Section 4B - Identifying Outliers. Load the required libraries:

library(outliers)
library(ggplot2)

And reload the dataset, because we removed outliers earlier:

library(readxl)
Air_data <- read_excel("Data products/data products _ for final/other_graphs_charts_info_etc./AirQualityUCI.xlsx")

Let's create a function using the Grubbs test to identify all outliers. The Grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution:

grubbs.flag <- function(x) {
  # create a variable called outliers and save nothing to it; we'll add values as we identify them
  outliers <- NULL
  # create a variable called test to hold the data we are currently testing
  test <- x
  # now, using the outliers package, run grubbs.test to look for an outlier in our data
  grubbs.result <- grubbs.test(test)
  # get the p value of the test
  pv <- grubbs.result$p.value
  # keep going while the p value is below 0.05
  while(pv < 0.05) {
    # anything flagged with a p value below 0.05 gets added to our outliers vector
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, " ")[[1]][3]))
    # now we remove those outliers
    test <- x[!x %in% outliers]
    # and run the grubbs test again without the outliers
    grubbs.result <- grubbs.test(test)
    # and save the new p value
    pv <- grubbs.result$p.value
  }
  return(data.frame(X=x, Outliers=(x %in% outliers)))
}
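Since the AirQualityUCI file may not be available, here is a sketch of grubbs.flag() applied to simulated data with one injected outlier (all object names below are just illustrative):

set.seed(42)
demo_vals <- c(rnorm(50, mean = 10, sd = 1), 25)  # 25 is an obvious outlier
flagged <- grubbs.flag(demo_vals)
flagged[flagged$Outliers, ]                       # rows flagged as outliers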

#identified_outliers <- grubbs.flag(Air_data$AH)

Covariation

Section 4C - Covariation

library(ggplot2)

ggplot(data=diamonds, mapping = aes(x=price)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

It's hard to see the difference in distributions because the counts differ so much.

ggplot(diamonds)+
  geom_bar(mapping=aes(x=cut))

We'll display the density (which is the count standardized so that the area under the curve is 1):

ggplot(data=diamonds, mapping=aes(x=price, y=..density..))+
  geom_freqpoly(mapping=aes(color=cut), binwidth=500)

It appears that fair diamonds have the highest average price, but maybe that's because frequency polygons are hard to interpret. An alternative is the boxplot (a type of visual shorthand for a distribution of values):

ggplot(data=diamonds, mapping=aes(x=cut, y = price))+
  geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, making them easier to compare. They support the counterintuitive finding that better-quality diamonds are cheaper on average.

Let's take a look at some car data:

ggplot(data=mpg, mapping=aes(x=class, y = hwy))+
  geom_boxplot()

ggplot(data=mpg) +
  geom_boxplot(mapping=aes(x=reorder(class, hwy, FUN=median), y = hwy))

If you have long variable names, you can flip the axes 90 degrees:

ggplot(data=mpg)+
  geom_boxplot(mapping=aes(x=reorder(class, hwy, FUN = median), y= hwy)) +
  coord_flip()

To visualize the covariation between two continuous variables, we can use a scatter plot:

ggplot(data=diamonds) + 
  geom_point(mapping=aes(x=carat, y=price))

Scatter plots become less useful as the size of your dataset grows, because of overplotting. We can address this by using the alpha aesthetic:

ggplot(data=diamonds)+
  geom_point(mapping=aes(x=carat, y=price), alpha = 1/100)
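Another common remedy for overplotting, sketched here, is to bin the points into rectangles with geom_bin2d(), so fill color encodes how many observations fall in each cell:

ggplot(data=diamonds)+
  geom_bin2d(mapping=aes(x=carat, y=price))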

Exploratory

Exploratory Data Analysis

Section 2A - Exploratory Stats #1-6 (Combined)

Loading libraries:

library(RCurl)
library(dplyr)

Now let's get our data:

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
college_Data <-read.csv(site)

Let's use the str function to show the structure of the object:

str(college_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college?

alphabetical <- college_Data %>%
  arrange(college)

The glimpse() function is another way to preview this data:

glimpse(college_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

We can also subset columns with select():

college_cases <- select(college_Data, college, cases)

We can subset rows with the filter() function:

Louisiana_cases <-filter(college_Data, state=="Louisiana")

Let's filter down to a smaller set of states:

Southern_cases <- filter(college_Data, state =="Louisiana"|state =="Texas"|state =="Arkansas" | state =="Mississippi")
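As the list of states grows, chaining | conditions gets clumsy; %in% gives the same subset more compactly (a sketch):

Southern_cases <- filter(college_Data,
                         state %in% c("Louisiana", "Texas", "Arkansas", "Mississippi"))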

Loading more libraries:

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale

Now let's load some data:

state_Site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"

state_data <-read.csv(state_Site)

Let's create a grouped object using the state column:

state_cases <- group_by(state_data, state)
class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by each state? This gives us an idea of when states started reporting:

days_since_first_reported <- tally(state_cases)

Let's visualize some of our data. The key components of a ggplot graphic are:

- Data: the data we want to visualize
- Layers: made of geometric elements and the requisite statistical information (includes the geometric objects)
- Scales: used to map values in the data space to aesthetic values (e.g., color, size, shape)
- Coordinate system: describes how the data coordinates are mapped in relation to the plane of the graphic
- Faceting: how to break the data into subsets and display them as multiple groups
- Theme: controls the finer points of the display (e.g., font size, background color)

options(repr.plot.width=6, repr.plot.height=6)
class(college_Data)
## [1] "data.frame"
head(college_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(college_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now let's take a look at a different dataset:

iris<-as.data.frame(iris)
class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Let's start by creating a scatter plot of the college data:

ggplot(data = college_Data, aes(x=cases, y=cases_2021, color = state))+
  geom_point()+
  theme_minimal()
## Warning: Removed 337 rows containing missing values (geom_point).

Let's color-code the iris data:

ggplot(data=iris, aes(x= Sepal.Width, y= Sepal.Length, color=Species))+
  geom_point()+
  theme_minimal()

Let's run a simple histogram of our Louisiana case data:

hist(Louisiana_cases$cases, freq=NULL, density=NULL, breaks=10, xlab="Total Cases", ylab="Frequency", main="Total College COVID-19 Infections (LA)")

Let's run a simple histogram for our iris data:

hist(iris$Sepal.Width, freq=NULL, density=NULL, breaks=10, xlab="Sepal Width", ylab="Frequency", main="Iris Sepal Width")

Back to the college COVID-19 data:

histogram_college <- ggplot(data=Louisiana_cases, aes(x=cases))
histogram_college + geom_histogram(binwidth=100, color="black", aes(fill=county))+
  xlab("cases")+ylab("Frequency") +ggtitle("Histogram of COVID-19 cases in LA")

Back to the iris data:

histogram_iris <- ggplot(data=iris, aes(x=Sepal.Width))

histogram_iris + geom_histogram(binwidth=0.2, color="black", aes(fill=Species))+
  xlab("Sepal width")+ylab("Frequency")+ggtitle("Histogram of Iris Sepal Width by Species")

Maybe a density plot will make our college data easier to interpret:

ggplot(Southern_cases) +
  geom_density(aes(x=cases, fill=state), alpha=0.50)

Now let's do this with the iris data:

ggplot(iris) +
  geom_density(aes(x=Sepal.Width, fill=Species), alpha=0.50)

Let's look at violin plots for the iris data:

ggplot(data=iris, aes(x=Species, y = Sepal.Length, color=Species))+
  geom_violin()+
  theme_classic()+
  theme(legend.position = "none")

Now let's try this with the southern data:

ggplot(data=Southern_cases, aes(x=state, y=cases, color=state))+
  geom_violin()+
  theme_grey()+
  theme(legend.position = "none")

Now let's look at residual plots: these graphs display the residuals on the vertical axis and the fitted values on the horizontal axis. If the points in a residual plot are dispersed randomly around the horizontal axis, a linear regression is appropriate; if they are not randomly dispersed, a non-linear model is more appropriate.

Let's start with the iris data:

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris))+
  geom_point(aes(x=.fitted, y=.resid))

Now let's look at the southern cases:

ggplot(lm(cases~cases_2021, data=Southern_cases)) +
  geom_point(aes(x=.fitted, y=.resid))

It turns out that a linear model is not a good fit for the southern cases data.

Now let's do some correlation:

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
obesity <- read_csv("Data products/Obesity_insurance.csv")
## Rows: 1338 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, smoker, region
## dbl (4): age, bmi, children, charges
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

loading libraries:

library(tidyr)
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:RCurl':
## 
##     complete
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Lets look at the column classes:

class(obesity)
## [1] "spec_tbl_df" "tbl_df"      "tbl"         "data.frame"

and get a summary of the distribution of the variables:

summary(obesity)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now let's look at the distribution of the insurance charges:

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot:

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now let's look at correlations. The cor() command is used to determine correlations between two vectors, all the columns of a data frame, or two data frames. The cov() command, on the other hand, examines the covariance. The cor.test() command carries out a test of the significance of the correlation.

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

By default cor() uses the Pearson correlation; you can use Spearman's rho or Kendall's tau by specifying the method:

cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397

The correlation coefficient measures the strength of association on a scale from -1 to 1.
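The cor.test() command mentioned above tests whether a correlation is significantly different from zero; a minimal sketch for charges and BMI:

cor.test(obesity$charges, obesity$bmi)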

Now let's look at the Tietjen-Moore test. This is used to detect multiple outliers in a univariate dataset.

TietjenMoore <- function(dataSeries, k)
{
  n=length(dataSeries)
  # Compute the absolute residuals
  r = abs(dataSeries - mean(dataSeries))
  # sort data acc to size of residual
  df = data.frame(dataSeries, r)
  dfs = df[order(df$r),]
  # create a subset of the data w/o the largest values
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  # compute the sums of squares
  ksub = (subdataSeries - mean(subdataSeries))^2
  all = (df$dataSeries - mean(df$dataSeries))^2
  # compute the test stat
  sum(ksub)/sum(all)
  
}

This function computes the absolute residuals, sorts the data according to the size of the residuals, and then computes the sums of squares.

FindoutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  # compute critical vals based on simulation
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
  
}


This function computes the critical values based on simulated data. Now let's demonstrate these functions with the obesity dataset to evaluate this algorithm.

The critical region for the Tietjen-Moore test is determined by simulation. The simulation is performed by generating a standard normal random sample of size n and computing the Tietjen-Moore test statistic; typically 10,000 random samples are used. The value of the test statistic obtained from the data is compared to this reference distribution. The value of the test statistic is between 0 and 1: if there are no outliers, the statistic is close to 1; if there are outliers, it will be closer to zero. Thus the test is always a lower, one-tailed test (regardless of which test statistic, Lk or Ek, is used).

First we will look at charges:

boxplot(obesity$charges)

FindoutliersTietjenMooreTest(obesity$charges, 100)
## $T
## [1] -9.025633e+12
## 
## $Talpha
##          50% 
## -48214322454

Let's check out BMI:

boxplot(obesity$bmi)

FindoutliersTietjenMooreTest(obesity$bmi, 7)
## $T
## [1] -1.784784e+13
## 
## $Talpha
##          50% 
## -26865741921

Probability plots:

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

We will use the probability functions and their outputs. dnorm is the density function of the normal distribution. Using the density, it is possible to determine the relative likelihood of particular values. For example, you may wonder what the likelihood is of a person having a BMI of exactly some value; in that case you would retrieve the density of the BMI distribution at that value. The BMI distribution can be modeled with its sample mean and standard deviation; the corresponding density is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

Now let's plot our normal distribution:

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density))+
  geom_point()

This gives the density (relative likelihood) at each observed BMI value.

Now let's use the pnorm function, which gives the cumulative probability up to each value:

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x=bmi, y=Density))+
  geom_point()

What if we want to find the probability of a BMI being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 *pnorm(x, mean = 30.66339, sd = 6.09818,lower.tail = FALSE), 2), "%")
}
pp_greater(40)
## [1] "6.29 %"

What about the probability that a BMI is less than 40 in our population?

pp_less <- function(x) {
  # note: rounded mean/sd (30 and 6) are used here; 1 minus the upper tail is equivalent to the lower tail
  paste(round(100*(1-pnorm(x, mean = 30, sd = 6, lower.tail = FALSE)), 2), "%")
}

pp_less(40)
## [1] "95.22 %"
pnormGC(40, region = "above", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.06287869

What if we want to know the quantiles? Let's use the qnorm function. We need to assume a normal distribution for this:

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if you want a random sample of values from your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(50000, mean = 30.66339, sd = 6.09818)
hist(subset2)

Shapiro-Wilk test (SWT): now that we know how to generate a normal distribution, how do we tell whether our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

You can see that with this small sample size we fail to reject the null hypothesis that the samples came from a normal distribution. We can increase the power of the test by increasing the sample size:

shapiro.test(obesity$charges[1:10000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:10000]
## W = 0.81469, p-value < 2.2e-16

Now let's check out our ages and BMI:

shapiro.test(obesity$age[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16
shapiro.test(obesity$bmi[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

Loading packages:

library(readr)
library(readxl)

Air_data <- read_excel("Data products/AirQualityUCI.xlsx")

Variable key:

Date - date of measurement
Time - time of measurement
CO(GT) - avg hourly CO concentration
PT08.S1(CO) - tin oxide hourly avg sensor response
NMHC(GT) - avg hourly non-methane hydrocarbon concentration
C6H6(GT) - avg hourly benzene concentration
PT08.S2(NMHC) - titania avg hourly sensor response
NOx(GT) - avg hourly NOx concentration
NO2(GT) - avg hourly NO2 concentration
T - temperature
RH - relative humidity
AH - absolute humidity

str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Lets get rid of the date in the Time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab = "Absolute Humidity", ylab = "Relative Humidity")

notice we have an outlier in our data
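
In this dataset missing readings are coded as -200, which is what produces that stray point. A minimal sketch of dropping those codes before re-plotting (assuming -200 is the only missing-value flag used here):

humidity_clean <- Air_data %>%
  filter(RH != -200, AH != -200)

plot(humidity_clean$AH, humidity_clean$RH, main = "Humidity Analysis",
     xlab = "Absolute Humidity", ylab = "Relative Humidity")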

t.test(Air_data$RH, Air_data$AH)
## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

Text Mining

Section 1 - Text Mining #1-2

First we will look at the unnest_tokens function. Lets start by looking at an Emily Dickinson passage:

text <- c(" Because I could not stop from Death = ",
          " He kindly stopped for me -",
          "The Carriage held but just ourselves-",
          "and Immortality")

This is a typical character vector that we might want to analyze. In order to turn it into a tidy text dataset, we first need to put it into a dataframe

library(dplyr)
text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                     
##   <int> <chr>                                    
## 1     1 " Because I could not stop from Death = "
## 2     2 " He kindly stopped for me -"            
## 3     3 "The Carriage held but just ourselves-"  
## 4     4 "and Immortality"

Reminder: a tibble is a modern class of dataframe within R. It is available in the dplyr and tibble packages, has a convenient print method, will not convert strings to factors, and does not use row names. Tibbles are great for use with tidy tools…
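
A quick way to see the printing difference, using the text_df we just made (just a sketch; the content is identical either way):

as.data.frame(text_df)
text_df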

next we will use the unnest_tokens() function.

First we give the output column name that will be created as the text is unnested into it (here, word), followed by the input column that the text comes from (here, text).

library(tidytext)
text_df%>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

lets use the janeaustenr package to analyze some Jane Austen texts. There are 6 books in this package

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book)%>%
  dplyr::mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()


original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # … with 73,412 more rows

To work with this as a tidy dataset, we need to restructure it into the one-token-per-row format, which, as we saw earlier, is done with the unnest_tokens() function.

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # … with 725,045 more rows

This function uses the tokenizers package to separate each line of text in the original dataframe into tokens.

The default tokenizing is for words, but other options including characters, n-grams, sentences, lines, or paragraphs can be used.

now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr.

often in text analysis we will want to remove stop words. Stop words are words that are NOT USEFUL for analysis; these include: the, of, to, and, etc.

we can remove stop words (kept in the tidytext dataset ‘stop_words’) with an anti_join().

data(stop_words)

tidy_books <- tidy_books%>%
  anti_join(stop_words)
## Joining, by = "word"

The stop words dataset in the tidytext package contains stop words from 3 lexicons, we can use them all together, as we have here, or filter() to only use one set of stop words if thats more appropriate for your analysis.

tidy_books%>%
  count(word, sort=TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # … with 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy dataframe. This allows us to pipe them directly into ggplot2. For example, we can create a visualization of the most common words:

library(ggplot2)

tidy_books%>%
  count(word, sort=TRUE)%>%
  filter(n>600)%>%
  dplyr::mutate(word = reorder(word,n))%>%
  ggplot(aes(n,word))+
  geom_col() +
  labs(y=NULL, x = "word count")

The gutenbergr package

This package provides access to public domain works from Project Gutenberg. It includes tools for downloading books as well as a complete dataset of Project Gutenberg metadata that can be used to find works of interest. We will mostly use the function gutenberg_download().
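
For example, the bundled metadata can be searched to find ID numbers like the ones used below; a small sketch (gutenberg_works() filters the metadata, and author names follow Project Gutenberg's "Last, First" convention):

library(gutenbergr)
library(dplyr)

gutenberg_works(author == "Darwin, Charles") %>%
  select(gutenberg_id, title)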

word frequencies: Lets look at some biology texts, starting with Darwin.

  • The Voyage of the Beagle - 944
  • On the Origin of Species by Means of Natural Selection - 1228
  • The Expression of the Emotions in Man and Animals - 1227
  • The Descent of Man, and Selection in Relation to Sex - 2300

We can access these works using gutenberg_download() and the Project Gutenberg ID numbers.

library(gutenbergr)

darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

lets break these into tokens

tidy_darwin <- darwin%>%
  unnest_tokens(word, text)%>%
  anti_join(stop_words)
## Joining, by = "word"

lets check out what the most common darwin words are.

tidy_darwin %>%
  count(word, sort = TRUE)
## # A tibble: 23,630 × 2
##    word          n
##    <chr>     <int>
##  1 species    2998
##  2 male       1672
##  3 males      1337
##  4 animals    1310
##  5 birds      1292
##  6 female     1197
##  7 sexes      1095
##  8 females    1038
##  9 selection  1038
## 10 sexual      801
## # … with 23,620 more rows

Now lets get some works from Thomas Hunt Morgan, who is credited with establishing the role that chromosomes play in heredity.

  • Regeneration - 57198
  • The Genetic and Operative Evidence Relating to Secondary Sexual Characteristics - 57460
  • Evolution and Adaptation - 63540

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

Lets tokenize THM

tidy_morgan <- morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"

what are THM’s most common words?

tidy_morgan%>%
  count(word, sort=TRUE)
## # A tibble: 13,855 × 2
##    word             n
##    <chr>        <int>
##  1 species        869
##  2 regeneration   814
##  3 piece          702
##  4 cut            669
##  5 male           668
##  6 forms          631
##  7 selection      604
##  8 cells          576
##  9 found          552
## 10 development    546
## # … with 13,845 more rows

Lastly lets look at Thomas Henry Huxley

  • Evidence as to Man's Place in Nature - 2931
  • On the Reception of the Origin of Species - 2089
  • Evolution and Ethics, and Other Essays - 2940
  • Science and Culture, and Other Essays - 52344

huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror ="https://mirror2.sandyriver.net/pub/gutenberg")
tidy_huxley <- huxley %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining, by = "word"
tidy_huxley %>%
  count(word, sort = TRUE)
## # A tibble: 16,090 × 2
##    word          n
##    <chr>     <int>
##  1 species     339
##  2 nature      331
##  3 time        287
##  4 life        286
##  5 existence   255
##  6 knowledge   238
##  7 animals     227
##  8 natural     223
##  9 animal      216
## 10 science     207
## # … with 16,080 more rows

Now, lets calculate the frequency of each word in the works of Darwin, Morgan, and Huxley by binding the frames together.

library(tidyr)

frequency <- bind_rows(dplyr::mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                       dplyr::mutate(tidy_darwin, author = "Charles Darwin"),
                       dplyr::mutate(tidy_huxley, author = "Thomas Henery Huxley")) %>%
  dplyr::mutate(word=str_extract(word, "[a-z']+"))%>%
                          count(author, word)%>%
                          group_by(author)%>%
  dplyr::mutate(proportion=n/sum(n))%>%
  select(-n)%>%
  pivot_wider(names_from=author, values_from=proportion)%>%
  pivot_longer('Thomas Hunt Morgan' : 'Charles Darwin', names_to = "author", values_to = "proportion")

frequency
## # A tibble: 95,895 × 3
##    word    author                proportion
##    <chr>   <chr>                      <dbl>
##  1 a       Thomas Hunt Morgan    0.00206   
##  2 a       Thomas Henery Huxley  0.0000856 
##  3 a       Charles Darwin        0.000141  
##  4 ab      Thomas Hunt Morgan    0.000165  
##  5 ab      Thomas Henery Huxley  0.0000978 
##  6 ab      Charles Darwin        0.00000642
##  7 abaiss  Thomas Hunt Morgan   NA         
##  8 abaiss  Thomas Henery Huxley NA         
##  9 abaiss  Charles Darwin        0.00000642
## 10 abandon Thomas Hunt Morgan    0.00000752
## # … with 95,885 more rows

now we need to reshape the table so that each author has its own column

frequency2 <- pivot_wider(frequency, names_from = author, values_from=proportion)

frequency2
## # A tibble: 31,965 × 4
##    word        `Thomas Hunt Morgan` `Thomas Henery Huxley` `Charles Darwin`
##    <chr>                      <dbl>                  <dbl>            <dbl>
##  1 a                     0.00206                 0.0000856       0.000141  
##  2 ab                    0.000165                0.0000978       0.00000642
##  3 abaiss               NA                      NA               0.00000642
##  4 abandon               0.00000752              0.0000122       0.00000321
##  5 abandoned             0.0000150               0.0000245       0.00000321
##  6 abashed              NA                      NA               0.00000321
##  7 abatement            NA                       0.0000245       0.00000321
##  8 abbot                NA                       0.0000245       0.00000321
##  9 abbott               NA                      NA               0.00000642
## 10 abbreviated          NA                      NA               0.0000128 
## # … with 31,955 more rows

now lets plot

library(scales)
ggplot(frequency2, aes(x = `Thomas Hunt Morgan`, y = `Thomas Henery Huxley`,
                       color = abs(`Thomas Hunt Morgan` - `Thomas Henery Huxley`)))+
  geom_abline(color = "gray40", lty = 2)+
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  ###scale_x_log10(labels = percent_format())+
 ###scale_y_log10(labels = percent_format())+
  scale_color_gradient(limits = c(0, 0.001),
                       low = "darkslategray4", high = "gray75")+
  theme(legend.position="none")+
  labs(y= "Thomas Henry Huxley", x = "Thomas Hunt Morgan")

Sentiment Analysis, pts 1-3

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of a text.

The three we will use are AFINN, Bing, and NRC:

  • Bing - categorizes words in a binary fashion into positive or negative
  • nrc - categorizes words into: positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust
  • AFINN - assigns a score between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment

The function get_sentiments() allows us to get a specific lexicon with the measures for each word.

library(tidytext)

BiocManager::install("textdata", update=FALSE)
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## '?repositories' for details
## 
## replacement repositories:
##     CRAN: https://cloud.r-project.org
## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.1 (2022-06-23)
## Warning: package(s) not installed when version(s) same as current; use `force = TRUE` to
##   re-install: 'textdata'
library(readxl)

setwd("~/Desktop/classroom/myfiles")

afinn<-read.csv("affin.csv")

head(afinn)
##   X       word value
## 1 1    abandon    -2
## 2 2  abandoned    -2
## 3 3   abandons    -2
## 4 4   abducted    -2
## 5 5  abduction    -2
## 6 6 abductions    -2

now lets look at bing

bing <- get_sentiments("bing")

head(bing)
## # A tibble: 6 × 2
##   word       sentiment
##   <chr>      <chr>    
## 1 2-faces    negative 
## 2 abnormal   negative 
## 3 abolish    negative 
## 4 abominable negative 
## 5 abominably negative 
## 6 abominate  negative

and now nrc

library(readr)
setwd("~/Desktop/classroom/myfiles")
nrc <- read.csv("nrc.csv")
head(nrc)
##   X      word sentiment
## 1 1    abacus     trust
## 2 2   abandon      fear
## 3 3   abandon  negative
## 4 4   abandon   sadness
## 5 5 abandoned     anger
## 6 6 abandoned      fear

These lexicons were created either by crowdsourcing (e.g., Amazon Mechanical Turk) or by the labor of one of the authors, and were then validated with crowdsourcing.

Lets look at the words with a joy score from NRC:

library(gutenbergr) 

library(dplyr)

library(stringr)

darwin<-gutenberg_download(c(944,1227, 1228, 2300), mirror="https://mirror2.sandyriver.net/pub/gutenberg")
tidy_books<-darwin%>%
  group_by(gutenberg_id)%>%
  dplyr::mutate(linenumber=row_number(), chapter=cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case=TRUE))))%>%
  ungroup()%>%
  unnest_tokens(word, text)

head(tidy_books)
## # A tibble: 6 × 4
##   gutenberg_id linenumber chapter word  
##          <int>      <int>   <int> <chr> 
## 1          944          1       0 the   
## 2          944          1       0 voyage
## 3          944          1       0 of    
## 4          944          1       0 the   
## 5          944          1       0 beagle
## 6          944          1       0 by

Lets add the book name instead of GID

colnames(tidy_books)[1]<-"book"

tidy_books$book[tidy_books$book==944]<-"The Voyage of the Beagle"

tidy_books$book[tidy_books$book==1227]<-"The Expression of the Emotions in Man and Animals"

tidy_books$book[tidy_books$book==1228]<- "On the Origin of Species By Means of Natural Selection"

tidy_books$book[tidy_books$book==2300]<- "The Descent of Man, and Selection in Relation to Sex"

head(tidy_books)
## # A tibble: 6 × 4
##   book                     linenumber chapter word  
##   <chr>                         <int>   <int> <chr> 
## 1 The Voyage of the Beagle          1       0 the   
## 2 The Voyage of the Beagle          1       0 voyage
## 3 The Voyage of the Beagle          1       0 of    
## 4 The Voyage of the Beagle          1       0 the   
## 5 The Voyage of the Beagle          1       0 beagle
## 6 The Voyage of the Beagle          1       0 by

now that we have a tidy format with 1 word per row, we are ready for sentiment analysis. First, lets use NRC:

nrc_joy <- nrc%>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The Voyage of the Beagle")%>%
  dplyr::inner_join(nrc_joy)%>%
  count(word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 277 × 2
##    word           n
##    <chr>      <int>
##  1 found        301
##  2 good         161
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          86
##  7 present       85
##  8 food          78
##  9 beautiful     61
## 10 elevation     60
## # … with 267 more rows

We can also examine how sentiment changes throughout a work.

library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)
## Joining, by = "word"

now lets plot it:

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE)+ 
  facet_wrap(~book, ncol = 2, scales = "free_x")

Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of The Voyage of the Beagle (TVOTB)

library(tidyr)
voyage<-tidy_books %>%
  filter(book=="The Voyage of the Beagle")

head(voyage)
## # A tibble: 6 × 4
##   book                     linenumber chapter word  
##   <chr>                         <int>   <int> <chr> 
## 1 The Voyage of the Beagle          1       0 the   
## 2 The Voyage of the Beagle          1       0 voyage
## 3 The Voyage of the Beagle          1       0 of    
## 4 The Voyage of the Beagle          1       0 the   
## 5 The Voyage of the Beagle          1       0 beagle
## 6 The Voyage of the Beagle          1       0 by

Lets again use the integer division operator ‘%/%’ to define larger sections of the text that span multiple lines, and we can use the same pattern with ‘count()’, ‘pivot_wider()’ and ‘dplyr::mutate()’ to find the net sentiment in each section of the text:
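
As a quick aside, here is what the integer division does to line numbers (each value maps to an 80-line section index):

c(0, 79, 80, 159, 160) %/% 80
## [1] 0 0 1 1 2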

library(dplyr)
affin<-voyage%>%
  inner_join(afinn)%>%
  group_by(index=linenumber %/% 80)%>%
  summarise(sentiment=sum(value))%>%
  dplyr::mutate(method="AFINN")
## Joining, by = "word"
bing_and_nrc<-bind_rows(
  voyage%>%
    inner_join(get_sentiments("bing"))%>%
    dplyr::mutate(method="Bing et al."), 
  voyage%>%
    inner_join(nrc)%>%
    filter(sentiment %in% c("positive", "negative"))%>%
    dplyr::mutate(method="NRC")
  )%>%
  count(method, index=linenumber %/% 80, sentiment)%>%
  pivot_wider(names_from = sentiment, 
              values_from = n,
              values_fill = 0)%>%
  dplyr::mutate(sentiment=positive-negative)
## Joining, by = "word"
## Joining, by = "word"

N-grams

Section 3 - N-grams #1-3

So far we’ve only looked at single words, but many interesting (more accurate) analyses are based on the relationship between words.

Lets look at some methods in tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)
library(gutenbergr)
darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"

darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"

darwin_books$book[darwin_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"

darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

head(darwin_bigrams)
## # A tibble: 6 × 2
##   book                     bigram        
##   <chr>                    <chr>         
## 1 The Voyage of the Beagle the voyage    
## 2 The Voyage of the Beagle voyage of     
## 3 The Voyage of the Beagle of the        
## 4 The Voyage of the Beagle the beagle    
## 5 The Voyage of the Beagle beagle by     
## 6 The Voyage of the Beagle charles darwin

This data is still in tidy text format, and it’s structured as one token per row. Each token is a bigram.

Counting and filtering n-grams

darwin_bigrams%>%
  count(bigram, sort = TRUE)
## # A tibble: 238,516 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the   11297
##  2 <NA>      8947
##  3 in the    5257
##  4 on the    4093
##  5 to the    2849
##  6 the same  2048
##  7 that the  1947
##  8 it is     1830
##  9 of a      1610
## 10 and the   1590
## # … with 238,506 more rows

Most of the common bigrams are stop words. This is a good time to use tidyr’s separate() command, which splits a column into multiple columns based on a delimiter. This will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigram_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)
bigram_counts <- bigram_filtered %>%
  unite(bigram, word1, word2, sep = " ")

head(bigram_counts)
## # A tibble: 6 × 2
##   book                     bigram        
##   <chr>                    <chr>         
## 1 The Voyage of the Beagle charles darwin
## 2 The Voyage of the Beagle NA NA         
## 3 The Voyage of the Beagle NA NA         
## 4 The Voyage of the Beagle NA NA         
## 5 The Voyage of the Beagle NA NA         
## 6 The Voyage of the Beagle NA NA

We may also be interested in trigrams, which are three-word combinations

trigrams <- darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word, 
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

head(trigrams)
## # A tibble: 6 × 4
##   word1     word2  word3           n
##   <chr>     <chr>  <chr>       <int>
## 1 <NA>      <NA>   <NA>         9884
## 2 tierra    del    fuego          92
## 3 secondary sexual characters     91
## 4 captain   fitz   roy            45
## 5 closely   allied species        30
## 6 de        la     physionomie    30

Lets analyze some bigrams

bigram_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 39 × 3
##    book                                                   word1           n
##    <chr>                                                  <chr>       <int>
##  1 The Descent of Man, and Selection in Relation to Sex   sexual        254
##  2 On the Origin of Species By Means of Natural Selection natural       250
##  3 The Descent of Man, and Selection in Relation to Sex   natural       156
##  4 On the Origin of Species By Means of Natural Selection sexual         18
##  5 On the Origin of Species By Means of Natural Selection continued       6
##  6 The Descent of Man, and Selection in Relation to Sex   unconscious     6
##  7 On the Origin of Species By Means of Natural Selection methodical      5
##  8 The Descent of Man, and Selection in Relation to Sex   continued       5
##  9 On the Origin of Species By Means of Natural Selection unconscious     4
## 10 The Expression of the Emotions in Man and Animals      natural         4
## # … with 29 more rows

lets again look at tf-idf, this time of bigrams across Darwin’s works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

head(bigram_tf_idf)
## # A tibble: 6 × 6
##   book                                        bigram     n      tf   idf  tf_idf
##   <chr>                                       <chr>  <int>   <dbl> <dbl>   <dbl>
## 1 The Expression of the Emotions in Man and … nerve…    47 0.00350 1.39  0.00485
## 2 On the Origin of Species By Means of Natur… natur…   250 0.0160  0.288 0.00460
## 3 The Expression of the Emotions in Man and … la ph…    35 0.00260 1.39  0.00361
## 4 The Voyage of the Beagle                    bueno…    54 0.00245 1.39  0.00339
## 5 The Voyage of the Beagle                    capta…    53 0.00240 1.39  0.00333
## 6 On the Origin of Species By Means of Natur… glaci…    36 0.00230 1.39  0.00319
library(ggplot2)


bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  dplyr::mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x= "tf-idf of bigrams", y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 867 × 3
##    word1 word2     n
##    <chr> <chr> <int>
##  1 not   be      128
##  2 not   have    104
##  3 not   only    103
##  4 not   a       100
##  5 not   to       98
##  6 not   been     89
##  7 not   the      82
##  8 not   at       70
##  9 not   know     60
## 10 not   so       58
## # … with 857 more rows

By doing sentiment analysis on bigrams, we can examine how often sentiment-associated words are preceded by a modifier like “not” or other negating words.

library(tidytext)
library(textdata)
library(readr)
setwd("~/Desktop/classroom/myfiles")
AFINN <- read.csv("affin.csv")

head(AFINN)
##   X       word value
## 1 1    abandon    -2
## 2 2  abandoned    -2
## 3 3   abandons    -2
## 4 4   abducted    -2
## 5 5  abduction    -2
## 6 6 abductions    -2

We can examine the most frequent words that were preceded by “not” and associated with a sentiment value:

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

head(not_words)
## # A tibble: 6 × 3
##   word2     value     n
##   <chr>     <int> <int>
## 1 doubt        -1    25
## 2 like          2    11
## 3 pretend      -1     9
## 4 wish          1     8
## 5 admit        -1     7
## 6 difficult    -1     5
library(ggplot2)

not_words %>%
  dplyr::mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  dplyr::mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0 )) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentimen value * number or occurences", y = "words preceeded by \"not\"")

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>%
  filter(word1%in% negation_words)%>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)

head(negated_words)
## # A tibble: 6 × 4
##   word1 word2   value     n
##   <chr> <chr>   <int> <int>
## 1 no    doubt      -1   210
## 2 not   doubt      -1    25
## 3 no    great       3    19
## 4 not   like        2    11
## 5 not   pretend    -1     9
## 6 not   wish        1     8

lets visualize the negation words

negated_words %>%
  dplyr::mutate(contribution = n *value,
         word2=reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1)%>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("words preceded by negation term") +
  ylab("sentiment value * # of occurrences") +
  coord_flip()

visualize a network of bigrams with ggraph

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:mosaic':
## 
##     compare
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following object is masked from 'package:plotly':
## 
##     groups
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
bigram_counts <- bigram_filtered %>%
  count(word1, word2, sort = TRUE)


bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
head(bigram_graph)
## 6 x 203 sparse Matrix of class "dgCMatrix"
##    [[ suppressing 203 column names 'NA', 'natural', 'sexual' ... ]]
##                                                                                
## NA      1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## natural . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## sexual  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## lower   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## south   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
##                                                                                
## NA      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## natural . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## sexual  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## lower   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## south   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
##                                                                                
## NA      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## natural . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## sexual  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . . . . .
## lower   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## south   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
##                                                                                
## NA      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## natural 1 . . . . . . . 1 . . . . . . . . . . . . . . . . . . . . . . . . . . .
## sexual  1 . 1 . . 1 . . . . . 1 . . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . 1 . . . . . . . . . . . . . 1 . . . . . . . . . . . . . .
## lower   . 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## south   . . . 1 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
##                                                                                
## NA      . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## natural . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . . . .
## sexual  . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
## lower   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 1 . . . .
## south   . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
##                                                      
## NA      . . . . . . . . . . . . . . . . . . . . . . .
## natural . . . . . . . . . . . . . . . . . . . . . . .
## sexual  . . . . . . . . . . . . . . . . . . . . . . .
## vol     . . . . . . . . . . . . . . . . . . . . . . .
## lower   . . . . . . . . . . . . . . . . . . . . . . .
## south   . 1 . . . . . . . . . . . . . . . . . . . . .
library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

we can also add directionality to this network

set.seed(1234)
a <- grid::arrow(type="closed", length=unit(0.15, "inches"))

ggraph(bigram_graph, layout ="fr")+
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, "inches")) +
  geom_node_point(color="lightblue", size = 5)+
  geom_node_text(aes(label=name), vjust = 1, hjust = 1)+
  theme_void()

Word Frequency

Section 4 - Word Frequency

A central question in text mining is how to quantify what a document is about. We can do this by looking at the words that make up the document and measuring their frequency.

There are a lot of words that may not be important; these are the stop words.

One way to remedy this is to look at a word’s inverse document frequency, which decreases the weight for commonly used words and increases the weight for words that are not used very much across a collection of documents.
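
Concretely, the usual definitions (and the ones used by tidytext’s bind_tf_idf(), which we apply below) are:

idf(term) = ln(number of documents / number of documents containing the term)
tf-idf(term, document) = tf(term, document) * idf(term)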

Term frequency in Darwin’s works:

library(dplyr)
library(tidytext)
library(tidyr)

book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://mirror2.sandyriver.net/pub/gutenberg")

colnames(book_words)[1] <- "book"

book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Decesnt of Man, and Seletion in Relation to Sex"

now lets dissect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

head(book_words)
## # A tibble: 6 × 3
##   book                                                   word      n
##   <chr>                                                  <chr> <int>
## 1 The Decesnt of Man, and Seletion in Relation to Sex    the   25490
## 2 The Voyage of the Beagle                               the   16930
## 3 The Decesnt of Man, and Seletion in Relation to Sex    of    16762
## 4 On the Origin of Species By Means of Natural Selection the   10301
## 5 The Voyage of the Beagle                               of     9438
## 6 The Decesnt of Man, and Seletion in Relation to Sex    in     8882
book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  dplyr::summarize(total = sum(n))

head(book_words)
## # A tibble: 6 × 3
##   book                                                   word      n
##   <chr>                                                  <chr> <dbl>
## 1 The Decesnt of Man, and Seletion in Relation to Sex    the   25490
## 2 The Voyage of the Beagle                               the   16930
## 3 The Decesnt of Man, and Seletion in Relation to Sex    of    16762
## 4 On the Origin of Species By Means of Natural Selection the   10301
## 5 The Voyage of the Beagle                               of     9438
## 6 The Decesnt of Man, and Seletion in Relation to Sex    in     8882
book_words <- left_join(book_words, total_words)
## Joining, by = "book"
head(book_words)
## # A tibble: 6 × 4
##   book                                                   word      n  total
##   <chr>                                                  <chr> <dbl>  <dbl>
## 1 The Decesnt of Man, and Seletion in Relation to Sex    the   25490 311041
## 2 The Voyage of the Beagle                               the   16930 208118
## 3 The Decesnt of Man, and Seletion in Relation to Sex    of    16762 311041
## 4 On the Origin of Species By Means of Natural Selection the   10301 157002
## 5 The Voyage of the Beagle                               of     9438 208118
## 6 The Decesnt of Man, and Seletion in Relation to Sex    in     8882 311041

We can see that the usual suspects are the most common words, but that doesn’t tell us much about what each book is actually about.

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009)+
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 515 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_bar).

Zipf’s Law (further referred to as Z’s Law) - the frequency with which a word appears is inversely proportional to its rank in a frequency table (roughly, frequency ∝ 1/rank)

lets apply Z’s Law to Darwin’s words

freq_by_rank <- book_words %>%
  group_by(book)%>%
  dplyr::mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()

head(freq_by_rank)
## # A tibble: 6 × 6
##   book                                          word      n  total  rank term …¹
##   <chr>                                         <chr> <dbl>  <dbl> <int>   <dbl>
## 1 The Decesnt of Man, and Seletion in Relation… the   25490 311041     1  0.0820
## 2 The Voyage of the Beagle                      the   16930 208118     1  0.0813
## 3 The Decesnt of Man, and Seletion in Relation… of    16762 311041     2  0.0539
## 4 On the Origin of Species By Means of Natural… the   10301 157002     1  0.0656
## 5 The Voyage of the Beagle                      of     9438 208118     2  0.0453
## 6 The Decesnt of Man, and Seletion in Relation… in     8882 311041     3  0.0286
## # … with abbreviated variable name ¹​`term frequency`
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()
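
Z’s Law predicts a slope close to -1 on this log-log plot. A minimal sketch of checking that with a linear fit over the middle of the rank range (the cutoffs of 10 and 500 are arbitrary choices here):

rank_subset <- freq_by_rank %>%
  filter(rank < 500,
         rank > 10)

lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)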

lets use tf-idf to find important words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very much in a collection of documents

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

head(book_tf_idf)
## # A tibble: 6 × 7
##   book                                    word      n  total     tf   idf tf_idf
##   <chr>                                   <chr> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
## 1 The Decesnt of Man, and Seletion in Re… the   25490 311041 0.0820     0      0
## 2 The Voyage of the Beagle                the   16930 208118 0.0813     0      0
## 3 The Decesnt of Man, and Seletion in Re… of    16762 311041 0.0539     0      0
## 4 On the Origin of Species By Means of N… the   10301 157002 0.0656     0      0
## 5 The Voyage of the Beagle                of     9438 208118 0.0453     0      0
## 6 The Decesnt of Man, and Seletion in Re… in     8882 311041 0.0286     0      0
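
As a sanity check, the tf, idf, and tf_idf columns above could be recomputed by hand from book_words using the formulas given earlier; a minimal sketch (the same arithmetic, not how bind_tf_idf() is implemented internally):

manual_tf_idf <- book_words %>%
  dplyr::mutate(tf = n / total) %>%
  group_by(word) %>%
  dplyr::mutate(idf = log(n_distinct(book_words$book) / n())) %>%
  ungroup() %>%
  dplyr::mutate(tf_idf = tf * idf)

head(manual_tf_idf)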

lets look at terms with high tf-idf in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 43,024 × 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <dbl>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and … tears   126 1.14e-3 1.39  1.58e-3
##  2 The Expression of the Emotions in Man and … blush   114 1.03e-3 1.39  1.43e-3
##  3 The Expression of the Emotions in Man and … eyeb…   149 1.35e-3 0.693 9.35e-4
##  4 The Voyage of the Beagle                    degs    117 5.62e-4 1.39  7.79e-4
##  5 On the Origin of Species By Means of Natur… sele…   412 2.62e-3 0.288 7.55e-4
##  6 The Decesnt of Man, and Seletion in Relati… sexu…   745 2.40e-3 0.288 6.89e-4
##  7 The Decesnt of Man, and Seletion in Relati… shewn   143 4.60e-4 1.39  6.37e-4
##  8 On the Origin of Species By Means of Natur… hybr…   133 8.47e-4 0.693 5.87e-4
##  9 The Expression of the Emotions in Man and … frown    46 4.17e-4 1.39  5.78e-4
## 10 The Decesnt of Man, and Seletion in Relati… sele…   621 2.00e-3 0.288 5.74e-4
## # … with 43,014 more rows

lets look at a visualization for these high tf-idf words

library(forcats)


book_tf_idf %>%
  group_by(book)%>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free")+
  labs(x="tf-idf", y = NULL)

Mining APIs

Twitter Sentiment Analysis

lets load the Twitter API package

install.packages("academictwitteR")
library(academictwitteR)
setwd("~/Desktop/classroom/myfiles/starter_tweets")
bearer_token <- "AAAAAAAAAAAAAAAAAAAAAByFhgEAAAAAl4Ql0CyArwygcl0FEWUO5ARSaAQ%3DVJpyDIOBKBVdtgnEv7wrpdwoeI5WbQLbxfSYElUGZuwix0kbQm"


tweets20US <-
  get_all_tweets(
    "covid-19 has:geo",
    "2020-06-01T01:00:00z",
    "2020-06-08T01:00:00Z",
    bearer_token,
    n = 5000,
    country = "US"
  )

tweets21US <-
  get_all_tweets(
    "covid-19 has:geo",
    "2021-01-01T01:00:00z",
    "2021-01-08T01:00:00Z",
    bearer_token,
    n = 5000,
    country = "US"
  )

tweets22US <-
  get_all_tweets(
    "covid-19 has:geo",
    "2022-01-01T01:00:00z",
    "2022-01-08T01:00:00Z",
    bearer_token,
    n = 5000,
    country = "US"
  )

first lets load our data

library(readr)
BiocManager::install("tidytweets", update=FALSE)
## 'getOption("repos")' replaces Bioconductor standard repositories, see
## '?repositories' for details
## 
## replacement repositories:
##     CRAN: https://cloud.r-project.org
## Bioconductor version 3.15 (BiocManager 1.30.18), R 4.2.1 (2022-06-23)
## Installing package(s) 'tidytweets'
## Warning: package 'tidytweets' is not available for Bioconductor version '3.15'
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
setwd("~/Desktop/classroom/myfiles/starter_tweets")
tweets20US<-read.csv("tweets20us.csv", row.names=1)
tweets21US<-read.csv("tweets21us.csv", row.names=1)
tweets22US<-read.csv("tweets22us.csv", row.names=1)
setwd("~/Desktop/classroom/myfiles/starter_tweets")
tweets20CA<-read.csv("tweets20CA.csv", row.names=1)
tweets21CA<-read.csv("tweets21CA.csv", row.names=1)
tweets22CA<-read.csv("tweets22CA.csv", row.names=1)
setwd("~/Desktop/classroom/myfiles/starter_tweets")
tweets20GB<-read.csv("tweets20GB.csv", row.names=1)
tweets21GB<-read.csv("tweets21GB.csv", row.names=1)
tweets22GB<-read.csv("tweets22GB.csv", row.names=1)

now lets load libraries

library(lubridate)
library(ggplot2)
library(dplyr)
library(readr)

lets combine our US tweets

tweetsUS <- bind_rows(tweets21US %>%
                        dplyr::mutate(year = "y2021"),
                      tweets22US %>%
                        dplyr::mutate(year = "y2022"),
                      tweets20US %>%
                        dplyr::mutate(year = "year2022")) %>%
  dplyr::mutate(timestamp = ymd_hms(created_at))

lets combine our GB tweets:

tweetsGB <- bind_rows(tweets21GB %>%
                        dplyr::mutate(year = "y2021"),
                      tweets22GB %>%
                        dplyr::mutate(year = "y2022"),
                      tweets20GB %>%
                        dplyr::mutate(year = "year2022")) %>%
  dplyr::mutate(timestamp = ymd_hms(created_at))

lets combine our Canada tweets:

tweetsCA <- bind_rows(tweets21CA %>%
                        dplyr::mutate(year = "y2021"),
                      tweets22CA %>%
                        dplyr::mutate(year = "y2022"),
                      tweets20CA %>%
                        dplyr::mutate(year = "year2022")) %>%
  dplyr::mutate(timestamp = ymd_hms(created_at))

now lets clean our US tweets:

library(tidytext)
library(stringr)
remove_reg <- "&amp;|&lt;|&gt;"
tidy_tweetsUS <- tweetsUS %>%
  filter(!str_detect(text, "^RT")) %>%
  dplyr::mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"))
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.

lets clean our GB tweets:

remove_reg <- "&amp;|&lt;|&gt;"

tidy_tweetsGB <- tweetsGB %>%
  filter(!str_detect(text, "^RT")) %>%
  dplyr::mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"))
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.

lets clean our CA tweets:

remove_reg <- "&amp;|&lt;|&gt;"

tidy_tweetsCA <- tweetsCA %>%
  filter(!str_detect(text, "^RT")) %>%
  dplyr::mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tokens(word, text, token = "tweets") %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]"))
## Using `to_lower = TRUE` with `token = 'tweets'` may not preserve URLs.

lets take a look at our US data

ggplot(tweetsUS, aes(x=timestamp, fill = year)) +
  geom_histogram(position = "identity", bins =20, show.legend = FALSE) +
  facet_wrap(~year, ncol = 1)

now the GB data

ggplot(tweetsGB, aes(x=timestamp, fill = year)) +
  geom_histogram(position = "identity", bins =20, show.legend = FALSE) +
  facet_wrap(~year, ncol = 1)

now the CA data

ggplot(tweetsCA, aes(x=timestamp, fill = year)) +
  geom_histogram(position = "identity", bins =20, show.legend = FALSE) +
  facet_wrap(~year, ncol = 1)

frequencyUS <- tidy_tweetsUS %>%
  count(year, word, sort = TRUE) %>%
  left_join(tidy_tweetsUS %>%
              count(year, name = "total")) %>%
  dplyr::mutate(freq = n/total)
## Joining, by = "year"
head(frequencyUS)
##       year     word    n total        freq
## 1    y2021  covid19 3832 66382 0.057726492
## 2 year2022  covid19 3626 67279 0.053894975
## 3    y2022  covid19 2568 56060 0.045808063
## 4    y2022 #covid19  894 56060 0.015947199
## 5    y2021  vaccine  643 66382 0.009686361
## 6 year2022    covid  602 67279 0.008947814
frequencyGB <- tidy_tweetsGB %>%
  count(year, word, sort = TRUE) %>%
  left_join(tidy_tweetsGB %>%
              count(year, name = "total")) %>%
  dplyr::mutate(freq = n/total)
## Joining, by = "year"
head(frequencyGB)
frequencyCA <- tidy_tweetsCA %>%
  count(year, word, sort = TRUE) %>%
  left_join(tidy_tweetsCA %>%
              count(year, name = "total")) %>%
  dplyr::mutate(freq = n/total)
## Joining, by = "year"
head(frequencyCA)
library(tidyr)
setwd("~/Desktop/classroom/myfiles/starter_tweets")

frequencyUS2<-frequencyUS%>%
  select(year, word, freq)%>%
  pivot_wider(names_from=year, values_from=freq)%>%
  arrange('y2020', 'y2021', 'y2022')
head(frequencyUS2)
library(tidyr)
setwd("~/Desktop/classroom/myfiles/starter_tweets")

frequencyGB2 <- frequencyGB %>%
  select(year, word, freq) %>%
  pivot_wider(names_from = year, values_from = freq) %>%
  arrange('y2020', 'y2021', 'y2022')

head(frequencyGB2)
library(tidyr)
setwd("~/Desktop/classroom/myfiles/starter_tweets")

frequencyCA2 <- frequencyCA %>%
  select(year, word, freq) %>%
  pivot_wider(names_from = year, values_from = freq) %>%
  arrange('y2020', 'y2021', 'y2022')

head(frequencyCA2)
library(scales)
ggplot(frequencyUS2, aes(y2021, y2022)) +
  geom_jitter(alpha = 0.05, size = 2.5, width = 0.25, height = 0.25) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  geom_abline(color = "red")

finding joy

library(dplyr)
library(stringr)
nrc_joy <- (nrc) %>%
  filter(sentiment == "joy")

tidy_tweetsUS %>%
  filter(year == "y2020") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## [1] word n   
## <0 rows> (or 0-length row.names)

finding anger

nrc_anger <- nrc %>%
  filter(sentiment == "anger")

tidy_tweetsGB %>%
  filter(year == "y2020") %>%
  inner_join(nrc_anger) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## [1] word n   
## <0 rows> (or 0-length row.names)

finding fear

nrc_fear <- nrc %>%
  filter(sentiment == "fear")

tidy_tweetsCA %>%
  filter(year == "y2020") %>%
  inner_join(nrc_fear) %>%
  count(word, sort = TRUE)
## Joining, by = "word"
## [1] word n   
## <0 rows> (or 0-length row.names)
GB_Covid_sentiment <- tidy_tweetsGB %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = id %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

ggplot(GB_Covid_sentiment, aes(sentiment, fill = year)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~year, ncol = 2, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GB_Covid_sentiment_means <- aggregate(x = GB_Covid_sentiment$sentiment, by = list(GB_Covid_sentiment$year), FUN = mean)

GB_Covid_sentiment_means
##    Group.1          x
## 1    y2021 -0.3708450
## 2    y2022 -0.4546649
## 3 year2022 -0.5469217
US_Covid_sentiment <- tidy_tweetsUS %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = id %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

ggplot(US_Covid_sentiment, aes(sentiment, fill = year)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~year, ncol = 2, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

US_Covid_sentiment_means <- aggregate(x = US_Covid_sentiment$sentiment, by = list(US_Covid_sentiment$year), FUN = mean)

US_Covid_sentiment_means
##    Group.1          x
## 1    y2021 -0.6293752
## 2    y2022 -0.5157497
## 3 year2022 -0.6560050
CA_Covid_sentiment <- tidy_tweetsCA %>%
  inner_join(get_sentiments("bing")) %>%
  count(year, index = id %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)
## Joining, by = "word"
library(ggplot2)

ggplot(CA_Covid_sentiment, aes(sentiment, fill = year)) +
  geom_histogram(show.legend = FALSE) +
  facet_wrap(~year, ncol = 2, scales = "free_x")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

CA_Covid_sentiment_means <- aggregate(x = CA_Covid_sentiment$sentiment, by = list(CA_Covid_sentiment$year), FUN = mean)

CA_Covid_sentiment_means
bing_word_counts_US <- tidy_tweetsUS %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
head(bing_word_counts_US)
##       word sentiment   n
## 1 positive  positive 609
## 2    trump  positive 433
## 3    virus  negative 411
## 4     died  negative 297
## 5    death  negative 292
## 6     safe  positive 237
bing_word_counts_US %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  dplyr::mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to Sentiment", y = NULL)

customizing

custom_stop_words <- bind_rows(tibble(word = c("trump", "positive"),
                                      lexicon = c("custom")),
                               stop_words)

custom_stop_words
## # A tibble: 1,151 × 2
##    word        lexicon
##    <chr>       <chr>  
##  1 trump       custom 
##  2 positive    custom 
##  3 a           SMART  
##  4 a's         SMART  
##  5 able        SMART  
##  6 about       SMART  
##  7 above       SMART  
##  8 according   SMART  
##  9 accordingly SMART  
## 10 across      SMART  
## # … with 1,141 more rows

Word clouds

The acast() function was not found when this was run; it comes from the reshape2 package. These are the code chunks for word clouds.

library(wordcloud)
library(reshape2)
library(tidytext)
tidy_tweetsUS %>%
  anti_join(custom_stop_words) %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), 
                   max.words = 100)
tidy_tweetsUS %>%
  anti_join(custom_stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")

wordcounts <- tidy_tweetsCA %>%
  group_by(year) %>%
  dplyr::summarize(words = n())

tidy_tweetsCA %>%
  semi_join(bingnegative) %>%
  group_by(year) %>%
  dplyr::summarize(negativewords = n()) %>%
  left_join(wordcounts, by = "year") %>%
  dplyr::mutate(ratio = negativewords/words) %>%
  slice_max(ratio, n = 3) %>%
  ungroup()
## Joining, by = "word"
## # A tibble: 3 × 4
##   year     negativewords words  ratio
##   <chr>            <int> <int>  <dbl>
## 1 year2022          2603 48248 0.0540
## 2 y2021             1647 31542 0.0522
## 3 y2022              723 16889 0.0428