Barplots

Now lets take a look at some ggplots barplots

we’ll start with making a dataframe based on the tooth data.

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))
df
##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

and now lets make a second dataframe

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Lets load up ggplot2

library(ggplot2)

Lets set our paramters for ggplot

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = -0.3)

Now lets add the labels inside the bars

f + geom_col(fill = "darkblue") +
  geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white") + 
  scale_color_manual(values = c("blue", "gold", "red"))

This is kinda hard to see, so lets change the fill.

f + geom_col(aes(fill = dose)) +
  scale_fill_manual(values = c("blue", "gold", "red"))

ok how do we do this with multiple groups

ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_stack()) + scale_color_manual(values = c("blue", "gold")) + scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len)) +
  geom_col(aes(color = supp, fill = supp), position = position_dodge(0.8), width = 0.7) + scale_color_manual(values = c("blue", "gold")) + scale_fill_manual(values = c("blue", "gold"))
p

Now lets add those labels to the dodged barplot

p + geom_text(
  aes(label = len, group = supp),
  position = position_dodge(0.8),
  vjust = -0.3, size = 3.5
)

Now what if we want to add labels to our stacked barplots? For this we need dplyr

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  group_by(dose) %>%
  arrange(dose, desc(supp)) %>%
  mutate(lab_ypos = cumsum(len) -0.5 * len)
df2
## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8      3.4
## 2 OJ    D0.5    4.2      8.9
## 3 VC    D1     15        7.5
## 4 OJ    D1     10       20  
## 5 VC    D2     33       16.5
## 6 OJ    D2     29.5     47.8

Now lets recreate our stacked graphs

ggplot(df2, aes(x=dose, y = len)) +
  geom_col(aes(fill = supp), width = 0.7) +
  geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

Boxplots

Lets look at some boxplots

data("ToothGrowth")

Lets change the dose to a factor, amd look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)
##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(
  theme_bw() +
    theme(legend.position = "top")
)

Lets start with a very basic boxplot with dose vs length

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a box plot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") + 
  stat_summary(fun.y = mean, geom = "point", shape= 18, size = 2.5, color = "indianred")
## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

we can also change the scale number of variables included, and their order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("0.5", "2"))
## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

Lets put our x axis in descending order

tg + geom_boxplot() +
  scale_x_discrete(limits = c("2", "1", "0.5"))

we can also change boxplot colors by groups

tg + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = c("indianred", "blue1", "green2"))

what if we want to display our data subset by oj vs viatamin c?

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("999999", "#E69F00"))
                      
tg2

we can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

Histograms

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F","M"), each = 200)), 
  weight = c(rnorm(200, 50), rnorm(200, 58))
)

head(wdata, 4)
##   sex   weight
## 1   F 48.79293
## 2   F 50.27743
## 3   F 51.08444
## 4   F 47.65430

Now lets load dplyr

library(dplyr)

mu <- wdata %>% 
  group_by(sex) %>%
  summarise(grp.mean = mean(weight))

Now lets load the plotting package

library(ggplot2)

theme_set(
  theme_classic() + 
    theme(legend.position = "bottom")
  
)

Now lets create a ggplot object

a <- ggplot(wdata, aes(x = weight))

a + geom_histogram(bins = 30, color = "black", fill = "grey") + 
  geom_vline(aes(xintercept = mean(weight)),
             linetype = "dashed", size = 0.6)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex, fill = sex), position = "identity") +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  scale_fill_manual(values = c("indianred", "#lightblue1"))
## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: fill
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

What if we want to combine density plots and histograms?

a + geom_histogram(aes(y = stat(density)),
                   color = "black", fill = "white") +
  geom_density(alpha = 0.2, fill = "#FF6666")
## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex),
                   fill = "white", position = "identity") +
  geom_density(aes(color = sex), size = 1) +
  scale_color_manual(values = c("indianred", "lightblue1"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Dot Plots

First lets load the required packages

library(ggplot2)

Lets set our theme

theme_set(
  theme_dark() +
    theme(legend.position = "top")
  )

First lets initiate a ggplot object called TG

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

tg <- ggplot(ToothGrowth, aes(x=dose, y = len))

lets create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets add a boxplot and dotplot

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) +
  geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") +
  stat_summary(fun = mean, fun.args = list(mult=1))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
  geom_dotplot(aes(fill = supp), binaxis = 'y', stackdir = 'center') +
  scale_fill_manual(values = c("indianred", "lightblue"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill = supp, color = supp), binaxis = 'y', stackdir = 'center',
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AFBB", "#00E7B800")) +
  scale_color_manual(values = c("#00AFBB", "#E7B800"))
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Line Plots

Now lets change it up and look at some line plots

We’ll start by making a custom dataframe kinda like the tooth dataset. This way we can see the lines and stuff that we’re modifying

df <- data.frame(dose = c("D0.5", "D1", "D2"),
                 len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "OJ"), each = 3),
                  dose = rep(c("D0.5", "D1", "D2"), 2),
                  len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2
##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Now lets again load ggplot2 and set a theme

library(ggplot2)

theme_set(
  theme_gray() +
    theme(legend.position = "right")
  
)

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function(){
  oldpar <- par()
  par(font = 2, mar = c(0,0,0,0))
  plot(1, pch="", ylim =c(0,6), xlim=c(0,0.7), axes = FALSE, xlab = "", ylab = "")
  for(i in 0:6) lines(c(0.3,0.7), c(i,i), lty=i, lwd =3)
  text(rep(0.1,6), 0:6, labels = c("0.'Blank'", "1.'solid'", "2.'dashed'", "3.'dotted'",
                                   "4.'dotdash'", "5.'longdash'", "6.'twodash'"))
  par(mar=oldpar$mar, font=oldpar$font)
  
}

generateRLineTypes()

Now lets build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))

p + geom_line() + geom_point()

Now lets modify the line type and color

p + geom_line(linetype = "dashed", color = "steelblue") +
  geom_point(color = "steelblue")

Now lets try a step graph, which indicates a threshold type progression

p + geom_step() + geom_point()

Now lets move on to making multiple groups. First we’ll create our ggplot objects

p <- ggplot(df2, aes(x=dose, y=len, group = supp))

Now lets change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) +
  geom_point(aes(shape = supp, color = supp)) +
  scale_color_manual(values = c("red", "blue"))

Now lets look at line plots with a numeric x axis

df3 <- data.frame(supp = rep(c("VC", "DJ"), each = 3),
                  dose = rep(c("0.5", "1", "2"), 2),
                  len=c(6.8, 15, 33, 4.2, 10, 29.5))
df3
##   supp dose  len
## 1   VC  0.5  6.8
## 2   VC    1 15.0
## 3   VC    2 33.0
## 4   DJ  0.5  4.2
## 5   DJ    1 10.0
## 6   DJ    2 29.5

Now lets plot where both axises are treated as continous labels

df3$dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, goup = supp, color = supp)) +
  geom_line() + geom_point()

Now lets look at a line graph with having the x axis as dates. We’ll use the built in economics time series for this example.

head(economics)
## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018
ggplot(data = economics, aes(x = date, y = pop)) +
  geom_line()

Now lets subset the data

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

We can also change the line size, for instance by another variable like unemployment

ggplot(data = economics, aes(x=date, y = pop)) +
  geom_line(aes(size = unemploy/pop))

We can also put multiple time-series data

ggplot(economics, aes(x = date)) +
  geom_line(aes(y=psavert), color = "darkred") +
  geom_line(aes(y = uempmed), color = "steelblue", linetype = "twodash")

Lastly, lets make this into an area plot

ggplot(economics, aes(x=date)) +
  geom_area(aes(y = psavert), fill = "#999999",
            color = "#999999", alpha = 0.5) +
  geom_area(aes(y = uempmed), fill = "#E69F00", 
            color = "#E69F00", alpha = 0.5)

Ridge Plots

First lets load the required packages

library(ggplot2)
library(ggridges)
#BiocManager::install("ggridges")

now lets load some sample data

?airquality
air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air
## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)
## Loading required package: viridisLite
ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
  geom_density_ridges_gradient() +
  scale_fill_viridis(option = "C", name = "Temp")
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data.

library(tidyr)

airquality %>%
gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) +
  geom_density_ridges() +
  facet_wrap(~ Measurement, scales = "free")
## Picking joint bandwidth of 11
## Picking joint bandwidth of 40.1
## Picking joint bandwidth of 2.65
## Picking joint bandwidth of 1.44
## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

Density Plots

A density plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 55), rnorm(200, 58))
)
library(dplyr)
mu <- wdata %>%
  group_by(sex) %>%
summarise(grp.mean = mean(weight))

Now lets load the graphing packages

library(ggplot2)
theme_set(
  theme_classic() +
    theme(legend.position = "right")
)

Now lets do the basic plot function. First we will create a ggplot object

d <- ggplot(wdata, aes(x = weight))

Now lets do a basic density plot

d + geom_density() +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now lets change the y axis to count instead of density

d + geom_density(aes(y = stat(count)), fill = "lightgray") +
  geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

d + geom_density(aes(color = sex)) +
  scale_color_manual(values = c("darkgray", "gold"))

Lastly, lets fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = mu, linetype = "dashed") +
  scale_color_manual(values = c("grey", "gold")) +
  scale_fill_manual(values = c("grey", "gold"))

Plotly Line Plots

First lets load our required package

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout

Lets start with a scatter plot of the orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:",circumference)
)
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(35, mean = 120, sd = 10)
new_data <- data.frame(Orange, trace_1)

we’ll use the random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "<br>Circ:", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.
plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~circumference,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ", circumference)) %>%
  add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "Plot or Orange data with switchable trace",
    updatemenus = list(
      list(
        type = 'downdrop',
        y = 0.8,
        buttons = list(
          list(method = 'restyle',
               args = list('mode','markers'),
               label = "Marker"
          ),
          list(method = "restyle",
               args = list('mode', 'markers'),
               labels = "Lines"
          )
        )
      )
    )
  )
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Plotly 3D

First lets load our required packages

library(plotly)

Now lets create a random 3d matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5),
  y <- seq(1,10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) * length(d$y)), nrow = length(d$x), ncol = length(d$y))

Now lets plot our 3D data

plot_ly(d, x=~x, y = ~y, z = ~z) %>%
  add_surface()

Lets add some more aspects to it, such as at topogrophy

plot_ly(d, x = ~x, y =~y, z = ~z) %>%
  add_surface(
    contours = list(
      z = list(
      show = TRUE, 
      usecolormap = TRUE, 
      highlightcolor = "FF0000",
      project = list(z = TRUE)
      )
    )
  )

Now lets look at a 3d scatter plot

plot_ly(longley, x = ~GNP, y= ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

Error Bars

First lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position = 'top')
)

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for multipulation purposes

df.summary <- df %>%
  group_by(dose) %>%
  summarise(
    sd = sd(len, na.rm = TRUE),
    stderr = std.error(len, na.rm = TRUE),
      len = mean(len),
  )

df.summary
## # A tibble: 3 × 4
##   dose     sd stderr   len
##   <fct> <dbl>  <dbl> <dbl>
## 1 0.5    4.50  1.01   10.6
## 2 1      4.42  0.987  19.7
## 3 2      3.77  0.844  26.1

Lets now look at some key functions

  • geom_corssbar() for hollow bars with middle indicated by a horizontal line
  • geom_errorbar() for error bars
  • geom_errorbarh() for horizontal error bars
  • geom_linerange() for drawing an interval represted by a verticle line
  • geom_pointrange() for creating an interval represented by a vertcal line: with a point in the middle

lets start by creating a ggplot object

tg <- ggplot(
  df.summary, 
  aes(x = dose, y = len, ymin = len - sd, ymax = len + sd)
)

Now lets look at the most basic error bars

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2) +
  geom_point(size = 1.5)

Now lets create horizontal error bars by manipuating our graph

ggplot(df.summary, aes(x=len, y=dose, xmin = len-sd, xmax = len+sd)) +
  geom_point() +
  geom_errorbarh(height = 0.2)

This just gives you an idea of error bars on the horizontasl axis

Now lets look at adding jitter points (actual measurements) to our data.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgray") +
  geom_pointrange(aes(ymin = len-sd, ymax = len+sd), data = df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgray", trim = FALSE) +
  geom_pointrange(aes(ymin = len - sd, ymax = len+sd), data = df.summary)

Now how about with a line graph?

ggplot(df.summary, aes(dose, len)) +
  geom_line(aes(group = 1)) + # always specific this when you have 1 line
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr), width = 0.2) +
  geom_point(size = 2)

Now lets make a bar graph with halve error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgrey", color = "black") +
  geom_errorbar(aes(ymin = len, ymax = len+stderr), width = 0.2)

You can see that by not specifying wmin = len-stderr, we have in essence cut our error bar inn half.

How about we add jitter points to the plots? we need to use the origional dataframe for the jitter plot, and the summary df for the geom layers.

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgrey") +
  geom_line(aes(group = 1), data = df.summary) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr),
    data = df.summary, width = 0.2) +
  geom_point(data = df.summary, size = 0.2)

what about adding jitterpoints to a barplot?

ggplot(df, aes(dose, len)) +
  geom_col(data = df.summary, filll = NA, color = "black") +
  geom_jitter(position = position_jitter(0.3), color = "darkgrey") +
  geom_errorbar(aes(ymin = len - stderr, ymax = len+stderr),
                data = df.summary, width = 0.2)
## Warning in geom_col(data = df.summary, filll = NA, color = "black"): Ignoring
## unknown parameters: `filll`

what is we wanted to have our error bars per group? (OJ vs VC)

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  summarise(
    sd = sd(len),
    stderr = std.error(len),
    len = mean(len)
  )
## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.
df.summary2
## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

ggplot(df.summary2, aes(dose, len)) +
  geom_pointrange(
    aes(ymin = len - stderr, ymax = len + stderr, color = supp),
    position = position_dodge(0.3)) +
  scale_color_manual(values = c("indianred", "lightblue"))

Now about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len)) +
  geom_line(aes(linetype = supp, group = supp)) +
  geom_point() +
  geom_errorbar(aes(ymin = len-stderr, ymax = len+stderr, group = supp), width = 0.2)

And the same with a bar plot

ggplot(df.summary2, aes(dose, len )) +
  geom_col(aes(fill = supp), position = position_dodge(0.8), width= 0.7)+
    geom_errorbar(
      aes(ymin = len-sd, ymax = len+sd, group = supp), 
      width = 0.2, position = position_dodge(0.8)) +
    scale_fill_manual(values = c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color = supp)) +
  geom_jitter(position = position_dodge(0.2)) +
  geom_line(aes(group = supp), data = df.summary2) +
  geom_point() +
  geom_errorbar(aes(ymin = len - stderr, ymax = len +stderr, group = supp), data = df.summary2, width = 0.2)

ggplot(df, aes(dose, len, color = supp)) +
  geom_col(data = df.summary2, position = position_dodge(0.8), width = 0.7, fill = "white") +
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)) +
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len+stderr), data = df.summary2,
    width = 0.2, position = position_dodge(0.8)) +
  scale_color_manual(values = c("indianred", "lightblue")) +
  theme(legend.position = "top")

ECDF Plots

Now lets do an empirical cumulative distribution function. This reports any given number percentile of individuals that are above or below that threshold.

set.seed(1234)

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 55)), rnorm(200, 58))

Now lets look at our dataframe

head(wdata, 5)
##   sex   weight rnorm.200..58.
## 1   F 53.79293       58.48523
## 2   F 55.27743       58.69677
## 3   F 56.08444       58.18551
## 4   F 52.65430       58.70073
## 5   F 55.42912       58.31168

Now lets load our plotting package

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "bottom")
)

Now lets create our ECDF plot

ggplot(wdata, aes(x=weight)) +
  stat_ecdf(aes(color = sex, linetype = sex),
            geom = "step", size = 1.5) +
  scale_color_manual(values = c("#00AFBB", "#E7B900")) +
  labs(y = "weight") 

QQ Plots

Now lets take a look at qq plots. These are used to determine if the given data follows a normal distribution.

#install.packages("ggpubr")

set.seed(1234)

Now lets randomly generate some data

wdata = data.frame(
  sex = factor(rep(c("F", "M"), each = 200)), 
  weight = c(rnorm(200, 55), rnorm(200, 58))
)

Lets set our theme for the graphing with ggplot

library(ggplot2)

theme_set(
  theme_classic() +
    theme(legend.position = "top")
)

create a qq plot of the weight

ggplot(wdata, aes(sample=weight)) +
  stat_qq(aes(color = sex)) +
  scale_color_manual(values = c("#0073c2FF", "#FC4E07")) +
  labs(y = "weight")

#install.packages(ggpubr)
#library(ggpubr)

#ggqqplot(wdata, x = "weight", 
        # color = "sex", palettes = c("#0073c2FF", "#FC4E07"),
         #ggtheme = theme_pubclean())

Now what would a non-normal distribution look like?

#install.packages(mnonr)

library(mnonr)

data2 <- mnonr::mnonr(n = 1000, p=2, ms = 3, mk = 61, Sigma=matrix(c(1,0.5, 0.5, 1), 2, 2), initial = NULL)

data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample=V1)) +
  stat_qq()

#ggqqplot(data2, x = "V1",
         #palette = "#0073c2FF",
         #ggtheme = theme_pubclean())

Facet Plots

Lets look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
  theme_bw() +
    theme(legend.position = "top")
)

First lets create a nice boxplot

lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x=dose, y = len)) +
  geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))
p

Now lets look at the gvplot facit function

p + facet_grid(rows = vars(supp))

Now lets do a facet with multiple variables

p + facet_grid(rows = vars(dose), cols = vars(supp))

p

Now lets look at the facet_wrap function. This allows facets to be placed side-by-side

 p + facet_wrap(vars(dose), ncol = 2)

Now how do we combine multiple plots using ggarrange()

Lets start by making some basic plots. First we will define a color palette and data

my3cols <- c("#e7B800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = my3cols)

ok now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
                       binaxis = 'y', stackdir = 'center') +
  scale_color_manual(values = my3cols) +
  scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert)) +
  geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp, dp, lp, labels = c("A", "B", "C"), ncol = 2, nrow = 2)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
figure

This looks great, but we can make it look even better

figure2 <- ggarrange(
  lp,
  ggarrange(bxp, dp, ncol=2, nlabs = c ("B", "C")),
  nrow = 2, 
  labels = "A")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## Warning in as_grob.default(plot): Cannot convert object of class character into
## a grob.
## Warning in as_grob.default(plot): Cannot convert object of class listggarrange
## into a grob.
figure2

ok this looks really good, but you’ll notice that there are two legends that are the same.

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly, we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")
## file saved to facetfigure.pdf

we can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

Lastly, we can export to pdf with multiple pages and multiple columns

ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol = 1)
## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

Heatmaps

Lets get started with heatmaps

install.packages("heatmap3")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(heatmap3)

Now lets get our data.

data <- ldeaths

data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, colv = NA)
## Warning in plot.window(...): "colv" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "colv" is not a graphical parameter
## Warning in title(...): "colv" is not a graphical parameter

Now lets play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start = 0, end = 0.3)

Now lets apply our color sections

heatmap(data2, colsidecolors = cc)
## Warning in plot.window(...): "colsidecolors" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "colsidecolors" is not a graphical parameter
## Warning in title(...): "colsidecolors" is not a graphical parameter

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
        col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

Theres more that we can customize

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:plotrix':
## 
##     plotCI
## The following object is masked from 'package:stats':
## 
##     lowess
heatmap.2(data2, colsidecolors = cc,
          col = colorRampPalette(brewer.pal(8, "PiYG"))(25))
## Warning in plot.window(...): "colsidecolors" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "colsidecolors" is not a graphical parameter
## Warning in title(...): "colsidecolors" is not a graphical parameter

Missing Values

Missing values

if you encounter a unusual value in your dataset, and simply want to move on to the rest of your analysis, you have two options:

Drop the entire row with the strange values:

library(dplyr)
library(ggplot2)

diamonds <- diamonds

diamonds2 <- diamonds %>%
  filter(between(y, 3, 20))

In this instance, y is the width of the diamond, so anything under 3 mm or above 20 is excluded I don’t recommend this option, just beacuse there is one bad measurement doesn’t mean they are all bad instead, I recommend replacing the unusual values with missing values.

diamonds3 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscribes to the idea that missing values should’nt pass silently into the night.

ggplot(data = diamonds3, mapping = aes(x = x, y=y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to supress that warning you can use na.rm = TRUE

ggplot(data = diamonds3, mapping = aes(x =x, y= y)) +
  geom_point(na.rm = TRUE)

Other times you want to understand what makes observations with missing values different to the observation with recordered vales. FOr example, in the NYCflights13 dataset, missing values in the dep_time. variable indicate that the flight was cancelled. So you might want to compare the scheduled departure times for cancelled and non-cancelled times.

library(nycflights13)

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100, 
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour +sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) +
  geom_freqpoly(mapping = aes(color = cancelled), bindwidth= 1/4)
## Warning in geom_freqpoly(mapping = aes(color = cancelled), bindwidth = 1/4):
## Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Identifying Outliers

what if we wnat to know what our outliers are?

First we need to load the required libraries

library(outliers)
library(ggplot2)
library(readxl)

And reload the dataset because we removed outliers

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Lets create a function using the grubb test to identify all outliers.

grubbs.flag <- function(x) {
  outliers <- NULL
  test <- x 
  grubbs.result <- grubbs.test(test)
  pv <- grubbs.result$p.value
  while(pv < 0.05) {
    outliers <- c(outliers, as.numeric(strsplit(grubbs.result$alternative, "")[[1]][3]))
    test <- x[!x %in% outliers]
    grubbs.result <- grubbs.test(test)
    pv <- grubbs.result$p.value
  }
  return(data.frame(X=x, outliers = (x %in% outliers)))
}
#identified_outliers <- grubbs.flag(Air_data$AH)

Now we can create a histogram showing where the outliers were

#ggplot(grubbs.flag(Air_data$AH), aes(x=Air_data$AH, color = outliers, fill = outliers)) +
  #geom_histogram(bindwidth = diff(range(Air_data$AH)))+
  #theme_bw()

Covariation

library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Its hard to see the difference in distribution because the counts differe so much.

ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))

to make the comparison easier, we need to swap the display on the y-axis. Instead of displaying count, we’ll display density, which is the count standarized so that the area under the curve is one.

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)
## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that fiar diamonds (the lowest cut quality) have the highest average price. But maybe that because frequency polygons are a little hard to intercept.

Another alternative is the boxplot. A boxplot is a type of visual shorthand for a distribution of values.

ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, se we can more easily compare them. It supports the conterintuitive finding that better quality diamonds are cheaper on average!

lets look at some car data

ggplot(data = mpg, mapping = aes(x =class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can switch the axis and flip it to 90 degrees.

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  coord_flip()

To visualize the correlation between to continous variables, we can use a scatter plot.

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price))

Scatterplots become less useful as the size of your dataset grows, because we get overplot. we can fix this using the alpha aesthetic

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

Exploratory Data Analysis

library(RCurl)
## 
## Attaching package: 'RCurl'
## The following object is masked from 'package:tidyr':
## 
##     complete
library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
college_Data <- read.csv(site)

First lets use the str function, this shows the structure of the object

str(college_Data)
## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabedically by college?

alphabetical <- college_Data %>%
  arrange(college_Data$college)

The glimpse package is another way to preview data

glimpse(college_Data)
## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

we can aslo subset with select()

college_cases <- select(college_Data, college, cases)

we can also filter or subset with the filter function

Louisiana_cases <- filter(college_Data, state =="Louisiana")

Lets filter out a smaller amount of states

south_case <- filter(college_Data, state =="Louisiana" | state =="Texas" | state =="Arkansas" | state =="Mississippi")

Lets look at some time series data

First look at some time series data

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(dplyr)
library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
state_Data <- read.csv(state_site)

Lets create group_by object using the state column

state_cases <- group_by(state_Data, state)

class(state_cases)
## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting

Days_since_first_reported <- tally(state_cases)

Lets visualize some data

First lets start off with some definitions

Data- obvious - the stuff we want to visualize

Layer- made of gemetric elements and requisite statistical information. Include geometric objects which represent the plot

Scales = used to map values in the data space that is used for creation of values( color, size, shape, ect)

Coordinate system - describes how the data coorinates are mapped together in relation to the plan on the graphic

Faceting - how to break up data in to subsets to display multiple types or groups of data

theme - controls the finer points of the display, such as font size and background color

options(rep.plot.width = 6, repr.plot.height = 6)

class(college_Data)
## [1] "data.frame"
head(college_Data)
##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA
summary(college_Data)
##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
## 

Now lets take a look at a different dataset

iris <- as.data.frame(iris)

class(iris)
## [1] "data.frame"
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Lets start by creating a scatter plot of the college data

ggplot(data = college_Data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

lets color coordinate our college data

ggplot(data = college_Data, aes(x = cases, y = cases_2021, color = state)) +
  geom_point()
## Warning: Removed 337 rows containing missing values (`geom_point()`).

  theme_minimal()
## List of 97
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                     : NULL
##  $ aspect.ratio              : NULL
##  $ axis.title                : NULL
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom       : NULL
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left         : NULL
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom        : NULL
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left          : NULL
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.ticks.x              : NULL
##  $ axis.ticks.x.top          : NULL
##  $ axis.ticks.x.bottom       : NULL
##  $ axis.ticks.y              : NULL
##  $ axis.ticks.y.left         : NULL
##  $ axis.ticks.y.right        : NULL
##  $ axis.ticks.length         : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.x.top           : NULL
##  $ axis.line.x.bottom        : NULL
##  $ axis.line.y               : NULL
##  $ axis.line.y.left          : NULL
##  $ axis.line.y.right         : NULL
##  $ legend.background         : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing            : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.key.size           : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "right"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.just           : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ panel.background          : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.border              : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.spacing             : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "grey92"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major          : NULL
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ linewidth    : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major.x        : NULL
##  $ panel.grid.major.y        : NULL
##  $ panel.grid.minor.x        : NULL
##  $ panel.grid.minor.y        : NULL
##  $ panel.ontop               : logi FALSE
##  $ plot.background           : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title.position       : chr "panel"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption.position     : chr "panel"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ strip.background          : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ strip.background.x        : NULL
##  $ strip.background.y        : NULL
##  $ strip.clip                : chr "inherit"
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.x.bottom       : NULL
##  $ strip.text.x.top          : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.left         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.right        : NULL
##  $ strip.switch.pad.grid     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.switch.pad.wrap     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

Lets run a simple histogram of our Louisiana case data

 hist(Louisiana_cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "Total cases", ylab = "Frequency",
      main = "Total College Covid-19 Infections (Louisiana)")

hist(iris$Sepal.Width, freq = NULL, density = NULL, breaks = 10, xlab = "Sepal width",
     ylab = "Frequency", main = "Iris Sepal width")

histogram_college <- ggplot(data = Louisiana_cases, aes(x = cases))

histogram_college + geom_histogram(bindwidth = 100, color = "black", aes(fill = county))+
  xlab("cases") +ylab("Frequency") + ggtitle("Histogram of Covid 19 cases in Louisiana")
## Warning in geom_histogram(bindwidth = 100, color = "black", aes(fill =
## county)): Ignoring unknown parameters: `bindwidth`
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Lets create a ggplot for the IRIS data

Maybe a density plat makes more sense for our college data

ggplot(south_case) + 
  geom_density(aes(x = cases, fill = state), alpha = 0.25)

Lets look at violin plots for iris

ggplot(data = iris, aes(x = Species, y =Sepal.Length, color = Species)) +
  geom_violin() +
  theme_classic()+
  theme(legend.position = "none")

Now lets try the south data

ggplot(data = south_case, aes(x=state, y = cases, color= state)) +
  geom_violin() +
  theme_gray() +
  theme(legend.position = "none")

Now lets take a look at risidual plots.

Lets start with the iris data

ggplot(lm(Sepal.Length ~ Sepal.Width, data = iris)) +
  geom_point(aes(x=.fitted, y = .resid))

Now look at the southern state cases

ggplot(lm(cases ~ cases_2021, data = south_case)) +
  geom_point(aes(x=.fitted, y = .resid))

A linear model is not a good call for the state cases

Now lets do some correlations

obesity <- read.csv("Obesity_insurance.csv")
library(tidyr)
library(dplyr)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

Lets look at the structure of the dataset

str(obesity)
## 'data.frame':    1338 obs. of  7 variables:
##  $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex     : chr  "female" "male" "male" "male" ...
##  $ bmi     : num  27.9 33.8 33 22.7 28.9 ...
##  $ children: int  0 1 3 0 0 0 1 3 2 0 ...
##  $ smoker  : chr  "yes" "no" "no" "no" ...
##  $ region  : chr  "southwest" "southeast" "southeast" "northwest" ...
##  $ charges : num  16885 1726 4449 21984 3867 ...

Lets look at the column classes

class(obesity)
## [1] "data.frame"

And lets get a summary of distribution of the variables

summary(obesity)
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region             charges     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770

Now lets look at the distribution for insurance charges

hist(obesity$charges)

We can also get an idea of the distribution using a boxplot

boxplot(obesity$charges)

boxplot(obesity$bmi)

Now lets look at correlations.

cor(obesity$charges, obesity$bmi)
## [1] 0.198341

This test uses a spearman Rho correlation, or you can use Kendall’s tau by specifying it

cor(obesity$charges, obesity$bmi, method = 'kendall')
## [1] 0.08252397

This correlation measures strength of correlation between -1 and 1.

Now lets look at the Tiethen=Moore test.

TietjenMoore <- function(dataSeries, k)
{
  n = length(dataSeries)
  r = abs(dataSeries - mean(dataSeries))
  df = data.frame(dataSeries,r)
  dfs = df[order(df$r),]
  klarge = c((n-k+1):n)
  subdataSeries = dfs$dataSeries[-klarge]
  ksub = (subdataSeries = mean(subdataSeries)) **2
  all = (df$dataSeries - mean(df$dataSeries)) **2
  sum(ksub)/sum(all)
  
 }

This function helps to compute the absolute resudiauls and sorts data according to the size of the residuals. Later, we will focus on the computation of sum of squares.

FindOutliersTietjenMooreTest <- function(dataSeries, k, alpha = 0.5){
  ek <- TietjenMoore(dataSeries, k)
  test = c(1:10000)
  for (i in 1:10000){
    dataSeriesdataSeries = rnorm(length(dataSeries))
    test[i] = TietjenMoore(dataSeriesdataSeries, k)}
  Talpha = quantile(test, alpha)
  list(T = ek, Talpha = Talpha)
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obesity dataset for evaluating this algorithm.

First we will look at charges

boxplot(obesity$charges)

FindOutliersTietjenMooreTest(obesity$charges, 4)
## $T
## [1] 0.0008787862
## 
## $Talpha
##          50% 
## 2.606997e-07

Lets check out bmi

boxplot(obesity$bmi)

FindOutliersTietjenMooreTest(obesity$bmi, 2)
## $T
## [1] 0.01886975
## 
## $Talpha
##         50% 
## 2.55951e-07

Probability Plots

library(ggplot2)
library(tigerstats)
## Loading required package: abd
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## Loading required package: lattice
## Loading required package: grid
## Loading required package: mosaic
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:plyr':
## 
##     count
## The following object is masked from 'package:scales':
## 
##     rescale
## The following object is masked from 'package:plotrix':
## 
##     rescale
## The following object is masked from 'package:plotly':
## 
##     do
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

we will use the probability plot function and their output dnorm: density function of the normal distribution.

The corresponding density is:

bmi.mean <- mean(obesity$bmi)
bmi.sd <- sd(obesity$bmi)

lets create a plot of our normal distribution

bmi.dist <- dnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x = bmi, y = Density)) + 
  geom_point()

This gives us the probability of every single point occuring

Now lets use the pnorm function for more info

bmi.dist <- pnorm(obesity$bmi, mean = bmi.mean, sd = bmi.sd)
bmi.df <- data.frame("bmi" = obesity$bmi, "Density" = bmi.dist)

ggplot(bmi.df, aes(x=bmi, y = Density)) +
  geom_point()

what if we want to find the probability of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x,, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}
pp_greater(40)
## [1] "6.29 %"

What about the probability that a bmi is less than 40 in our population?

pp_less <- function(x) {
  paste(round(100 *(1-pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)),2), "%")
}

pp_less(40)
## [1] "93.71 %"
pnormGC(40, region = "below", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.9371213

what if we want to find the are in between?

pnormGC(c(20, 40), region = "between", mean = 30.66339, sd = 6.09818, graph = TRUE)

## [1] 0.8969428

What if we want to know the quantiles? Lets use the qnorm function. we need to assume a normal distribution for this.

what bmi represents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)
## [1] 16.4769

What if you want a random sampling of values within your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)

hist(subset)

subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)

hist(subset2)

Shapiro-wilk Test

so now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution?

shapiro.test(obesity$charges[1:5])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:5]
## W = 0.84164, p-value = 0.1695

you can see here, with a small sample size, we would reject the null hypothesis that the sample came from a normal distribution.

shapiro.test(obesity$charges[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$charges[1:1000]
## W = 0.8119, p-value < 2.2e-16

Now lets check out age

shapiro.test(obesity$age[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$age[1:1000]
## W = 0.94406, p-value < 2.2e-16

And lastly bmi

shapiro.test(obesity$bmi[1:1000])
## 
##  Shapiro-Wilk normality test
## 
## data:  obesity$bmi[1:1000]
## W = 0.99471, p-value = 0.001426

Time series data

First lets load our packages

library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
library(readxl)

Air_data <- read_xlsx("AirQualityUCI.xlsx")

Date- date of measurement time - time of measurement NMGC - average hourly non- metallic hydrocarbon concentration NO2 - average hourly NO2 concentration T - temper RH - relative humidity AH- absolute humidty

str(Air_data)
## tibble [9,357 × 15] (S3: tbl_df/tbl/data.frame)
##  $ Date         : POSIXct[1:9357], format: "2004-03-10" "2004-03-10" ...
##  $ Time         : POSIXct[1:9357], format: "1899-12-31 18:00:00" "1899-12-31 19:00:00" ...
##  $ CO(GT)       : num [1:9357] 2.6 2 2.2 2.2 1.6 1.2 1.2 1 0.9 0.6 ...
##  $ PT08.S1(CO)  : num [1:9357] 1360 1292 1402 1376 1272 ...
##  $ NMHC(GT)     : num [1:9357] 150 112 88 80 51 38 31 31 24 19 ...
##  $ C6H6(GT)     : num [1:9357] 11.88 9.4 9 9.23 6.52 ...
##  $ PT08.S2(NMHC): num [1:9357] 1046 955 939 948 836 ...
##  $ NOx(GT)      : num [1:9357] 166 103 131 172 131 89 62 62 45 -200 ...
##  $ PT08.S3(NOx) : num [1:9357] 1056 1174 1140 1092 1205 ...
##  $ NO2(GT)      : num [1:9357] 113 92 114 122 116 96 77 76 60 -200 ...
##  $ PT08.S4(NO2) : num [1:9357] 1692 1559 1554 1584 1490 ...
##  $ PT08.S5(O3)  : num [1:9357] 1268 972 1074 1203 1110 ...
##  $ T            : num [1:9357] 13.6 13.3 11.9 11 11.2 ...
##  $ RH           : num [1:9357] 48.9 47.7 54 60 59.6 ...
##  $ AH           : num [1:9357] 0.758 0.725 0.75 0.787 0.789 ...
library(tidyr)
library(dplyr)
library(lubridate)
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
library(ggplot2)

Lets get rid of the date in the time column

Air_data$Time <- as_hms(Air_data$Time)

glimpse(Air_data)
## Rows: 9,357
## Columns: 15
## $ Date            <dttm> 2004-03-10, 2004-03-10, 2004-03-10, 2004-03-10, 2004-…
## $ Time            <time> 18:00:00, 19:00:00, 20:00:00, 21:00:00, 22:00:00, 23:…
## $ `CO(GT)`        <dbl> 2.6, 2.0, 2.2, 2.2, 1.6, 1.2, 1.2, 1.0, 0.9, 0.6, -200…
## $ `PT08.S1(CO)`   <dbl> 1360.00, 1292.25, 1402.00, 1375.50, 1272.25, 1197.00, …
## $ `NMHC(GT)`      <dbl> 150, 112, 88, 80, 51, 38, 31, 31, 24, 19, 14, 8, 16, 2…
## $ `C6H6(GT)`      <dbl> 11.881723, 9.397165, 8.997817, 9.228796, 6.518224, 4.7…
## $ `PT08.S2(NMHC)` <dbl> 1045.50, 954.75, 939.25, 948.25, 835.50, 750.25, 689.5…
## $ `NOx(GT)`       <dbl> 166, 103, 131, 172, 131, 89, 62, 62, 45, -200, 21, 16,…
## $ `PT08.S3(NOx)`  <dbl> 1056.25, 1173.75, 1140.00, 1092.00, 1205.00, 1336.50, …
## $ `NO2(GT)`       <dbl> 113, 92, 114, 122, 116, 96, 77, 76, 60, -200, 34, 28, …
## $ `PT08.S4(NO2)`  <dbl> 1692.00, 1558.75, 1554.50, 1583.75, 1490.00, 1393.00, …
## $ `PT08.S5(O3)`   <dbl> 1267.50, 972.25, 1074.00, 1203.25, 1110.00, 949.25, 73…
## $ T               <dbl> 13.600, 13.300, 11.900, 11.000, 11.150, 11.175, 11.325…
## $ RH              <dbl> 48.875, 47.700, 53.975, 60.000, 59.575, 59.175, 56.775…
## $ AH              <dbl> 0.7577538, 0.7254874, 0.7502391, 0.7867125, 0.7887942,…
plot(Air_data$AH, Air_data$RH, main = "Humidity Analysis", xlab = "absolute Humidity", ylab = "Relative Humidity")

Notice we have an outlier in our data

t.test(Air_data$RH, Air_data$AH)
## 
##  Welch Two Sample t-test
## 
## data:  Air_data$RH and Air_data$AH
## t = 69.62, df = 17471, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  45.01707 47.62536
## sample estimates:
## mean of x mean of y 
## 39.483611 -6.837604

Text Mining

First we’ll look at the unnest_token function

Lets start by looking at an Emily Dickenson passage

text <- c("Because I could not stop from Death = ",
          "He kindly stopped for me -", 
          "The Carriage held but just ourselves -",
          "and Immortality")

text
## [1] "Because I could not stop from Death = "
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just ourselves -"
## [4] "and Immortality"

This is a typical character vector that we might want to analyze.

library(dplyr)

text_df <- tibble(line = 1:4, text = text)

text_df
## # A tibble: 4 × 2
##    line text                                    
##   <int> <chr>                                   
## 1     1 "Because I could not stop from Death = "
## 2     2 "He kindly stopped for me -"            
## 3     3 "The Carriage held but just ourselves -"
## 4     4 "and Immortality"

Next we’ll use the ‘unest_tokens’ function.

First we have the output column name that will be created as the text is unnested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)
## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the janeaustenr package to analyze some Jane Austen texts, there are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  dplyr::mutate(linenumber = row_number(), 
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", 
                                                 ignore_case = TRUE)))) %>%
  ungroup()

original_books
## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows
library(tidytext)

tidy_books <- original_books %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function uses the tokenizers package to seperate each line of text in the origional dataframe into tokens.

the default tokenizing is for words, but ither options including characters, n- gram, sentences, lines, or paragraphs can be used.

data(stop_words)

tidy_books <- tidy_books %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

The stop words dataset in the tidytext package contains stop words from three lexicons. we can use them all together, as we have here, or filter() to only use one set of stop words if thats more appropriate for your analysis.

tidy_books %>%
  count(word, sort = TRUE)
## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot2. For example, we can create a visualization of the most common words.

library(ggplot2)

tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL, x = "word count")

The gutenbergr package

library(gutenbergr)

darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
## Warning: ! Could not download a book at
##   http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.

Lets break these into tokens

tidy_darwin <- darwin %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

Lets check out what the most common darwin words are.

tidy_darwin %>%
  count(word, sort = TRUE)
## # A tibble: 22,315 × 2
##    word        n
##    <chr>   <int>
##  1 male     1620
##  2 species  1452
##  3 males    1286
##  4 female   1173
##  5 birds    1124
##  6 sexes    1062
##  7 animals  1013
##  8 females  1003
##  9 sexual    749
## 10 vol       712
## # ℹ 22,305 more rows

Now lets look at some work from Thomas Hunt Morgan, who is credited with discovering chromosomes.

morgan <- gutenberg_download(c(57198, 57460, 63540), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")

Lets tokenize THM

tidy_morgan <- morgan %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`

what are THM’s most common words?

tidy_morgan %>%
  count(word, sort = TRUE)
## # A tibble: 13,855 × 2
##    word             n
##    <chr>        <int>
##  1 species        869
##  2 regeneration   814
##  3 piece          702
##  4 cut            669
##  5 male           668
##  6 forms          631
##  7 selection      604
##  8 cells          576
##  9 found          552
## 10 development    546
## # ℹ 13,845 more rows

Lastly lets look at Thomas Henry Huxley

 huxley <- gutenberg_download(c(2931, 2089, 2940, 52344), mirror = "http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg/")
tidy_huxley <- huxley %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
tidy_huxley %>%
  count(word, sort = TRUE)
## # A tibble: 16,090 × 2
##    word          n
##    <chr>     <int>
##  1 species     339
##  2 nature      331
##  3 time        287
##  4 life        286
##  5 existence   255
##  6 knowledge   238
##  7 animals     227
##  8 natural     223
##  9 animal      216
## 10 science     207
## # ℹ 16,080 more rows

Now lets calculate the frequency for each word for the works of Darwing, Morgan and Huxley by binding the frames together.

library(tidyr)

frequency <- bind_rows(mutate(tidy_morgan, author = "Thomas Hunt Morgan"),
                       mutate(tidy_darwin, author = "Charles Darwin"),
                       mutate(tidy_huxley, author = "Thomas henry Huxley")) %>%
  mutate(word = str_extract(word, "[a-z']+")) %>%
  count(author, word) %>%
  group_by(author) %>%
  mutate(proportion = n/ sum(n)) %>%
  select(-n) %>%
  pivot_wider(names_from = author, values_from = proportion) %>%
  pivot_longer('Thomas Hunt Morgan': 'Charles Darwin', names_to = "author", values_to = "proportions")

frequency
## # A tibble: 62,720 × 4
##    word      `Thomas henry Huxley` author             proportions
##    <chr>                     <dbl> <chr>                    <dbl>
##  1 a                    0.0000149  Thomas Hunt Morgan  0.000585  
##  2 a                    0.0000149  Charles Darwin      0.0000448 
##  3 ab                   0.0000171  Thomas Hunt Morgan  0.0000470 
##  4 ab                   0.0000171  Charles Darwin      0.00000427
##  5 abaiss              NA          Thomas Hunt Morgan NA         
##  6 abaiss              NA          Charles Darwin      0.00000427
##  7 abandon              0.00000214 Thomas Hunt Morgan  0.00000214
##  8 abandon              0.00000214 Charles Darwin      0.00000214
##  9 abandoned            0.00000427 Thomas Hunt Morgan  0.00000427
## 10 abandoned            0.00000427 Charles Darwin      0.00000214
## # ℹ 62,710 more rows

Now we need to change the table so that each author has its own row

frequency2 <- pivot_wider(frequency, names_from = author, values_from = proportions)

frequency2
## # A tibble: 31,360 × 4
##    word        `Thomas henry Huxley` `Thomas Hunt Morgan` `Charles Darwin`
##    <chr>                       <dbl>                <dbl>            <dbl>
##  1 a                      0.0000149            0.000585         0.0000448 
##  2 ab                     0.0000171            0.0000470        0.00000427
##  3 abaiss                NA                   NA                0.00000427
##  4 abandon                0.00000214           0.00000214       0.00000214
##  5 abandoned              0.00000427           0.00000427       0.00000214
##  6 abashed               NA                   NA                0.00000214
##  7 abatement              0.00000427          NA                0.00000214
##  8 abbot                  0.00000427          NA                0.00000214
##  9 abbott                NA                   NA                0.00000427
## 10 abbreviated           NA                   NA                0.00000854
## # ℹ 31,350 more rows

Now lets plot

library(scales)

ggplot(frequency2, aes(x = `Charles Darwin`, y = `Thomas Hunt Morgan`), color = abs(- 'Charles Darwin' - 'Thomas Hunt Morgan')) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), 
                       low = "darkslategray4", high = "gray75") +
   theme(legend.position="none") +
  labs(y = "Thomas Hunt Morgan", x = "Charles Darwin")
## Warning: Removed 24245 rows containing missing values (`geom_point()`).
## Warning: Removed 24246 rows containing missing values (`geom_text()`).

library(scales)

ggplot(frequency2, aes(x = `Charles Darwin`, y = `Thomas henry Huxley`), color = abs(- 'Charles Darwin' - 'Thomas henry Huxley')) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), 
                       low = "darkslategray4", high = "gray75") +
   theme(legend.position="none") +
  labs(y = "Thomas henry Huxley", x = "Charles Darwin")
## Warning: Removed 23092 rows containing missing values (`geom_point()`).
## Warning: Removed 23093 rows containing missing values (`geom_text()`).

library(scales)

ggplot(frequency2, aes(x = `Thomas Hunt Morgan`, y = `Thomas henry Huxley`), color = abs(- 'Thomas Hunt Morgan' - 'Thomas henry Huxley')) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(limits = c(0, 0.001), 
                       low = "darkslategray4", high = "gray75") +
   theme(legend.position="none") +
  labs(y = "Thomas henry Huxley", x = "Thomas Hunt Morgan")
## Warning: Removed 25463 rows containing missing values (`geom_point()`).
## Warning: Removed 25464 rows containing missing values (`geom_text()`).

Sentiment Analysis

The sentiments datasets

There are a variety of methods and dictionaries that exist for evaluating the opinion or emotion of the text.

AFFIN bing nrc

bring categorizes words in a binary fashion into positive or negative nrc categories into positive, negative, anger, anticipation, discust, fear, joy, sadness, suprise, and trust. AFFIN assigns a score between -5 and 5, with negative indicating negative sentiment, and 5 positive.

the function get_sentiments() allows us to get the specific sentiments lexicon with the measures for each one.

install.packages("textdata")
## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)
library(tidytext)
library(textdata)

afinn <- get_sentiments("afinn")

afinn
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

Lets look at bing

bing <- get_sentiments("bing")

bing
## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

and lastly nrc

nrc <- get_sentiments("nrc")

nrc
## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created either using crowdourcing or cloud computing/ai like Amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing.

Lets look at the words with a joy score from NRC

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  dplyr::mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)

tidy_books
## # A tibble: 629,573 × 4
##    gutenberg_id linenumber chapter word   
##           <int>      <int>   <int> <chr>  
##  1          944          1       0 the    
##  2          944          1       0 voyage 
##  3          944          1       0 of     
##  4          944          1       0 the    
##  5          944          1       0 beagle 
##  6          944          1       0 by     
##  7          944          2       0 charles
##  8          944          2       0 darwin 
##  9          944          8       0 about  
## 10          944          8       0 the    
## # ℹ 629,563 more rows

Lets add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books$book[tidy_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
tidy_books$book[tidy_books$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
tidy_books$book[tidy_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

tidy_books
## # A tibble: 629,573 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 629,563 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis. First lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
  filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)
## Joining with `by = join_by(word)`
## # A tibble: 277 × 2
##    word           n
##    <chr>      <int>
##  1 found        301
##  2 good         161
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          86
##  7 present       85
##  8 food          78
##  9 beautiful     61
## 10 elevation     60
## # ℹ 267 more rows

we can also examine how sentiment changes throughout a work.

library(tidyr)

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(book, index = linenumber %% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`

Now lets plot it

library(ggplot2)

ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of TVOTB.

library(tidyr)

voyage <- tidy_books %>%
  filter(book == "The Voyage of the Beagle")

voyage
## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows
affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  summarise(sentiment = sum(value)) %>%
  mutate(method = "AFINN")
## Joining with `by = join_by(word)`
bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>%
    mutate(method = "Bing et al."), 
  voyage %>%
    inner_join(get_sentiments("nrc")) %>%
                 filter(sentiment %in% c("positive", "negative"))
              ) %>%
    mutate(method = "NRC") %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
              values_from = n,
              values_fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`
## Warning in inner_join(., get_sentiments("nrc")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 79 of `x` matches multiple rows in `y`.
## ℹ Row 10386 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

We can now estimate the net sentiment (positive - negative) in each chunk of the novel text for each lexion (dictionary). Lets bind them all together and visualize with ggplot

bind_rows(affin, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")
## Warning: Removed 1 rows containing missing values (`position_stack()`).

Lets look at the counts based on each dictionary

get_sentiments("nrc") %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308
get_sentiments("bing") %>%
  count(sentiment)
## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005
bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining with `by = join_by(word)`
bing_word_counts
## # A tibble: 2,421 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive    964
##  2 like       positive    705
##  3 well       positive    701
##  4 good       positive    384
##  5 doubt      negative    306
##  6 respect    positive    266
##  7 wild       negative    266
##  8 bright     positive    252
##  9 remarkable positive    247
## 10 savages    negative    217
## # ℹ 2,411 more rows

This can be shown visually, and we can pip straight into ggplot2

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scale = "free_y") +
  labs(x = "Contribution to Sentiment", y = NULL)

Lets spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(words = c("wild", "dark", "great", "like"), lexicon = c("custom")), stop_words)

custom_stop_words
## # A tibble: 1,153 × 3
##    words lexicon word     
##    <chr> <chr>   <chr>    
##  1 wild  custom  <NA>     
##  2 dark  custom  <NA>     
##  3 great custom  <NA>     
##  4 like  custom  <NA>     
##  5 <NA>  SMART   a        
##  6 <NA>  SMART   a's      
##  7 <NA>  SMART   able     
##  8 <NA>  SMART   about    
##  9 <NA>  SMART   above    
## 10 <NA>  SMART   according
## # ℹ 1,143 more rows

Word Clouds!

we can see that tidy text mining and sentiment analysis works well with ggplot2, but having our data in tidy format leads to other nice graphing techniques

lets use the wordcloud package!!

library(wordcloud)
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
## 
##     textplot
tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))
## Joining with `by = join_by(word)`
## Warning in wordcloud(word, n, max.words = 100): female could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(word, n, max.words = 100): birds could not be fit on page.
## It will not be plotted.

lets also look at comparison.cloud(), which may require turing the dataframe into a matrix.

we can change to matrix using the acast() function.

library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tigerstats':
## 
##     tips
## The following object is masked from 'package:tidyr':
## 
##     smiths
tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)
## Joining with `by = join_by(word)`

Looking at units beyond words

#bingnegative <- get_sentiments("bing") %>%
  #filter(sentiment == "negative")

#wordcounts <- tidy_books %>%
 # group_by(book, chapter) %>%
 # summarize(words = n())

#tidy_books %>%
 # semi_join(bingnegative) %>%
  #group_by(book, chapter) %>%
  #summarize(negativewords = n()) %>%
  #left_join(wordcounts, by = c("book", "chapter")) %>%
  #mutate(ratio = negativewords/words) %>%
 # filter(chapter !=0) %>%
 # slice_max(ratio, n = 1) %>%
 # ungroup()

N-Grams

So far we’ve only looked at single words, but many intresting analyses are based on the relationship between words.

Lets look at some methods of tidytext for calculating and visualizing word relationships.

library(dplyr)
library(tidytext)

darwin_books <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(darwin_books)[1] <- "book"

darwin_books$book[darwin_books$book == 944] <- "The Voyage of the Beagle"
darwin_books$book[darwin_books$book == 1227] <- "The Expression of the Emotions in Man and Animals"
darwin_books$book[darwin_books$book == 1228] <- "On the Orgin of Species By Means of Natural Selection"
darwin_books$book[darwin_books$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

darwin_bigrams
## # A tibble: 580,636 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle the voyage    
##  2 The Voyage of the Beagle voyage of     
##  3 The Voyage of the Beagle of the        
##  4 The Voyage of the Beagle the beagle    
##  5 The Voyage of the Beagle beagle by     
##  6 The Voyage of the Beagle charles darwin
##  7 The Voyage of the Beagle <NA>          
##  8 The Voyage of the Beagle <NA>          
##  9 The Voyage of the Beagle <NA>          
## 10 The Voyage of the Beagle <NA>          
## # ℹ 580,626 more rows

Counting and filtering n-gram

darwin_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 206,416 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the    9359
##  2 <NA>      7283
##  3 in the    4214
##  4 on the    3465
##  5 to the    2297
##  6 it is     1527
##  7 that the  1523
##  8 and the   1356
##  9 the same  1348
## 10 of a      1345
## # ℹ 206,406 more rows
library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) 

New bigram counts

bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_counts
## # A tibble: 79,272 × 2
##    book                     bigram        
##    <chr>                    <chr>         
##  1 The Voyage of the Beagle charles darwin
##  2 The Voyage of the Beagle NA NA         
##  3 The Voyage of the Beagle NA NA         
##  4 The Voyage of the Beagle NA NA         
##  5 The Voyage of the Beagle NA NA         
##  6 The Voyage of the Beagle NA NA         
##  7 The Voyage of the Beagle online edition
##  8 The Voyage of the Beagle NA NA         
##  9 The Voyage of the Beagle degree symbol 
## 10 The Voyage of the Beagle degs italics  
## # ℹ 79,262 more rows

We may also be interested in trigrams, which are three word combos

darwin_books %>%
  unnest_tokens(trigram, text, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 17,771 × 4
##    word1         word2  word3           n
##    <chr>         <chr>  <chr>       <int>
##  1 <NA>          <NA>   <NA>         7965
##  2 tierra        del    fuego          84
##  3 secondary     sexual characters     79
##  4 captain       fitz   roy            45
##  5 de            la     physionomie    30
##  6 domestication vol    ii             26
##  7 vol           ii     pp             22
##  8 vertebrates   vol    iii            21
##  9 proc          zoolog soc            18
## 10 proc          zool   soc            17
## # ℹ 17,761 more rows
darwin_books
## # A tibble: 62,882 × 2
##    book                     text                                                
##    <chr>                    <chr>                                               
##  1 The Voyage of the Beagle "  THE VOYAGE OF THE BEAGLE BY"                     
##  2 The Voyage of the Beagle "  CHARLES DARWIN"                                  
##  3 The Voyage of the Beagle ""                                                  
##  4 The Voyage of the Beagle ""                                                  
##  5 The Voyage of the Beagle ""                                                  
##  6 The Voyage of the Beagle ""                                                  
##  7 The Voyage of the Beagle ""                                                  
##  8 The Voyage of the Beagle "About the online edition."                         
##  9 The Voyage of the Beagle ""                                                  
## 10 The Voyage of the Beagle "The degree symbol is represented as \"degs.\" Ital…
## # ℹ 62,872 more rows

Lets analyze some bigrams

bigrams_filtered %>%
  filter(word2 == "selection") %>%
  count(book, word1, sort = TRUE)
## # A tibble: 24 × 3
##    book                                                 word1           n
##    <chr>                                                <chr>       <int>
##  1 The Descent of Man, and Selection in Relation to Sex sexual        254
##  2 The Descent of Man, and Selection in Relation to Sex natural       156
##  3 The Descent of Man, and Selection in Relation to Sex unconscious     6
##  4 The Descent of Man, and Selection in Relation to Sex continued       5
##  5 The Expression of the Emotions in Man and Animals    natural         4
##  6 The Descent of Man, and Selection in Relation to Sex artificial      2
##  7 The Descent of Man, and Selection in Relation to Sex ordinary        2
##  8 The Descent of Man, and Selection in Relation to Sex careful         1
##  9 The Descent of Man, and Selection in Relation to Sex consequent      1
## 10 The Descent of Man, and Selection in Relation to Sex la              1
## # ℹ 14 more rows

lets again look at tf-idf across bigrams across Darwins works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 51,213 × 6
##    book                                       bigram     n      tf   idf  tf_idf
##    <chr>                                      <chr>  <int>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and… nerve…    47 0.00350 1.10  0.00384
##  2 The Descent of Man, and Selection in Rela… sexua…   138 0.00315 1.10  0.00346
##  3 The Expression of the Emotions in Man and… la ph…    35 0.00260 1.10  0.00286
##  4 The Voyage of the Beagle                   bueno…    54 0.00245 1.10  0.00269
##  5 The Voyage of the Beagle                   capta…    53 0.00240 1.10  0.00264
##  6 The Descent of Man, and Selection in Rela… secon…   101 0.00231 1.10  0.00254
##  7 The Voyage of the Beagle                   fitz …    50 0.00227 1.10  0.00249
##  8 The Expression of the Emotions in Man and… muscl…    30 0.00223 1.10  0.00245
##  9 The Expression of the Emotions in Man and… orbic…    29 0.00216 1.10  0.00237
## 10 The Descent of Man, and Selection in Rela… sexua…   254 0.00580 0.405 0.00235
## # ℹ 51,203 more rows
bigram_tf_idf %>%
  arrange(desc(tf_idf)) %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  mutate(bigram = reorder(bigram, tf_idf)) %>%
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf of bigrams", y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)
## # A tibble: 740 × 3
##    word1 word2      n
##    <chr> <chr>  <int>
##  1 not   be        90
##  2 not   only      89
##  3 not   have      80
##  4 not   a         78
##  5 not   to        74
##  6 not   the       63
##  7 not   at        56
##  8 not   been      54
##  9 not   appear    51
## 10 not   know      47
## # ℹ 730 more rows
AFINN <- get_sentiments("afinn")

AFINN
## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

we can examine the most frequent words that were preceded by “not”, and associate with sentiment.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
## # A tibble: 97 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 like          2    10
##  2 doubt        -1     9
##  3 wish          1     7
##  4 admit        -1     5
##  5 easy          1     5
##  6 pretend      -1     5
##  7 reach         1     5
##  8 difficult    -1     4
##  9 extend        1     4
## 10 help          2     4
## # ℹ 87 more rows

we can examine the most frequent words that were preceded by “not”, and associate with sentiment.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)

not_words
## # A tibble: 97 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 like          2    10
##  2 doubt        -1     9
##  3 wish          1     7
##  4 admit        -1     5
##  5 easy          1     5
##  6 pretend      -1     5
##  7 reach         1     5
##  8 difficult    -1     4
##  9 extend        1     4
## 10 help          2     4
## # ℹ 87 more rows

Lets visualize

library(ggplot2)

not_words %>%
  mutate(contribution = n * value) %>%
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  mutate(word2 = reorder(word2, contribution)) %>%
  ggplot(aes(n * value, word2, fill = n * value > 0 )) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value * number or occurences", y = "words preceded by \"not\"")

negation_words <- c("not", "no", "never", "non", "without")

negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)

negated_words
## # A tibble: 182 × 4
##    word1   word2   value     n
##    <chr>   <chr>   <dbl> <int>
##  1 no      doubt      -1   173
##  2 no      great       3    14
##  3 not     like        2    10
##  4 not     doubt      -1     9
##  5 not     wish        1     7
##  6 without doubt      -1     7
##  7 not     admit      -1     5
##  8 not     easy        1     5
##  9 not     pretend    -1     5
## 10 not     reach       1     5
## # ℹ 172 more rows

Lets visualize the negation words

negated_words %>%
  mutate(contribution = n * value,
         word2= reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n * value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("words preceded by negation term") +
  ylab("sentiment value * # of occurences") +
  coord_flip()

Visualize a network of bigrams with ggraph

library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:mosaic':
## 
##     compare
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## The following object is masked from 'package:plotly':
## 
##     groups
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
bigram_graph
## IGRAPH 70c22b6 DN-- 158 106 -- 
## + attr: name (v/c), n (e/n)
## + edges from 70c22b6 (vertex names):
##  [1] NA        ->NA          sexual    ->selection   vol       ->ii         
##  [4] natural   ->selection   lower     ->animals     sexual    ->differences
##  [7] breeding  ->season      secondary ->sexual      south     ->america    
## [10] sexual    ->characters  del       ->fuego       tierra    ->del        
## [13] vol       ->iii         de        ->la          bright    ->colours    
## [16] sexual    ->difference  distinct  ->species     tail      ->feathers   
## [19] closely   ->allied      vocal     ->organs      zoological->gardens    
## [22] natural   ->history     buenos    ->ayres       captain   ->fitz       
## + ... omitted several edges
library(ggraph)
set.seed(1234)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

we can also add directionality to this network

set.seed(1234)

a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label=name), vjust = 1, hjust = 1) +
  theme_void()

Word Frequencies

There are a lot of words that may not be important, these are the stop words.

One way to remedy this is to look at inverse document frequency words, which decreases the weight for commonly used words and increases the weight for words that are not used very much.

Term frequency in Darwins works

library(dplyr)
library(tidytext)

book_words <- gutenberg_download(c(944, 1227, 1228, 2300), mirror = "https://gutenberg.nabasny.com/")
## Warning: ! Could not download a book at
##   https://gutenberg.nabasny.com//1/2/2/1228/1228.zip.
## ℹ The book may have been archived.
## ℹ Alternatively, You may need to select a different mirror.
## → See https://www.gutenberg.org/MIRRORS.ALL for options.
colnames(book_words)[1] <- "book"

book_words$book[book_words$book == 944] <- "The Voyage of the Beagle"
book_words$book[book_words$book == 1227] <- "The Expression of the Emotions in Man and Animals"
book_words$book[book_words$book == 1228] <- "On the Origin of Species By Means of Natural Selection"
book_words$book[book_words$book == 2300] <- "The Descent of Man, and Selection in Relation to Sex"

Now lets disect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

book_words
## # A tibble: 35,579 × 3
##    book                                                 word      n
##    <chr>                                                <chr> <int>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490
##  2 The Voyage of the Beagle                             the   16930
##  3 The Descent of Man, and Selection in Relation to Sex of    16762
##  4 The Voyage of the Beagle                             of     9438
##  5 The Descent of Man, and Selection in Relation to Sex in     8882
##  6 The Expression of the Emotions in Man and Animals    the    8045
##  7 The Descent of Man, and Selection in Relation to Sex and    7854
##  8 The Descent of Man, and Selection in Relation to Sex to     5901
##  9 The Voyage of the Beagle                             and    5768
## 10 The Voyage of the Beagle                             a      5328
## # ℹ 35,569 more rows
book_words$n <- as.numeric(book_words$n)

total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))

book_words
## # A tibble: 35,579 × 3
##    book                                                 word      n
##    <chr>                                                <chr> <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490
##  2 The Voyage of the Beagle                             the   16930
##  3 The Descent of Man, and Selection in Relation to Sex of    16762
##  4 The Voyage of the Beagle                             of     9438
##  5 The Descent of Man, and Selection in Relation to Sex in     8882
##  6 The Expression of the Emotions in Man and Animals    the    8045
##  7 The Descent of Man, and Selection in Relation to Sex and    7854
##  8 The Descent of Man, and Selection in Relation to Sex to     5901
##  9 The Voyage of the Beagle                             and    5768
## 10 The Voyage of the Beagle                             a      5328
## # ℹ 35,569 more rows
book_words <- cross_join(book_words, total_words)

book_words
## # A tibble: 35,579 × 4
##    book                                                 word      n  total
##    <chr>                                                <chr> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in Relation to Sex the   25490 629573
##  2 The Voyage of the Beagle                             the   16930 629573
##  3 The Descent of Man, and Selection in Relation to Sex of    16762 629573
##  4 The Voyage of the Beagle                             of     9438 629573
##  5 The Descent of Man, and Selection in Relation to Sex in     8882 629573
##  6 The Expression of the Emotions in Man and Animals    the    8045 629573
##  7 The Descent of Man, and Selection in Relation to Sex and    7854 629573
##  8 The Descent of Man, and Selection in Relation to Sex to     5901 629573
##  9 The Voyage of the Beagle                             and    5768 629573
## 10 The Voyage of the Beagle                             a      5328 629573
## # ℹ 35,569 more rows

you can see that the usual supspects are the most common words, but don’t tell us anything about what the books topic is.

library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 125 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 3 rows containing missing values (`geom_bar()`).

Ziphf’s Law

The frequency that a wordss appears is inversely proportional to its rank when predicting a topic.

Lets apply Ziphf’s law to Darwin’s work

freq_by_rank <- book_words %>%
  group_by(book) %>%
  dplyr::mutate(rank = row_number(),
         `term frequency` = n/total) %>%
  ungroup()

freq_by_rank
## # A tibble: 35,579 × 6
##    book                                word      n  total  rank `term frequency`
##    <chr>                               <chr> <dbl>  <dbl> <int>            <dbl>
##  1 The Descent of Man, and Selection … the   25490 629573     1          0.0405 
##  2 The Voyage of the Beagle            the   16930 629573     1          0.0269 
##  3 The Descent of Man, and Selection … of    16762 629573     2          0.0266 
##  4 The Voyage of the Beagle            of     9438 629573     2          0.0150 
##  5 The Descent of Man, and Selection … in     8882 629573     3          0.0141 
##  6 The Expression of the Emotions in … the    8045 629573     1          0.0128 
##  7 The Descent of Man, and Selection … and    7854 629573     4          0.0125 
##  8 The Descent of Man, and Selection … to     5901 629573     5          0.00937
##  9 The Voyage of the Beagle            and    5768 629573     3          0.00916
## 10 The Voyage of the Beagle            a      5328 629573     4          0.00846
## # ℹ 35,569 more rows
freq_by_rank %>%
  ggplot(aes(rank, `term frequency`, color = book)) +
  geom_line(size = 1.1, alpha = 0.8, show.legend = FALSE) +
  scale_x_log10() +
  scale_y_log10()

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf
## # A tibble: 35,579 × 7
##    book                                   word      n  total     tf   idf tf_idf
##    <chr>                                  <chr> <dbl>  <dbl>  <dbl> <dbl>  <dbl>
##  1 The Descent of Man, and Selection in … the   25490 629573 0.0820     0      0
##  2 The Voyage of the Beagle               the   16930 629573 0.0813     0      0
##  3 The Descent of Man, and Selection in … of    16762 629573 0.0539     0      0
##  4 The Voyage of the Beagle               of     9438 629573 0.0453     0      0
##  5 The Descent of Man, and Selection in … in     8882 629573 0.0286     0      0
##  6 The Expression of the Emotions in Man… the    8045 629573 0.0729     0      0
##  7 The Descent of Man, and Selection in … and    7854 629573 0.0253     0      0
##  8 The Descent of Man, and Selection in … to     5901 629573 0.0190     0      0
##  9 The Voyage of the Beagle               and    5768 629573 0.0277     0      0
## 10 The Voyage of the Beagle               a      5328 629573 0.0256     0      0
## # ℹ 35,569 more rows

Lets look at terms with height tf-idf in Darwin’s works

book_tf_idf %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 35,579 × 6
##    book                                        word      n      tf   idf  tf_idf
##    <chr>                                       <chr> <dbl>   <dbl> <dbl>   <dbl>
##  1 The Expression of the Emotions in Man and … tears   126 1.14e-3 1.10  1.25e-3
##  2 The Expression of the Emotions in Man and … blush   114 1.03e-3 1.10  1.13e-3
##  3 The Descent of Man, and Selection in Relat… sexu…   745 2.40e-3 0.405 9.71e-4
##  4 The Descent of Man, and Selection in Relat… sele…   621 2.00e-3 0.405 8.10e-4
##  5 The Voyage of the Beagle                    sea     351 1.69e-3 0.405 6.84e-4
##  6 The Voyage of the Beagle                    degs    117 5.62e-4 1.10  6.18e-4
##  7 The Expression of the Emotions in Man and … eyeb…   149 1.35e-3 0.405 5.47e-4
##  8 The Voyage of the Beagle                    isla…   271 1.30e-3 0.405 5.28e-4
##  9 The Descent of Man, and Selection in Relat… shewn   143 4.60e-4 1.10  5.05e-4
## 10 The Expression of the Emotions in Man and … frown    46 4.17e-4 1.10  4.58e-4
## # ℹ 35,569 more rows

Lets look at a visualization for these high tf_idf words

book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend= FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

Intro to Shiny

#library(shiny)

#annotation example

#ui <- fluidPage(
  #selectInput("dataset", label = "Dataset", choices = ls("packages:datasets")),
  #verbatimTextOutput("summary"), 
  #tableOutput("table")
#)

#server <- function(input, output,session) {
  #output$summary <- renderPrint({
    #dataset <- get(input$dataset, "package::datasets")
    #summary(dataset)
 # })
#}

#shinyApp(ui, server)

Shiny Example

#library(shiny)
#library(vroom)
#library(tidyverse)

#dir.create("neiss")

#download <- funtcion(name) {
  #url <- "https://raw.githubusercontent.com/hadley/mastering-shiny/main/neiss/"
  #download.file(paste0(url, name), paste0("neiss/", name), quiet = TRUE)
#}

#injuries <- vroom::vroom("neiss/injuries.tsv.gz")
#injuries

#products <- vroom::vroom("neiss/products.tsv")
#products

#population <- vroom::vroom("neiss/population.tsv")
#population

#prod_codes <- setNames(products$prod_code, products$title)

#injuries %>%
  #mutate(diag = fct_lump(fct_infreq(diag), n = 5)) %>%
  #group_by(diag) %>%
  #summarise(n = as.integer(sum(weight)))

#count_top <- function(df, var, n = 5) {
  #df %>%
   # mutate({{var }} := fct_lump(fct_infreq({{ var }}), n = n)) %>%
    #group_by({{ var }}) %>%
    #summarise(n = as.integer(sum(weight)))
#}

#ui <- fluidPage(
  #fluidRow(
    #column(8,
          # selectInput("code", "Product",
                     #  choices = setNames(products$prod_code, products$title),
                      # width = "100%"
                     #  )
           #),
    #column(2, selectInput("y", "Y axis", c("rate", "count")))
 # ),
  
  #fluidRow(
    #column(6,
           #selectInput("code", "Product", choices = prod_codes)
    #)
  #),
  #fluidRow(
   # column(4, tableOutput("diag")),
   # column(4, tableOutput("body_part")),
   # column(4, tableOutput("location"))
  #),
  #fluidRow(
    #column(2, actionButton("story", "Tell me a story")), 
    #column(10, textOutput("narrative"))
  #),
  #fluidRow(
    #column(12, plotOutput("age_sex"))
  #)
#)

#server <- function(input, output, session) {
  #selected <- reactive(injuries %>% filter(prod_code == input$code))
  
  #output$diag <- renderTable(count_top(selected(), diag), width = "100%")
  #output$body_part <- renderTable(count_top(selected(), body_part), width = "100%")
  #output$location <- renderTable(count_top(selected(), location), width = "100%")
  
#summary <- reactive({
  #selected() %>%
    #count(age, sex, wt = weight) %>%
    #left_join(population, by = c("age", "sex")) %>%
    #mutate(rate = n / population * 1e4)
#})

#output$age_sex <- renderPlot({
  #if (input$y == "count") {
    #summary() %>%
      #ggplot(aes(age, n, color = sex)) +
      #geom_line()+
      #labs(y = "Estimated number of injuries")
  #} else {
    #summary() %>%
      #ggplot(aes(age, rate, color = sex)) +
      #geom_line(na.rm = TRUE) +
      #labs(y = "Injuries per 10,000 people")
    #}
  #}, res = 96)

  #narrative_sample <- eventReactive(
    #list(input$story, selected()), 
    #selected() %>% pull(narrative) %>% sample(1)
  #)
  
  #output$narrative <- renderText(narrative_sample())

#}

#shinyApp(ui, server)

Shiny Sidebars

#ui <- fluidPage(
  #titlePanel("Central limit theorem"),
  #sidebarLayout(
    #sidebarPanel(
      #numericInput("m", "number of samples:", 2, min = 1, max = 100)
   # ),
    #mainPanel(
      #plotOutput("hist")
   # )
 # )
#)

#server <- function(input, output, session) {
  #output$hist <- renderPlot({
    #means <- replicate(1e4, mean(runif(input$m)))
    #hist(means, breaks = 20)
  #}, res = 96)
#}

#shinyApp(ui, server)

Shiny Multipage

#gglotex <- "yadayadayada"

#ui <- fluidPage(
  #sidebarLayout(
    #sidebarPanel(
      #textOutput("panel")
   # ),
    #mainPanel(
      #tabsetPanel(
        #id = "tabset",
        #tabPanel("panel 1", ggplotex),
        #tabPanel("panel 2", "two"),
        #tabPanel("panel 3", "three")
      #)
    #)
  #)
#)

#server <- function(input, output, session) {
  #output$panel <- renderText({
    #paste("Current panel: ", input$tabset)
  #})
#}

#shinyApp(ui, server)

Shiny Navbars

#ui <- fluidPage(
  #navlistPanel(
    #id = "tabset",
    #"Heading 1",
    #tabPanel("panel 1", "Panel one contents"),
    #"Heading 2",
    #tabPanel("panel 2", "Panel two contents"),
   # tabPanel("panel 3", "Panel three contents")
  #)
#)

#server <- function(input, output, session) {
  #output$panel <- renderText({
    #paste("Current panel: ", input$tabset)
  #})
#}

#shinyApp(ui, server)

Shiny NavBars Dropdown

#ui <- navbarPage(
  #"Page title", 
  #tabPanel("panel1", "one"), 
  #tabPanel("panel 2", "two"),
  #tabPanel("pannel 3", "three"),
  #navbarMenu("subpanels",
             #tabPanel("panel 4a", "four-a"),
             #tabPanel("panel 4b", "four-b"),
             #tabPanel("panel 4c", "four-c")
             #)
#)
#server <- function(input, output, session) {
  #output$panel <- renderText({
   # paste("Current panel:", input$tabset)
  #})
#}

#shinyApp(ui, server)

Shiny Thematic

#BiocManager::install("thematic", update = FALSE)

#library(ggplot2)
#library(thematic)

#ui <- fluidPage(
  #theme = bslib::bs_theme(bootswatch = "darkly"),
  #titlePanel("A themed plot"),
  #plotOutput("plot"),
  
#)

#server <- function(input, output, session) {
  #thematic::thematic_shiny()
  
  #output$plot <- renderPlot({
    #ggplot(mtcars, aes(wt, mpg)) +
      #geom_point() +
      #geom_smooth()
  #}, res = 96)
#}

#shinyApp(ui, server)

Shiny Brushing

#library(ggplot2)

#ui <- fluidPage(
 # plotOutput("plot", brush = "plot_brush"),
  #tableOutput("data")
#)

#server <- function(input, output, session) {
  #output$plot <- renderPlot({
    #ggplot(mtcars, aes(wt, mpg)) + geom_point()
  #}, res = 96)
  
  #output$data <- renderTable({
   # brushedPoints(mtcars, input$plot_brush)
  #})
#}

#shinyApp(ui, server)

#ui <- fluidPage(
 # plotOutput("plot", brush = "plot_brush", dblclick = "plot_reset")
#)
#server <- function(input, output, session) {
  #selected <- reactiveVal(rep(FALSE, nrow(mtcars)))
  
  #observeEvent(input$plot_brush, {
   # brushed <- brushedPoints(mtcars, input$plot-brush, allRows = TRUE$selected_)
   # selected(brushed | selected())
  #})
  #observeEvent(input$plot_reset, {
    #selected(rep(FALSE, nrow(mtcars)))
  #})
  
  #output$plot <- renderPlot({
    #mtcars$sel <- selected()
    #ggplot(mtcars, aes(wt, mpg)) +
     # geom_point(aes(colour = sel)) +
      #scale_colour_discrete(limits = c("TRUE", "FALSE"))
  #}, res = 96)
#}

#shinyApp(ui, server)

Shiny Images

#setwd("~/Desktop/Classroom/Myfiles")

#puppies <- tibble::tribble(
  #~breed, ~id, ~author,
  #"corgi", "eoqnr8ikwFE", "alvannee",
  #"labrador", "KCdYn0xu2fU", "shaneguymon",
  #"spaniel", "TzjMd7i5WQI", "_redo_"
#)

#ui <- fluidPage(
  #selectInput("id", "Pick a Breed", choices = setNames(puppies$id, puppies$breed)),
  #htmlOutput("source"),
  #imageOutput("photo")
#)

#server <- function(input, output, session) {
  #output$photo <- renderImage({
    #list(
     # src = file.path("puppy-photos", paste0(input$id, ".jpg")),
      #concentType = "image/jpeg",
      #width = 500,
      #height = 600,
    #)
  #}, deleteFile = FALSE)
  
  #output$source <- renderUI({
    #info <- puppies[puppies$id == input$id, , drop = FALSE]
    #HTML(glue::glue("<p>
     # <a href='https://unsplash.com/photos/{info$id}'>original</a> by
     # <a href='https://unsplash.com/@{info$author}'>{info$author}</a>
    #</p>"))
 # })
  
#}

#shinyApp(ui, server)

Shiny Interactive

#ui <- fluidPage(
 # plotOutput("plot", click = "plot_click"),
 # verbatimTextOutput("info")
#)

#server <- function(input, output) {
  #output$plot <- renderPlot({
   # plot(mtcars$wt, mtcars$mpg)
  #}, res = 96)
  
  #output$info <- renderPrint({
   # req(input$plot_click)
   # x <- round(input$plot_click$x, 2)
   # y <- round(input$plot_click$y, 2)
   # cat("[", x, ", ", y , "]", sep = "")
  #})
#}


#shinyApp(ui, server)

Shiny Responsive Graphs

#library(ggplot2)

#ui <- fluidPage(
  #selectInput("x", "x variable", choices = names(iris)),
  #selectInput("y", "y variable", choices = names(iris)),
  #selectInput("geom", "geom", c("print", "smooth", "jitter")),
  #plotOutput("plot")
#)

#server <- function(input, output, session) {
  #plot_geom <- reactive({
    #switch(input$geom, 
           #point = geom_point(),
           #smooth = geom_smooth(se = FALSE),
           #jitter = geom_jitter()
      #)
 # })
  
  #output$plot <- renderPlot({
    #ggplot(iris, aes(.data[[input$x]], .data[[input$y]])) +
      #plot_geom()
  #}, res = 96)
#}

#shinyApp(ui, server)

Shiny Select Boxes

#library(dplyr)

#setwd("H:/Data Products/Shiny/Examples/SelectBoxes")

#health <- vroom::vroom("us_county_sociohealth_data.csv", col_types = list(), na = "")
#health %>%
  #select(state, county, everything()) %>%
  #arrange(county)

#ui <- fluidPage(
  #selectInput("state", "state", choices = unique(health$state)),
  #selectInput("county", "county", choices = NULL),
  #tableOutput("data")
#)

#server <- function(input, output, session) {
  #state <- reactive({
   # filter(health, state == input$state)
  #})
  
  #observeEvent(state(), {
    #choices <- unique(state()$county)
    #updateSelectInput(inputId = "county", choices = choices)
  #})
  
  #county <- reactive({
    #req(input$county)
    #filter(state(), county == input$county)
  #})
  
  #output$data <- renderTable({
    #req(input$county)
    #county() %>%
      #filter(county == input$county) %>%
      #select(2:9)
  #})
#}

Twitter Sentiments

#setwd("~/Desktop/classroom/myfiles")

# Lets load the twitter api package

#install.packages('academictwitterR')

#library(academictwitteR)

#bearer_token <- "AAAAAAAAAAAAAAAAAAAAAByFhgEAAAAAl4QlOCyArwygclOFewUo5ARSaAG%3DvJpyDIOBKBVdtgnEv#7wrpdwoeI5wbQLbxfSYElUGZuwixOkbQm"

#tweets20us <- 
  #get_all_tweets(
    #"covid-19 has:geo",
    #"2020-06-01T01:00:00z",
    #"2020-06-08T01:00:00z",
    #bearer_token,
    #n = 5000,
    #country = "us"
    
  #)


#tweets20us <- 
  #get_all_tweets(
    #"covid-19 has:geo",
    #"2020-01-01T01:00:00z",
    #"2020-01-08T01:00:00z",
    #bearer_token,
    #n = 5000,
    #country = "us"
    
  #)


#tweets20us <- 
  #get_all_tweets(
    #"covid-19 has:geo",
    #"2020-01-01T01:00:00z",
    #"2020-01-08T01:00:00z",
    #bearer_token,
    #n = 5000,
    #country = "us"
    
  #)

#```{r}
#First thing lets load our data

#Lets load the Canada Data



#tweets20CA <- read.csv("tweets20CA.CSV", row.names = 1)
#tweets21CA <- read.csv("tweets21CA.CSV", row.names = 1)
#tweets22CA <- read.csv("tweets22CA.CSV", row.names = 1)

#```

#Lets load the US data
#```{r}

#tweets20US <- read.csv("tweets20US.CSV", row.names = 1)
#tweets21US <- read.csv("tweets21US.CSV", row.names = 1)
#tweets22US <- read.csv("tweets22US.CSV", row.names = 1)

#```

#Lets load the British Data
#```{r}

#tweets20GB <- read.csv("tweets20GB.CSV", row.names = 1)
#tweets21GB <- read.csv("tweets21GB.CSV", row.names = 1)
#tweets22GB <- read.csv("tweets22GB.CSV", row.names = 1)

#```

#Now lets load our required libraries

#```{r}

#library(lubridate)
#library(ggplot2)
#library(dplyr)
#library(readr)

#```

#Lets combine our US tweets 

#```{r}

#tweetsUS <- bind_rows(tweets21US %>%
                        #mutate(year = "y2021"),
                      #tweets22US %>%
                      #  mutate(year = "y2022"),
                      #tweets20US %>%
                      #  mutate(year = "year2020")) %>%
  #mutate(timestamp = ymd_hms(created_at))

#```

#Lets combine our GB tweets

#```{r}

#tweetsGB <- bind_rows(tweets21GB %>%
                     #   mutate(year = "y2021"),
                     # tweets22GB %>%
                     #   mutate(year = "y2022"),
                     # tweets20GB %>%
                     #   mutate(year = "year2020")) %>%
 # mutate(timestamp = ymd_hms(created_at))

#```

#Lets combine our Canada Tweets

#```{r}

#tweetsCA <- bind_rows(tweets21CA %>%
                     #   mutate(year = "y2021"),
                     # tweets22CA %>%
                     #   mutate(year = "y2022"),
                     # tweets20CA %>%
                     #   mutate(year = "year2020")) %>%
 # mutate(timestamp = ymd_hms(created_at))

#```


#Now lets clean our US tweets

#```{r}

#remove_reg <- "&amp;|&lt;|&gt;"
#tidy_tweetsUS <- tweetsUS %>%
 # filter(!str_detect(text, "^RT")) %>%
 # mutate(text = str_remove_all(text, remove_reg)) %>%
 # unnest_tokens(word, text, token = "tweets") %>%
  #filter(!word %in% stop_words$word,
         #!word %in% str_remove_all(stop_words$word, "'"),
        # str_detect(word, "[a-z]"))
#```

#Now lets clean our GB tweets

#```{r}

#remove_reg <- "&amp;|&lt;|&gt;"
#tidy_tweetsGB <- tweetsGB %>%
  #filter(!str_detect(text, "^RT")) %>%
  #mutate(text = str_remove_all(text, remove_reg)) %>%
  #unnest_tokens(word, text, token = "tweets") %>%
  #filter(!word %in% stop_words$word,
        # !word %in% str_remove_all(stop_words$word, "'"),
         #str_detect(word, "[a-z]"))
#```

#Now lets clean our Canada tweets

#```{r}

#remove_reg <- "&amp;|&lt;|&gt;"
#tidy_tweetsCA <- tweetsCA %>%
 # filter(!str_detect(text, "^RT")) %>%
  #mutate(text = str_remove_all(text, remove_reg)) %>%
  #unnest_tokens(word, text, token = "tweets") %>%
  #filter(!word %in% stop_words$word,
  #       !word %in% str_remove_all(stop_words$word, "'"),
   #      str_detect(word, "[a-z]"))
#```

#Lets take a look at our data

#```{r}

#ggplot(tweetUS, aes(x = timestamp, fill = year)) +
  #geom_histogram(position = "identity", bins = 20, show.legend = FALSE) + 
  #facet-wrap(~year, ncol = 1)

#```

#lets look at our GB data

#```{r}

#ggplot(tweetGB, aes(x = timestamp, fill = year)) +
 # geom_histogram(position = "identity", bins = 20, show.legend = FALSE) + 
  #facet-wrap(~year, ncol = 1)

#```

#lets look at our Canada tweets

#```{r}

#ggplot(tweetCA, aes(x = timestamp, fill = year)) +
  #geom_histogram(position = "identity", bins = 20, show.legend = FALSE) + 
 # facet-wrap(~year, ncol = 1)

#```

#lets look at the word frequencies for the US covid data
#```{r}

#frequencyUS <- tidy_tweetsUS %>%
  #count(year, word, sort = TRUE) %>%
  #left_join(tidy_tweetsUS %>%
              #count(year, name = "total")) %>%
  #mutate(freq = n/total)

#frequencyUS
#```


#Lets look at the Gb frequenices for the covid data
#```{r}

#frequencyGB <- tidy_tweetsGB %>%
  #count(year, word, sort = TRUE) %>%
  #left_join(tidy_tweetsGB %>%
              #count(year, name = "total")) %>%
  #mutate(freq = n/total)

#frequencyGB
#```

#Lets look at the CA frequencies for the covid data

#```{r}

#frequencyCA <- tidy_tweetsCA %>%
 # count(year, word, sort = TRUE) %>%
 # left_join(tidy_tweetsCA %>%
  #            count(year, name = "total")) %>%
 # mutate(freq = n/total)

#frequencyCA
#```


#frequencies for US
#```{r}

#library(tidyr)

#frequencyUS2 <- frequencyUS %>%
 # select(year, word, freq) %>%
 # pivot_wider(names_from = year, values_from = freq) %>%
 # arrange('y2020', 'y2021', 'y2022')

#head(frequencyUS2)

#```

#frequenices for GB
#```{r}

#library(tidyr)

#frequencyGB2 <- frequencyGB %>%
#  select(year, word, freq) %>%
#  pivot_wider(names_from = year, values_from = freq) %>%
#  arrange('y2020', 'y2021', 'y2022')

#head(frequencyGB2)

#```

#frequenices for CA
#```{r}

#library(tidyr)

#frequencyCA2 <- frequencyCA %>%
  #select(year, word, freq) %>%
  #pivot_wider(names_from = year, values_from = freq) %>%
  #arrange('y2020', 'y2021', 'y2022')

#head(frequencyCA2)

#```

#```{r}
#ggplot(frequencyUS2, aes(2020, 2022)) + 
#geom_jitter(alpha = 0.1, size = 2.5, width = 0.25, height = 0.25) +
#geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
#scale_x_log10(labels = percent_format()) +
#scale_y_log10(labels = percent_format()) +
#geom_abline(color = "red")
#```

#```{r}

#library(dplyr)
#library(stringr)

#nrc_joy <- get_sentiments("nrc") %>%
  #filter(sentiment == "joy")

#tidy_tweetsUS %>%
 # filter(year == "y2020") %>%
  #inner_join(nrc_joy) %>%
  #count(word, sort = TRUE)

#```


#```{r}

#nrc_anger <- %>%
 # filter(sentiment == "anger")

#tidy_tweetsGB %>%
  #filter(year == "y2020") %>%
  #inner_join(nrc_anger) %>%
  #count(word, sort = TRUE)
#```

#```{r}

#nrc_fear <- %>%
  #filter(sentiment == "fear")

#tidy_tweetsCA %>%
  #filter(year == "y2020") %>%
  #inner_join(nrc_fear) %>%
  #count(word, sort = TRUE)
#```

#GB sentiment plot

#```{r}

#GB_Covid_sentiment <- tidy_tweetsGB %>%
 # inner_join(get_sentiments("bing")) %>%
 # count(year, index = id %/% 80, sentiment) %>%
 # pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 # mutate(sentiment = positive - negative)

#```

#```{r}

#library(ggplot2)

#ggplot(GB_Covid_Sentiment, aes(sentiment, fill = year)) +
  #geom_histogram(show.legend = FALSE) +
  #facet_wrap(~year, ncol = 2, scales = "free_x")
#```


#```{r}

#GB_Covid_Sentiment_Means <- aggregate(x = GB_Covid_Sentiment$sentiment, by = list(GB_Covid_Sentiment$year), FUN = mean)

#GB_Covid_Sentiment_Means 

#```

#US sentiment plot 

#```{r}

#US_Covid_sentiment <- tidy_tweetsUS %>%
#  inner_join(get_sentiments("bing")) %>%
 # count(year, index = id %/% 80, sentiment) %>%
 # pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 # mutate(sentiment = positive - negative)

#```

#```{r}

#library(ggplot2)

#ggplot(US_Covid_Sentiment, aes(sentiment, fill = year)) +
 # geom_histogram(show.legend = FALSE) +
 # facet_wrap(~year, ncol = 2, scales = "free_x")
#```


#```{r}

#US_Covid_Sentiment_Means <- aggregate(x = US_Covid_Sentiment$sentiment, by = list(US_Covid_Sentiment$year), FUN = mean)

#US_Covid_Sentiment_Means 

#```


#Canada sentiment plot

#```{r}

#CA_Covid_sentiment <- tidy_tweetsCA %>%
 # inner_join(get_sentiments("bing")) %>%
 # count(year, index = id %/% 80, sentiment) %>%
 # pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
 # mutate(sentiment = positive - negative)

#```

#```{r}

#library(ggplot2)

#ggplot(CA_Covid_Sentiment, aes(sentiment, fill = year)) +
  #geom_histogram(show.legend = FALSE) +
  #facet_wrap(~year, ncol = 2, scales = "free_x")
#```


#```{r}

#CA_Covid_Sentiment_Means <- aggregate(x = CA_Covid_Sentiment$sentiment, by = #list(CA_Covid_Sentiment$year), FUN = mean)

#CA_Covid_Sentiment_Means 

#```

#```{r}
#bing_word_counts_US <- tidy_tweetsUS %>%
 # inner_join(get_sentiments("bing")) %>%
 # count(word, sentiment, sort = TRUE) %>%
 # ungroup()

#bing_word_counts_US

#```

#```{r}

#bing_word_counts_US %>%
 # group_by(sentiment) %>% 
 # slice_max(n, n = 10) %>%
 # ungroup() %>%
 # mutate(word = reorder(word, n)) %>%
 # ggplot(aes(n, word, fill = sentiment)) +
 # geom_col(show.legend = FALSE) +
  #facet_wrap(~sentiment, scales = "free_y") +
  #labs(x = "Contribution to sentiment", y = NULL)
#```

#```{r}
#custom_stop_words <- bind_rows(tibble(word = c("trump", "positive"),
                                      #lexicon = c("custom")),
                              # stop_words)
#custom_stop_words
#```

## word clouds! ##

#```{r}
#library(wordcloud)

#tidy_tweetsUS %>% 
 # anti_join(custom_stop_words) %>%
 # inner_join(get_sentiments("bing")) %>%
 # count(word, sentiment, sort = TRUE) %>%
 # acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  #comparison.cloud(colors = c("gray20", "gray80"),
                   #max.words = 100)
#```

#```{r}

#tidy-tweetsUS %>%
 # anti_join(custom_stop_words) %>%
 # count(word) %>%
  #with(wordcloud(word, n, max.words = 100))

#```

#```{r}

#bingnegative <- get_sentiments("bing") %>%
 # filter(sentiment == "negative")

#wordcounts<- tidy_tweetsCA %>%
 # group_by(year) %>%
 # summarize(words = n())

#tidy_tweetsCA %>%
  #semi_join(bingnegative) %>%
  #group_by(year) %>%
  #summarize(negativewords = n ()) %>%
  #left_join(wordcounts, by = "year") %>%
 # mutate(ratio = negativewords/words) %>%
  #slice_max(ratio, n = 3) %>%
  #ungroup()
#```