Complete Final Project

RUN the first two chunks before knitting. IN order to complete the knitting process these two chunks must be loaded prior to knitting Accept the two Selections by typing 1. These two chunks originate from the stack overflow database.

options(textdata_download_dir = tempdir())
textdata::lexicon_afinn()

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

options(textdata_download_dir = tempdir())
textdata::lexicon_nrc()

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

Now lets take a look at some ggplot2 barplots

We’ll start with making a dataframe based on the tooth data.

df <- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))

df

##   dose  len
## 1 D0.5  4.2
## 2   D1 10.0
## 3   D2 29.5

And now lets make a second dataframe

df2 <- data.frame(supp=rep(c("VC", "OJ"), each = 3),
dose = rep(c("D0.5", "D1", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))

df2

##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   OJ D0.5  4.2
## 5   OJ   D1 10.0
## 6   OJ   D2 29.5

Lets load up GGplot 2

library(ggplot2)

lets set out parameters for ggplot

theme_set(
theme_classic()) +
theme(legend.position = "top")

## List of 97
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                     : NULL
##  $ aspect.ratio              : NULL
##  $ axis.title                : NULL
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom       : NULL
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left         : NULL
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom        : NULL
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left          : NULL
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                :List of 6
##   ..$ colour       : chr "grey20"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ axis.ticks.x              : NULL
##  $ axis.ticks.x.top          : NULL
##  $ axis.ticks.x.bottom       : NULL
##  $ axis.ticks.y              : NULL
##  $ axis.ticks.y.left         : NULL
##  $ axis.ticks.y.right        : NULL
##  $ axis.ticks.length         : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.x.top           : NULL
##  $ axis.line.x.bottom        : NULL
##  $ axis.line.y               : NULL
##  $ axis.line.y.left          : NULL
##  $ axis.line.y.right         : NULL
##  $ legend.background         :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing            : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                :List of 5
##   ..$ fill         : chr "grey95"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.key.size           : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "top"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.just           : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ panel.background          :List of 5
##   ..$ fill         : chr "grey92"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.border              : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.spacing             : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "white"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major          : NULL
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ linewidth    : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major.x        : NULL
##  $ panel.grid.major.y        : NULL
##  $ panel.grid.minor.x        : NULL
##  $ panel.grid.minor.y        : NULL
##  $ panel.ontop               : logi FALSE
##  $ plot.background           :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : chr "white"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title.position       : chr "panel"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption.position     : chr "panel"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ strip.background          :List of 5
##   ..$ fill         : chr "grey85"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ strip.background.x        : NULL
##  $ strip.background.y        : NULL
##  $ strip.clip                : chr "inherit"
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey10"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.x.bottom       : NULL
##  $ strip.text.x.top          : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.left         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.right        : NULL
##  $ strip.switch.pad.grid     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.switch.pad.wrap     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

Lets start with some basic barplots using the tooth data

f <- ggplot(df, aes(x = dose, y = len))

f + geom_col()

Now lets change the fill, and add labels to the top

f + geom_col(fill = "darkblue") + geom_text(aes(label = len), vjust = -0.3)

Now lets add the labels inside the bars

f + geom_col(fill = "darkblue") + geom_text(aes(label = len), vjust = 1.6, color = "white")

Now lets change the barplot colors by group

f + geom_col(aes(color = dose), fill = "white") + scale_color_manual(values =c("blue", "gold", "red"))

This is kind of hard to see, so lets change the fill.

f + geom_col(aes(fill = dose)) + scale_fill_manual(values = c("blue", "gold", "red"))

Ok how do we do this with multiple groups

library(ggplot2)

ggplot(df2, aes(x = dose, y = len, fill = supp)) + 
  geom_col(position = position_stack()) + 
  scale_color_manual(values = c("blue", "gold")) +
  scale_fill_manual(values = c("blue", "gold"))

p <- ggplot(df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(aes(color = supp), position = position_dodge(0.8), width = 0.7) +
  scale_fill_manual(values = c("blue", "gold")) 

print(p)

Now lets add those labesl to the dodged barplot

p + geom_text(aes(label = len, group = supp), position = position_dodge(0.8), vjust = -0.2, size =3.5)

Now what if we want to add labels to our stacked barplots? For this we need dplyr

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

df2 <- df2 %>%
group_by(dose)%>%
arrange(dose, desc(supp)) %>%
dplyr::mutate(lab_ypos = cumsum(len) -0.5 - len)

df2

## # A tibble: 6 × 4
## # Groups:   dose [3]
##   supp  dose    len lab_ypos
##   <chr> <chr> <dbl>    <dbl>
## 1 VC    D0.5    6.8     -0.5
## 2 OJ    D0.5    4.2      6.3
## 3 VC    D1     15       -0.5
## 4 OJ    D1     10       14.5
## 5 VC    D2     33       -0.5
## 6 OJ    D2     29.5     32.5

Now lets recreate our stacked graphs

ggplot(df2, aes(x=dose, y=len)) +
geom_col(aes(fill = supp), width = 0.7) + geom_text(aes(y = lab_ypos, label = len, group = supp), color = "white") +
scale_color_manual(values = c("blue", "gold")) +
scale_fill_manual(values = c ("blue", "gold"))

Lets look at some Boxplots

data('ToothGrowth')

Lets change the dose to a factor, and look at the top of the dataframe

ToothGrowth$dose <- as.factor(ToothGrowth$dose)

head(ToothGrowth, 4)

##    len supp dose
## 1  4.2   VC  0.5
## 2 11.5   VC  0.5
## 3  7.3   VC  0.5
## 4  5.8   VC  0.5

Lets load ggplot

library(ggplot2)

Lets set the theme for our plots to classic

theme_set(theme_minimal() +
theme(legend.position = "top"))

Lets start with a very basic boxplot with dose vs length

p <- ggplot(ToothGrowth, aes(x=dose, y=len)) + 
  geom_boxplot() +
  facet_grid(~supp)
p

tg <- ggplot(ToothGrowth, aes(x = dose, y = len))
tg + geom_boxplot()

Now lets look at a boxplot with points for the mean

tg + geom_boxplot(notch = TRUE, fill = "lightgrey") + stat_summary(fun.y = mean, geom = "point", shape= 18, size = 2.5, color = "indianred")

## Warning: The `fun.y` argument of `stat_summary()` is deprecated as of ggplot2 3.3.0.
## ℹ Please use the `fun` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

we can also change the scale number of variables included, and their order

tg + geom_boxplot() +
scale_x_discrete(limits = c("0.5", "2"))

## Warning: Removed 20 rows containing missing values (`stat_boxplot()`).

lets put our x axis in descending order

tg + geom_boxplot() +
scale_x_discrete(limits = c("2", "1", "0.5"))

’’’ we can alos change box plot colors by groups

tg + geom_boxplot(aes(color = dose))

scale_color_manual(values = c("indianred", "blue1", "green1"))

## <ggproto object: Class ScaleDiscrete, Scale, gg>
##     aesthetics: colour
##     axis_order: function
##     break_info: function
##     break_positions: function
##     breaks: waiver
##     call: call
##     clone: function
##     dimension: function
##     drop: TRUE
##     expand: waiver
##     get_breaks: function
##     get_breaks_minor: function
##     get_labels: function
##     get_limits: function
##     guide: legend
##     is_discrete: function
##     is_empty: function
##     labels: waiver
##     limits: NULL
##     make_sec_title: function
##     make_title: function
##     map: function
##     map_df: function
##     n.breaks.cache: NULL
##     na.translate: TRUE
##     na.value: grey50
##     name: waiver
##     palette: function
##     palette.cache: NULL
##     position: left
##     range: environment
##     rescale: function
##     reset: function
##     scale_name: manual
##     train: function
##     train_df: function
##     transform: function
##     transform_df: function
##     super:  <ggproto object: Class ScaleDiscrete, Scale, gg>

what if we want to display our data subset by oj vs vitamin c

tg2 <- tg + geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) + scale_fill_manual(values = c("#999999",
  "#E69F00"))

tg2

we can also arrange this as two plots with facet_wrap

tg2 + facet_wrap(~supp)

set.seed(1234)
wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 56), rnorm(200, 58))
)

head(wdata, 4)

##   sex   weight
## 1   F 54.79293
## 2   F 56.27743
## 3   F 57.08444
## 4   F 53.65430

Now lets load dplyr

library(dplyr)

mu <- wdata %>%
group_by(sex) %>%
summarize(grp.mean = mean(weight))

Now lets load the plotting package

library (ggplot2)
theme_set(
theme_minimal() +
theme(legend.position = "bottom")
)

Now lets create a ggplot object

a <- ggplot (wdata, aes(x = weight))
a + geom_histogram(bins = 30, color = "black", fill = "grey") +
geom_vline(aes(xintercept = mean(weight)),
linetype = "dashed", size = 0.6)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

) Now lets change the color by group

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") + scale_color_manual(values = c("#00AF88", "#E78800"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(color = sex), fill = "white", position = "identity") + scale_fill_manual(values = c("#00AF88", "#E78800"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

what if we want of combine density plots nad histograms?

a + geom_histogram(aes(y = stat(density)),
color = "black", fill = "white") + 
geom_density(alpha = 0.2, fill = "#FF6666")

## Warning: `stat(density)` was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

a + geom_histogram(aes(y = stat(density), color = sex),
                   fill = "white", position = "identity") +
  geom_density(aes(color = sex), size = 1) +
  scale_color_manual(values = c("indianred", "lightblue1"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Firsts lets load the required packages

library(ggplot2)

theme_set(
  theme_dark() +
    theme(legend.position = "top")
)

Firsts lets initiate a ggplot object called to

data("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
tg <- ggplot(ToothGrowth, aes(x=dose, y = len))

data ("ToothGrowth")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)
tg <- ggplot(ToothGrowth, aes(x = dose, y = len))

lets create a dotplot with a summary statistic

tg + geom_dotplot(binaxis = "y", stackdir = "center", fill = "white") +
stat_summary(fun = mean, fun.args = list(mult=1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

tg + geom_boxplot(width = 0.5) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "white")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_violin(trim = FALSE) +
geom_dotplot(binaxis = "y", stackdir = "center", fill = "#999999") +
stat_summary(fun = mean, fun.args = list(mult=1))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning: Removed 3 rows containing missing values (`geom_segment()`).

Lets create a dotplot with multiple groups

tg + geom_boxplot(width = 0.5) +
geom_dotplot(aes(fill = supp), binaxis = "y", stackdir = "center") +
scale_fill_manual(values = c("indianred", "lightblue1"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

tg + geom_boxplot(aes(color = supp), width = 0.5, position = position_dodge(0.8)) +
  geom_dotplot(aes(fill = supp, color = supp), binaxis = "y", stackdir = "center",
               dotsize = 0.8, position = position_dodge(0.8)) +
  scale_fill_manual(values = c("#00AF88", "#C78800")) + scale_color_manual(values = c("#00AF88", "#C78800"))

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Now lets change it up and look at some line plots

We’ll start by making a custom dataframe kinda like the tooth dataset. this way we can see the lines and stuff that we are studying

df <- data.frame(dose = c("D0.5", "D1", "D2"),
len = c(4.2, 10, 29.5))

Now lets create a second dataframe for plotting by groups

df2 <- data.frame(supp = rep(c("VC", "03"), each = 3),
dose = rep(c("D0.5", "D1", "D2"), 2),
len = c(6.8, 15, 33, 4.2, 10, 29.5))
df2

##   supp dose  len
## 1   VC D0.5  6.8
## 2   VC   D1 15.0
## 3   VC   D2 33.0
## 4   03 D0.5  4.2
## 5   03   D1 10.0
## 6   03   D2 29.5

Now lets again load ggplot 2 and set a theme

library(ggplot2)
theme_set(
theme_gray())+
theme(legend.position = "right")

## List of 97
##  $ line                      :List of 6
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ lineend      : chr "butt"
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ rect                      :List of 5
##   ..$ fill         : chr "white"
##   ..$ colour       : chr "black"
##   ..$ linewidth    : num 0.5
##   ..$ linetype     : num 1
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ text                      :List of 11
##   ..$ family       : chr ""
##   ..$ face         : chr "plain"
##   ..$ colour       : chr "black"
##   ..$ size         : num 11
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : num 0
##   ..$ lineheight   : num 0.9
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ title                     : NULL
##  $ aspect.ratio              : NULL
##  $ axis.title                : NULL
##  $ axis.title.x              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.75points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.top          :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.75points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.x.bottom       : NULL
##  $ axis.title.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.75points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.title.y.left         : NULL
##  $ axis.title.y.right        :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.75points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text                 :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey30"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 2.2points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.top           :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : num 0
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 2.2points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.x.bottom        : NULL
##  $ axis.text.y               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 2.2points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.text.y.left          : NULL
##  $ axis.text.y.right         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 0points 2.2points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ axis.ticks                :List of 6
##   ..$ colour       : chr "grey20"
##   ..$ linewidth    : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ axis.ticks.x              : NULL
##  $ axis.ticks.x.top          : NULL
##  $ axis.ticks.x.bottom       : NULL
##  $ axis.ticks.y              : NULL
##  $ axis.ticks.y.left         : NULL
##  $ axis.ticks.y.right        : NULL
##  $ axis.ticks.length         : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ axis.ticks.length.x       : NULL
##  $ axis.ticks.length.x.top   : NULL
##  $ axis.ticks.length.x.bottom: NULL
##  $ axis.ticks.length.y       : NULL
##  $ axis.ticks.length.y.left  : NULL
##  $ axis.ticks.length.y.right : NULL
##  $ axis.line                 : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ axis.line.x               : NULL
##  $ axis.line.x.top           : NULL
##  $ axis.line.x.bottom        : NULL
##  $ axis.line.y               : NULL
##  $ axis.line.y.left          : NULL
##  $ axis.line.y.right         : NULL
##  $ legend.background         :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.margin             : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing            : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ legend.spacing.x          : NULL
##  $ legend.spacing.y          : NULL
##  $ legend.key                :List of 5
##   ..$ fill         : chr "grey50"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ legend.key.size           : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height         : NULL
##  $ legend.key.width          : NULL
##  $ legend.text               :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.text.align         : NULL
##  $ legend.title              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ legend.title.align        : NULL
##  $ legend.position           : chr "right"
##  $ legend.direction          : NULL
##  $ legend.justification      : chr "center"
##  $ legend.box                : NULL
##  $ legend.box.just           : NULL
##  $ legend.box.margin         : 'margin' num [1:4] 0cm 0cm 0cm 0cm
##   ..- attr(*, "unit")= int 1
##  $ legend.box.background     : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ legend.box.spacing        : 'simpleUnit' num 11points
##   ..- attr(*, "unit")= int 8
##  $ panel.background          :List of 5
##   ..$ fill         : chr "grey50"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ panel.border              : list()
##   ..- attr(*, "class")= chr [1:2] "element_blank" "element"
##  $ panel.spacing             : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ panel.spacing.x           : NULL
##  $ panel.spacing.y           : NULL
##  $ panel.grid                :List of 6
##   ..$ colour       : chr "grey42"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major          :List of 6
##   ..$ colour       : NULL
##   ..$ linewidth    : 'rel' num 0.5
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.minor          :List of 6
##   ..$ colour       : NULL
##   ..$ linewidth    : 'rel' num 0.25
##   ..$ linetype     : NULL
##   ..$ lineend      : NULL
##   ..$ arrow        : logi FALSE
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_line" "element"
##  $ panel.grid.major.x        : NULL
##  $ panel.grid.major.y        : NULL
##  $ panel.grid.minor.x        : NULL
##  $ panel.grid.minor.y        : NULL
##  $ panel.ontop               : logi FALSE
##  $ plot.background           :List of 5
##   ..$ fill         : NULL
##   ..$ colour       : chr "white"
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ plot.title                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.title.position       : chr "panel"
##  $ plot.subtitle             :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 0
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 0points 0points 5.5points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : num 1
##   ..$ vjust        : num 1
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 5.5points 0points 0points 0points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.caption.position     : chr "panel"
##  $ plot.tag                  :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : 'rel' num 1.2
##   ..$ hjust        : num 0.5
##   ..$ vjust        : num 0.5
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ plot.tag.position         : chr "topleft"
##  $ plot.margin               : 'margin' num [1:4] 5.5points 5.5points 5.5points 5.5points
##   ..- attr(*, "unit")= int 8
##  $ strip.background          :List of 5
##   ..$ fill         : chr "grey15"
##   ..$ colour       : logi NA
##   ..$ linewidth    : NULL
##   ..$ linetype     : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_rect" "element"
##  $ strip.background.x        : NULL
##  $ strip.background.y        : NULL
##  $ strip.clip                : chr "inherit"
##  $ strip.placement           : chr "inside"
##  $ strip.text                :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : chr "grey90"
##   ..$ size         : 'rel' num 0.8
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : 'margin' num [1:4] 4.4points 4.4points 4.4points 4.4points
##   .. ..- attr(*, "unit")= int 8
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.x              : NULL
##  $ strip.text.x.bottom       : NULL
##  $ strip.text.x.top          : NULL
##  $ strip.text.y              :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num -90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.left         :List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : num 90
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi TRUE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  $ strip.text.y.right        : NULL
##  $ strip.switch.pad.grid     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  $ strip.switch.pad.wrap     : 'simpleUnit' num 2.75points
##   ..- attr(*, "unit")= int 8
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi TRUE
##  - attr(*, "validate")= logi TRUE

Now lets do some basic line plots. First we will build a function to display all the different line types

generateRLineTypes <- function() {
  oldPar <- par()
  par(font = 2, mar = c(0, 0, 0, 0))
  
  plot(1, pch = "", ylim = c(0, 6), xlim = c(0, 0.7), axes = FALSE, xlab = "", ylab = "")
  
  for(i in 0:6) {
    lines(c(0.3, 0.7), c(i, i), lty = i, lwd = 3) 
  }
  
  text(rep(0, 7), 0:6, labels = c(
    "0. 'Blank'", "1. 'Solid'", "2. 'Dashed'", "3. 'Dotted'", 
    "4. 'Dotdash'", "5. 'Longdash'", "6. 'Twodash'"
  ))
  
  par(mar = oldPar$mar, font = oldPar$font)
}

generateRLineTypes()

Now lets build a basic line plot

p <- ggplot(data = df, aes(x = dose, y = len, group = 1))
p + geom_line() + geom_point()

Now lets modify the line type and color

p + geom_line(linetype = "dashed", color = "steelblue") +
  geom_point(color = "steelblue")

p + geom_step() + geom_point()

Now lets move on to making multiple groups. First we’ll make our ggplot object

p <- ggplot(df2, aes(x = dose, y = len, group = supp))

Now lets change line types and point shapes by group

p + geom_line(aes(linetype = supp, color = supp)) +
  geom_point(aes(shape = supp, color = supp)) +
  scale_color_manual(values = c("red", "blue"))

Now lets look at line plots with a numeric x axis

df3 <- data.frame(supp = rep(c("vc", "03"), each  = 3),
                  dose = rep (c("0.3", "1", "2"), 2),
                  len = c(0.8, 15, 33, 4.2, 10, 29.5))

df3

##   supp dose  len
## 1   vc  0.3  0.8
## 2   vc    1 15.0
## 3   vc    2 33.0
## 4   03  0.3  4.2
## 5   03    1 10.0
## 6   03    2 29.5

Now lets plot where both axises are treated as continous labels

df3dose <- as.numeric(as.vector(df3$dose))
ggplot(data = df3, aes(x=dose, y=len, group = supp, color = supp)) +
geom_line() + geom_point()

Now lets look at a line graph with having the x axis having the x axis as dates, we’ll use the built in economics time series for this example.

head(economics)

## # A tibble: 6 × 6
##   date         pce    pop psavert uempmed unemploy
##   <date>     <dbl>  <dbl>   <dbl>   <dbl>    <dbl>
## 1 1967-07-01  507. 198712    12.6     4.5     2944
## 2 1967-08-01  510. 198911    12.6     4.7     2945
## 3 1967-09-01  516. 199113    11.9     4.6     2958
## 4 1967-10-01  512. 199311    12.9     4.9     3143
## 5 1967-11-01  517. 199498    12.8     4.7     3066
## 6 1967-12-01  525. 199657    11.8     4.8     3018

ggplot(data = economics, aes(x = date, y = pop)) +
geom_line()

NOw lets subset the data

ss <- subset(economics, date > as.Date("2006-1-1"))
ggplot(data = ss, aes(x = date, y = pop)) + geom_line()

we can also change the line size, for instance by another variable like unemployment

ggplot(data = economics, aes(x = date, y = pop)) +
geom_line(aes(size = unemploy/pop))

we can also plot multiple time-series data

ggplot(economics, aes(x = date))+
  geom_line(aes(y = psavert), color = "darkred") +
  geom_line(aes(y = unemploy), color = "steelblue", linetype = "twodash")

) )

Lastly, lets make this into an area plot

ggplot(economics, aes(x=date)) +
         geom_area(aes(y = psavert), fill = "#999999",
                   color = "#999999", alpha = 0.5) +
  geom_area(aes(y = unemploy), fill = "#E69F00", 
            color = "#E69D00", alpha = 0.5)

First lets load the required packages

library(ggplot2)
library(ggridges)


#BiocManager::install("ggridges")

Now lets load some sample data

?airquality

air <- ggplot(airquality) + aes(Temp, Month, group = Month) + geom_density_ridges()

air

## Picking joint bandwidth of 2.65

Now lets add some pazzaz to our graph

library(viridis)

## Loading required package: viridisLite

ggplot(airquality) + aes(Temp, Month, group = Month, fill = ..x..) +
geom_density_ridges_gradient() +
scale_fill_viridis(option = "C", name = "Temp")

## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Picking joint bandwidth of 2.65

Last thing we will do is create a facet plot for all our data.

library(tidyr)
airquality %>%
  gather(key = "Measurement", value = "value", Ozone, Solar.R, Wind, Temp) %>%
  ggplot() + aes(value, Month, group = Month) +
  geom_density_ridges() +
  facet_wrap(~ Measurement, scales = "free")

## Picking joint bandwidth of 11

## Picking joint bandwidth of 40.1

## Picking joint bandwidth of 2.65

## Picking joint bandwidth of 1.44

## Warning: Removed 44 rows containing non-finite values
## (`stat_density_ridges()`).

A density plot is a nice alternative to a histogram

set.seed(1234)

wdata = data.frame(
sex = factor(rep(c("r", "c"), each = 200)),
weight = c(rnorm(200, 58), rnorm(200, 58)))

library(dplyr)
Mu <- wdata %>%
group_by(sex) %>%
dplyr::summarise(grp.mean = mean(weight))

Now lets load the graphing package

library(ggplot2)
theme_set(
  theme_dark() +
  theme(legend.position = "right"))

Now lets do the basic plot function, first we will create a ggplot object

d <- ggplot (wdata, aes(x <- weight))

Now lets do a basic density plot

d + geom_density() +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

Now lets change the y axos to count instead of density

d + geom_density(aes(y = stat(count)), fill = "lightgray") +
geom_vline(aes(xintercept = mean(weight)), linetype = "dashed")

d + geom_density(aes(color = sex)) +
  scale_color_manual(values = c("darkgray", "gold"))

Lastly, lets fill the density plots

d + geom_density(aes(fill = sex), alpha = 0.4) +
  geom_vline(aes(xintercept = grp.mean, color = sex), data = Mu, linetype = "dashed") +
scale_color_manual(values = c("gray", "gold"))+
scale_fill_manual(values = c("gray", "gold"))

First lets load our required package

library(ggplot2)
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

install.packages("plotly")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

library(plotly)

Lets start with a scatter plot of the orange dataset

Orange <- as.data.frame(Orange)

plot_ly(data = Orange, x = ~age, y = ~circumference)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Now lets add some more info

plot_ly(data = Orange, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
text = ~paste("Tree 10:", Tree, "<br>Age:", age, "circ:", circumference))

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a random distribution and add it to our dataframe

trace_1 <- rnorm(nrow(Orange), mean = 120, sd = 10)
new_data <- cbind(Orange, trace_1)

We’ll use the random numbers as lines on the graph

plot_ly(data = new_data, x = ~age, y = ~circumference, color = ~Tree, size = ~age,
        text = ~paste ("Tree ID:", Tree, "<br>Age:", age, "<br>Circ", circumference)) %>%
  add_trace(y = ~trace_1, mode = 'lines') %>%
  add_trace(y = ~circumference, mode = 'markers')

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

Now lets create a graph with the option of showing as a scatter or line, and add labels.

plot_ly(data = Orange, x = ~age, y = ~circumference,
        color = ~Tree, size = ~circumference,
        text = ~paste("Tree ID:", Tree, "<br>Age:", age, "Circ:", circumference)) %>%
add_trace(y = ~circumference, mode = 'markers') %>%
  layout(
    title = "plot or orange data with switchable trace",
    updatemenus = list(
      list(
        type = 'downdrop',
        y = 0.8,
        buttons = list(
          list(method = "restyle",
               args = list('mode', 'markers'),
               label = "Marker"
               ),
          list(method = "restyle",
               args = list('mode', 'lines'),
               labels = "Lines"
               )
          )
        )
      )
    )

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plotly.com/r/reference/#scatter

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

## Warning: `line.width` does not currently support multiple values.

First lets load our required packages

library(plotly)

Now lets create a random 3d matrix

d <- data.frame(
  x <- seq(1,10, by = 0.5), y <- seq(1,10, by = 0.5)
)

z <- matrix(rnorm(length(d$x) + length(d$y)), nrow = length(d$x), ncol = length(d$y))

## Warning in matrix(rnorm(length(d$x) + length(d$y)), nrow = length(d$x), : data
## length differs from size of matrix: [38 != 19 x 19]

Now lets plot our 1D data

plot_ly(d, x<-x, y = -y, z = -z) %>%
add_surface()

Lets add some more aspects to it, such as at topography

plot_ly(d, x = -x, y =-y, z = -z) %>%
  add_surface(
    contours = list(
      z = list(
        show = TRUE,
        usecolormap = TRUE,
        highlightcolor = "#FF0000",
        project = list(z=TRUE)
)
)
)

Now lets look at a 3d scatter plot

plot_ly(longley, x = ~GNP, y= ~Population, z = ~Employed, marker = list(color = ~GNP)) %>%
  add_markers()

) ) )

First lets load our required libraries

library(ggplot2)
library(dplyr)
library(plotrix)

theme_set(
  theme_classic() +
    theme(legend.position = 'top'))

Lets again use the tooth data for this exercise

df <- ToothGrowth
df$dose <- as.factor(df$dose)

Now lets use dplyr for manipulation purposes

df.summary <-df %>%
group_by(dose) %>%
dplyr::summarise(
sd = sd(len, na.rm = TRUE),
len = mean(len))

df.summary

## # A tibble: 3 × 3
##   dose     sd   len
##   <fct> <dbl> <dbl>
## 1 0.5    4.50  10.6
## 2 1      4.42  19.7
## 3 2      3.77  26.1

Lets now look at some key functions

geom_crossbar() for hollow bars with middle indicated by a horizontal line geom_errorbar() for error bar geom_errorbarh() for horizontal error bars geom_linerange() for drawing an interval represented by a vertical line geom_pointrange() for creating an interval represented by a vertical line; with a point in the middle

lets start by creating a ggplot object

tg <- ggplot(
  df.summary,
  aes(x = dose, y = len, ysin = len - sd,ymax =  - len +sd ))

Now lets look at the most basic error plots

tg <- ggplot(
  df.summary,
  aes(x = dose, y = len, ymin = len -sd, ymax = len + sd)
  
)

tg + geom_pointrange()

tg + geom_errorbar(width = 0.2) +
geom_point(size = 1.5)

NOw lets create horizontal error bars by manipulating our graph

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgray") +
  geom_pointrange(aes(ymin = len+sd, ymax = len+sd), data = df.summary)

ggplot(df.summary, aes(x=len, y=dose, xmin = len+sd, xmax = len+sd)) +
geom_point() +
geom_errorbarh(height = 0.2)

This just gives you an idea of error bars on the horizontal axis Nowlets look at adding jitter points (actual measurements) to our data

ggplot(df, aes(dose, len)) +
  geom_jitter(position = position_jitter(0.2), color = "darkgray") +
  geom_pointrange(aes(ymin = len+sd, ymax = len+sd), data = df.summary)

Now lets try error bars on a violin plot

ggplot(df, aes(dose, len)) +
  geom_violin(color = "darkgray", trim = FALSE) +
  geom_pointrange(aes(ymin = len - sd, ymax = len+sd), data = df.summary)

Now how about with a line graph?

library(ggplot2)

ggplot(df.summary, aes(x = dose, y = len)) +
  geom_line(aes(group = 1)) + 
  geom_errorbar(aes(ymin = len , ymax = len ), width = 0.2) +   
  geom_point(size = 2)

Now lets make a bar graph with half error bars

ggplot(df.summary, aes(dose, len)) +
  geom_col(fill = "lightgray", color = 'black') +
  geom_errorbar(aes(ymin = len, ymax = len), width = 0.2)

You can see that by not specifying nmin = len.stderr, we have in essence cut our error bar in half. HOw about we add jitter points to line plots? we need to use the orginal dataframe for the jitter plot, and the summary df for the geom layers.

ggplot(df,aes(dose, len)) +
geom_jitter(position = position_jitter(0.2), color = "darkgray") +
geom_line(aes(group = 1), data = df.summary) +
geom_errorbar(
aes(ymin = len, ymax = len),
data = df.summary, width = 0.2) +
geom_point(data =df.summary, size = 0.2)

what about adding jitterpoints to a barplot

ggplot(df, aes(dose, len)) +
geom_col(data = df.summary, fill = NA, color = "black") +
geom_jitter(position = position_jitter(0.3), color = "blue") +
geom_errorbar(aes(ymin = len, ymax = len),
data = df.summary, width = 0.2)

what if we wanted to have our error bars per group? (03 vs VC)

df.summary2 <- df %>%
  group_by(dose, supp) %>%
  dplyr::summarise(
    sd = sd(len),
    stderr = std.error(len),
    len = mean(len))

## `summarise()` has grouped output by 'dose'. You can override using the
## `.groups` argument.

df.summary2

## # A tibble: 6 × 5
## # Groups:   dose [3]
##   dose  supp     sd stderr   len
##   <fct> <fct> <dbl>  <dbl> <dbl>
## 1 0.5   OJ     4.46  1.41  13.2 
## 2 0.5   VC     2.75  0.869  7.98
## 3 1     OJ     3.91  1.24  22.7 
## 4 1     VC     2.52  0.795 16.8 
## 5 2     OJ     2.66  0.840 26.1 
## 6 2     VC     4.80  1.52  26.1

Now you can see we have mean and error for each dose and supp

library(ggplot2)

ggplot(df.summary2, aes(x = dose, y = len, color = supp)) +
  geom_pointrange(aes(ymin = len - stderr, ymax = len + stderr), 
                  position = position_dodge(0.3)) + 
  scale_color_manual(values = c("indianred", "lightblue"))

How about line plots with multiple error bars?

ggplot(df.summary2, aes(dose, len)) +
geom_line(aes(linetype = supp, group = supp)) +
geom_point() +
geom_errorbar(aes(ymin = len, ymax = len, group = supp), width = 0.2)

And the same with a bar plot

library(ggplot2)

ggplot(df.summary2, aes(x = dose, y = len)) +
  geom_col(aes(fill = supp), position = position_dodge(0.8), width = 0.7) +
  geom_errorbar(aes(ymin = len - stderr, ymax = len + stderr, group = supp), 
                width = 0.2, 
                position = position_dodge(0.8)) + 
  scale_fill_manual(values = c("indianred", "lightblue"))

Now lets add some jitterpoints

ggplot(df, aes(dose, len, color = supp)) +
geom_jitter(position = position_dodge(0.2)) +
geom_line(aes(group = supp), data = df.summary2) +
geom_point() +
geom_errorbar(aes(ymin = len, ymax = len, group = supp), data = df.summary2, width = 0.2)

library(ggplot2)

ggplot(df, aes(x = dose, y = len, color = supp)) +
  geom_col(data = df.summary2, aes(fill = supp), 
           position = position_dodge(0.8), width = 0.7, color = "black") +  
  geom_jitter(
    position = position_jitterdodge(jitter.width = 0.2, dodge.width = 0.8)
  ) +  
  geom_errorbar(
    aes(ymin = len - stderr, ymax = len + stderr),  
    data = df.summary2,
    width = 0.2, position = position_dodge(0.8)
  ) +  
  scale_color_manual(values = c("indianred", "lightblue")) +  
  theme(legend.position = "top")

) ) ) Now lets do an empirical cumulative function. This reports any given number percentile of individuals that are above or below that threshold

library(class)

set.seed(1234)

wdata <- data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),  
  weight = c(rnorm(200, 55), rnorm(200, 58)))

Now lets look at our datafrmae

head(wdata, 5)

##   sex   weight
## 1   F 53.79293
## 2   F 55.27743
## 3   F 56.08444
## 4   F 52.65430
## 5   F 55.42912

Now lets load our plotting package

library(ggplot2)
theme_set(
theme_classic()+
theme(legend.position = "button")
)

Now lets create our ECDF Plot

ggplot(wdata, aes(x = weight)) + 
  stat_ecdf(aes(color = sex, linetype = sex), size = 1.5) +  
  scale_color_manual(values = c("#00AF88", "#E78900")) +
  labs(y = "ECDF", x = "Weight")

Now lets look at our dataframe

head(wdata, 5)

##   sex   weight
## 1   F 53.79293
## 2   F 55.27743
## 3   F 56.08444
## 4   F 52.65430
## 5   F 55.42912

Now lets take a look a qq plots. These are used to determine if the given data follows a normal distribution

install.packages("ggpubr")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

set.seed(1234)

Now lets randomly generate some data

wdata = data.frame(
sex = factor(rep(c("F", "M"), each = 200)),
weight = c(rnorm(200, 55), rnorm(200, 58)))

)

create a qq plot of the weight

library(ggplot2)
theme_set(
  theme_minimal() +
    theme(legend.position = "top"))

)

library(ggplot2)

ggplot(wdata, aes(sample = weight)) +  
  stat_qq(aes(color = sex)) + 
  scale_color_manual(values = c("#0073C2FF", "#FC4E07")) +
  labs(y = "Weight", x = "Theoretical Quantiles")

wdata <- data.frame(
  weight = c(50, 55, 60, 65, 70, 75),
  sex = c("Male", "Female", "Male", "Female", "Male", "Female"))

library(ggpubr)

ggqqplot(wdata, x = "weight",
         color = "sex",
         palette = c("#0073C2FF", "#FC4E07"),
         ggtheme = theme_pubclean())

Now what would a non-normal distribution look like?

library(mnonr)  


data2 <- mnonr::mnonr(  
  n = 1000, 
  p = 2, 
  ms = 3, 
  mk = 61, 
  Sigma = matrix(c(1, 0.5, 0.5, 1), 2, 2),  
  initial = NULL
)


data2 <- as.data.frame(data2)

Now lets plot the non normal data

ggplot(data2, aes(sample=V1)) +
stat_qq()

ggqqplot(data2, x= "V1",
palette = "#0073c2FF",
ggtheme = theme_pubclean())

Lets look at how to put multiple plots together into a single figure

library(ggpubr)
library(ggplot2)

theme_set(
theme_bw() +
theme(legend.position = "top"))

First lets create a nice boxplot

lets load the data

df <- ToothGrowth
df$dose <- as.factor(df$dose)

and create the plot object

p <- ggplot(df, aes(x = dose, y = len)) +
geom_boxplot(aes(fill = supp), position = position_dodge(0.9)) +
scale_fill_manual(values = c("#00A788", "#E78200"))
p

Now lets look at the gvplot facit function

p+facet_grid(rows = vars(supp))

Now lets do a facet with multiple variables

p + facet_wrap(vars(dose), ncol = 2)

Now how do we combine multiple plots using ggarrange()

Lets start by making some basic plots, first we will define a color palette and data

my3cols <- c("#e7e800", "#2E9FDF", "#FC4E07")
ToothGrowth$dose <- as.factor(ToothGrowth$dose)

Now lets make some basic plots

p <- ggplot(ToothGrowth, aes(x = dose, y = len))
bxp <- p + geom_boxplot(aes(color = dose)) +
  scale_color_manual(values = my3cols)

Ok now lets do a dotplot

dp <- p + geom_dotplot(aes(color = dose, fill = dose),
binaxis = 'y', stackdir = 'center') +
scale_color_manual(values = my3cols) +
scale_fill_manual(values = my3cols)

Now lastly lets create a lineplot

lp <- ggplot(economics, aes(x=date, y=psavert)) +
geom_line(color = "indianred")

Now we can make the figure

figure <- ggarrange(bxp,dp,lp, labels = c("A", "B", "C"), ncol =2, nrow = 2)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

figure

This looks great, but we can make it even better

figure2 <- ggarrange(
lp,
ggarrange(bxp, dp, ncol=2, nlabs = c ("B", "C")),
nrow = 2,
labels = "A")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

## Warning in as_grob.default(plot): Cannot convert object of class character into
## a grob.

## Warning in as_grob.default(plot): Cannot convert object of class listggarrange
## into a grob.

figure2

Ok this looks good, but you’ll notice that there are two legends that are the same.

ggarrange(
  bxp, dp, labels = c("A", "B"),
  common.legend = TRUE, legend = "bottom"
)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.

Lastly we should export the plot

ggexport(figure2, filename = "facetfigure.pdf")

## file saved to facetfigure.pdf

We can also export multiple plots to a pdf

ggexport(bxp, dp, lp, filename = "multi.pdf")

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to multi.pdf

ggexport(bxp, dp, lp, bxp, filename = "test2.pdf", nrow = 2, ncol =1)

## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
## file saved to test2.pdf

) ) lets get started with heatmaps

library(heatmap3)

Now lets get our data.

data <- ldeaths
data2 <- do.call(cbind, split(data, cycle(data)))
dimnames(data2) <- dimnames(.preformat.ts(data))

Now lets generate a heat map

heatmap(data2)

heatmap(data2, Rowv = NA, Colv = NA)

Now lets play with the colors

rc <- rainbow(nrow(data2), start = 0, end = 0.3)
cc <- rainbow(ncol(data2), start =0, end = 0.3)

Now lets apply our color selections

heatmap(data2, colsidecolors = cc)

## Warning in plot.window(...): "colsidecolors" is not a graphical parameter

## Warning in plot.xy(xy, type, ...): "colsidecolors" is not a graphical parameter

## Warning in title(...): "colsidecolors" is not a graphical parameter

library(RColorBrewer)
heatmap(data2, ColSideColors = cc,
col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

There is more that we can customize

library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:plotrix':
## 
##     plotCI

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2(data2, colSideColors = cc,
          col = colorRampPalette(brewer.pal(8, "PiYG"))(25))

## Warning in plot.window(...): "colSideColors" is not a graphical parameter

## Warning in plot.xy(xy, type, ...): "colSideColors" is not a graphical parameter

## Warning in title(...): "colSideColors" is not a graphical parameter

Missing Values If you encounter an unusual value in your dataset, and simply want to move on ot the rest of your analysis, you have two options:

Drop the entire row with the strange values:

diamonds <- diamonds

diamonds2 <- diamonds %>%
filter(between(y, 3, 20))

In this instance, y is the width of the diamond, so anything under 3 mm or above 20 is excluded I don’t recommend this option, just because there is one bad measurement doesn’t mean they are all bad instead, I recommend replacing unnusual values with missing values

diamonds3 <- diamonds %>%
dplyr::mutate(y = ifelse(y < 3 | y > 20, NA, y))

Like R, ggplot2 subscibes to the idea that missing values should not pass silently into the night.

ggplot(data = diamonds3, mapping = aes(x = x, y+y)) +
  geom_point()

## Warning: Removed 9 rows containing missing values (`geom_point()`).

If you want to supress that warning you ca nuse na.rn = TRUE

ggplot(data = diamonds3, mapping = aes(x = x, y = y)) +
geom_point(na.rn = TRUE)

## Warning in geom_point(na.rn = TRUE): Ignoring unknown parameters: `na.rn`

## Warning: Removed 9 rows containing missing values (`geom_point()`).

Other times you want to understand what makes observations with missing values different to the observation with recorded values. for example. in the NYCflights13 dataset. missing values in the dep_time variable indicate that the flight was cancelled. So you might want ot compare the scheduled departure times for cancelled and non-cancelled times.

library(nycflights13)

) What if we want to know what our outliers are?

First we need to load the required libraries


```r
library(ggplot2)

And reload the dataset because we removed outliers


Lets create a function using the grubb test to identify all outliers. The grubbs test identifies outliers in a univariate dataset that is presumed to come from a normal distribution.


Now we can create a histogram showing where the outliers were


Categorical Variables


```r
library(ggplot2)

ggplot(data = diamonds, mapping = aes(x = price)) +
geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ITs hard to see the difference in distribution becasue the counts differ so much

ggplot(diamonds) +
geom_bar(mapping = aes(x = cut))

To make the comparison easier we have to swap the display on the y-axis, Instead of displaying count, we’ll display density, which is the count standardized so that the area under the curve is one.

ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
geom_freqpoly(mapping = aes(color = cut), bindwidth = 500)

## Warning in geom_freqpoly(mapping = aes(color = cut), bindwidth = 500): Ignoring
## unknown parameters: `bindwidth`

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

It appears that fair diamonds (the lowest cut quality) have the highest average price. BUt maybe that the frequency of polygons are a little hard to interpret.

Another alternative is the boxplto. A boxplot is a type of visual shorthand for a distribution of values

ggplot (data = diamonds, mapping = aes(x = cut, y = price)) +
geom_boxplot()

We see much less information about the distribution, but the boxplots are much more compact, so we can more easliy compare them. It supports the conterintuitive finding that better quality diamonds are cheaper on average:

lets look at some car data

ggplot(data = mpg, mapping = aes(x =class, y = hwy)) +
geom_boxplot()

ggplot(data = mpg) +
geom_boxplot (mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))

If you have long variable names, you can switch the axis and flip it 90 degrees.

ggplot(data=mpg) +
geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy))+ coord_flip(
)

To visualize the correlation between two continuous variables, we can use a scatter plot.

ggplot(data = diamonds) +
geom_point(mapping = aes(x = carat, y = price))

scatterplots become less useful as the size of you dataset grows, because we get overplot, we can fix this using the alpha aesthetic

ggplot(data = diamonds) +
  geom_point(mapping = aes(x = carat, y = price), alpha = 1/100)

First lets load a required library

library(RCurl)

## 
## Attaching package: 'RCurl'

## The following object is masked from 'package:tidyr':
## 
##     complete

library(dplyr)

Now lets get our data

site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/colleges/colleges.csv"
college_data <- read.csv(site)

First lets use the str funciton, this shows the structure of the object

str(college_data)

## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

What if we want to arrange our dataset alphabetically by college?

alphabetical <- college_data %>%
arrange(college_data$college)

The glimpse package is another way to preview data

glimpse(college_data)

## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <int> 41, 2, 45, 2742, 220, 4, 263, 137, 49, 76, 67, 0, 229, 19, …
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

we can also subset with select()

college_cases <- select(college_data, college, cases)

We can also filter or subset with the filter function

Louisiana_cases <- filter(college_data, state == "Louisiana")

Lets filter our a smaller amoutn of states

South_cases <- filter(college_data, state == "Louisiana" | state == "Texas"| state == "Arkansas" | state == "Mississippi")

Lets look at some time series data First we’ll load the required libraries

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)
library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:viridis':
## 
##     viridis_pal

Now lets load some data

state_site <- "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv"
state_Data <- read.csv(state_site)

Lets create group_by object using the state column

state_cases <- group_by(state_Data, state)
class(state_cases)

## [1] "grouped_df" "tbl_df"     "tbl"        "data.frame"

How many measurements were made by state? This gives us an idea of when states started reporting it

Days_since_first_reported <- tally(state_cases)

Lets visualize some data First lets start off with some definintions Data - obvious - the stuff we wnat to visualize Layer - made of generic elements and requisite statistical information. Include geometric objects which represents the plot

scales - used to map values in the data space that is used for creation of values (color, size, shape, etc) coordinate system - describes how the data coordinates are mapped together in relation to the plan on the graphic Faceting - how to break up data in to subsets to display multiple types of groups of data Theme - controls the finer points of the display, such as font size and background color.

options(repr.plot.width = 6, repr.plot.height = 6)
class(college_data)

## [1] "data.frame"

head(college_data)

##         date   state     county       city ipeds_id
## 1 2021-05-26 Alabama    Madison Huntsville   100654
## 2 2021-05-26 Alabama Montgomery Montgomery   100724
## 3 2021-05-26 Alabama  Limestone     Athens   100812
## 4 2021-05-26 Alabama        Lee     Auburn   100858
## 5 2021-05-26 Alabama Montgomery Montgomery   100830
## 6 2021-05-26 Alabama     Walker     Jasper   102429
##                           college cases cases_2021 notes
## 1          Alabama A&M University    41         NA      
## 2        Alabama State University     2         NA      
## 3         Athens State University    45         10      
## 4               Auburn University  2742        567      
## 5 Auburn University at Montgomery   220         80      
## 6  Bevill State Community College     4         NA

summary(college_data)

##      date              state              county              city          
##  Length:1948        Length:1948        Length:1948        Length:1948       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    ipeds_id           college              cases          cases_2021    
##  Length:1948        Length:1948        Min.   :   0.0   Min.   :   0.0  
##  Class :character   Class :character   1st Qu.:  32.0   1st Qu.:  23.0  
##  Mode  :character   Mode  :character   Median : 114.5   Median :  65.0  
##                                        Mean   : 363.5   Mean   : 168.1  
##                                        3rd Qu.: 303.0   3rd Qu.: 159.0  
##                                        Max.   :9914.0   Max.   :3158.0  
##                                                         NA's   :337     
##     notes          
##  Length:1948       
##  Class :character  
##  Mode  :character  
##                    
##                    
##                    
##

Now lets take a look at a different dataset

iris <- as.data.frame(iris)
class(iris)

## [1] "data.frame"

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

Lets start by creating a scatter plot of the college data

ggplot(data = college_data, aes(x = cases, y = cases_2021)) +
  geom_point() +
  theme_minimal()

## Warning: Removed 337 rows containing missing values (`geom_point()`).

Now lets do the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
geom_point() +
theme_minimal()

Lets color coordinate our college data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length)) +
geom_point() +
theme_minimal()

Lets color coordinate the iris data

ggplot(data = iris, aes(x = Sepal.Width, y = Sepal.Length, color = Species)) +
geom_point() +
theme_minimal()

Lets run a simple histogram of our Louisiana Case Data

hist(Louisiana_cases$cases, freq = NULL, density = NULL, breaks = 10, xlab = "total cases", ylab = "frequency", 
main = "Total College Covid-19 Infection (Louisiana)")

Lets run a simple histogram for the Iris data

hist(iris$Sepal.Width,freq = NULL, density = NULL, breaks = 10, xlab = "sepal width", ylab = "frequency", main = "Iris Sepal Width")

histogram_college <- ggplot(data = Louisiana_cases, aes(x = cases),
histogram_college + geom_histogram(bindwidth = 100, color = "black", aes (fill = county))+
xlab("cases") + ylab("frequency") + ggtitle ("Histogram of Covid 19 cases in Louisiana"))

Lets create a ggplot for the IRIS data

’’‘(r) histogram_iris <- ggplot(data = iris, aes(x = sepal.width)) histogram_Iris + geom_histogram(bindinth = 0.2, color = “black”, aes(fill = species)) + xlab(“Sepal width”) + ylab(“Frequency”) + ggtitle(“Histogram of Iris Sepal Width by Species” )’’’ Maybe a density plot makes more sense for our college data

’’‘(r) ggplot(south_cases) + geom_density(aes(x = cases, fill = state), alpha = 0.25)’’’ Lets do it with the Iris data

’’‘(r) ggplot(iris)+ geom_density(aes(x = sepal.width, fill = species), alpha = 0.25)’’’

Lets do it with the Iris data

ggplot(iris)+
geom_density(aes(x = Sepal.Width, fill = Species), alpha = 0.25)

library(ggplot2)

ggplot(data = iris, aes(x = Species, y = Sepal.Length, fill = Species)) +
  geom_violin(alpha = 0.6) +  # Semi-transparent violins
  geom_jitter(width = 0.1, color = "black", alpha = 0.5) +  # Add jittered points
  theme_classic() +
  theme(legend.position = "none") +  # Remove legend
  labs(title = "Sepal Length Distribution by Species", 
       x = "Species", 
       y = "Sepal Length")

ggplot(data = South_cases, aes(x = state, y = cases, color = state) ) +
geom_violin()+
theme_gray() +
theme(legend.position = "none")

Now lets take a look at risidual plots. This is a graph that displays the residuals on the vertical axis, and the independent variable on the horizontal. In the event that the points in a residual plot are dispersed in a random manner around the horizontal axis, it is appropriate to use a linear regression. If they are not randomly dispersed, a non linear model is more appropriate.

Lets start with the iris data

ggplot(lm(Sepal.Length = Sepal.Width, data = iris)) +
geom_point(aes(x = .fitted, y = .resid))

## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
##  extra argument 'Sepal.Length' will be disregarded

Now look at the southern states cases

library(ggplot2)
library(broom)
library(dplyr)


if (!exists("South_cases")) {
  stop("Error: The dataset 'South_cases' is not found.")
}


South_cases <- South_cases %>%
  dplyr::mutate(across(where(is.character), as.factor))


South_cases <- South_cases %>%
  filter(!is.na(cases_2021) & !is.nan(cases_2021) & is.finite(cases_2021))


South_cases$cases_2021 <- as.numeric(South_cases$cases_2021)


if (nrow(South_cases) == 0) {
  stop("Error: No valid data available after removing invalid values.")
}

A linear model is not a good call for the state cases Now lets do some correlations

obesity_data <- data.frame(
  age = c(19, 18, 28, 33, 32, 31, 46, 37, 37, 60, 25, 62, 23, 56, 27, 19, 52, 23, 56, 30, 
          60, 30, 18, 34, 37, 59, 63, 55, 23, 31, 22, 18, 19, 63, 28, 19, 62, 26, 35, 60, 
          24, 31, 41, 37, 38),
  sex = c("female", "male", "male", "male", "male", "female", "female", "female", "male", 
          "female", "male", "female", "male", "female", "male", "male", "female", "male", 
          "male", "male", "female", "female", "male", "female", "male", "female", "female", 
          "female", "male", "male", "male", "female", "female", "male", "male", "female", 
          "male", "male", "female", "female", "female", "male", "female", "male", "male"),
  
  height = c( 3, 5, 4, 3, 2, 4, 7, 5, 4, 5, 6, 4, 5, 6, 4, 3, 5, 6, 5, 4, 3, 2, 4, 5, 3,
              2, 4, 5, 6, 4, 5, 6, 5, 4, 5, 6, 4, 3, 4, 6, 4, 5, 4, 3, 4))

charges = c( 59, 70, 39, 50, 40, 58, 58, 48, 39, 28, 49, 99, 84, 83, 48, 68, 84, 49, 69, 39, 60,
             49, 50, 38, 59, 50, 49, 84, 83, 82, 81, 49, 85, 96, 84, 73, 82, 38, 69, 84, 39, 99,
             87, 77, 87)

bmi = c( 25, 26, 27, 28, 29, 27, 29, 27, 26, 28, 29, 27, 28, 29, 28, 27, 28, 27, 26, 27, 28, 26,
         25, 27, 27, 28, 26, 27, 28, 29, 28, 27, 29, 30, 28, 26, 27, 28, 28, 29, 20, 29, 27, 29,
         30)



head(obesity_data)

##   age    sex height
## 1  19 female      3
## 2  18   male      5
## 3  28   male      4
## 4  33   male      3
## 5  32   male      2
## 6  31 female      4

library(tidyr)
library(plyr)

## ------------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## ------------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:ggpubr':
## 
##     mutate

## The following objects are masked from 'package:plotly':
## 
##     arrange, mutate, rename, summarise

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

library(dplyr)

Lets look at the structure of the dataset

str(obesity_data)

## 'data.frame':    45 obs. of  3 variables:
##  $ age   : num  19 18 28 33 32 31 46 37 37 60 ...
##  $ sex   : chr  "female" "male" "male" "male" ...
##  $ height: num  3 5 4 3 2 4 7 5 4 5 ...

class(obesity_data)

## [1] "data.frame"

summary(obesity_data)

##       age           sex                height     
##  Min.   :18.0   Length:45          Min.   :2.000  
##  1st Qu.:24.0   Class :character   1st Qu.:4.000  
##  Median :32.0   Mode  :character   Median :4.000  
##  Mean   :36.6                      Mean   :4.356  
##  3rd Qu.:52.0                      3rd Qu.:5.000  
##  Max.   :63.0                      Max.   :7.000

Now lets look at the distribution for insurance charges

hist(college_data$cases)

we can also get an idea of the distribution using a boxplot

Now lets look at correlations. the cor() command is used to determine correlations between two vectors, all of the columns of a data frame, or two data frames. The cov() command, on the otherhand examines the covariance. the cor.test( command carries out a test as to the significance of the correlation)

cor(college_data$cases, college_data$cases_2021)

## [1] NA

This test uses a spearman Rho correlation, or you can use Kendall’s tau by specifying it

cor(college_data$cases, college_data$cases_2021, method = 'kendall')

## [1] NA

This correlation measures strength of a correlation between -1 and 1.

Now lets look at the Tietjen=Moore test. This is used for univariate datasets. The algorithm depicts the detection fo the outliers in a univariate dataset.

TietjenMoore <- function(dataseries, k) {
  n <- length(dataseries)
  
 
  r <- abs(dataseries - mean(dataseries))
  
  
  df <- data.frame(dataseries, r)
  dfs <- df[order(df$r), ] 
 
  klarge <- (n - k + 1):n  
  subdataseries <- dfs$dataseries[-klarge]  
  
  ksub <- sum((subdataseries - mean(subdataseries))^2)  
  all <- sum((df$dataseries - mean(df$dataseries))^2)  
  
 
  return(ksub / all)
}

This function helps to compute the absolute resudialuls and sorts data according to the size of the residuals. Later, we will focus on the computation of sum of squares.

FindOutliersTietjenMooretest <- function(dataseries, k, alpha = 0.5) {
  
  ek <- TietjenMoore(dataseries, k)

 
  test <- numeric(10000)  
  
  for (i in 1:10000) {  
    simulated_data <- rnorm(length(dataseries)) 
    test[i] <- TietjenMoore(simulated_data, k)  
  }

  
  Talpha <- quantile(test, alpha)

  
  return(list(T = ek, Talpha = Talpha))
}

This function helps us to compute the critical values based on simulation data. Now lets demonstrate these functions with sample data and the obesity dataset for evaluating this algorithm.

THe critical region for the TIetjen-Moore test is determind by simulation. The simulation is perfomred by generating a standard normal random sample of size n and computing the Tietjen Moore test statistic. Typically, 10,000 ram samples are used. The values of the Tietjen-moore statistic obtained from the data is compared to this reference distribution. The values of the test statistic is between Zero and one. If there are no outliers in the data, the test statistic is close to 1. If there are outliers the test statistic will be closer to zero. Thus, the test is always a lower, one-tailed test regardless of which test statistic is used, Lk or EK.

First we will look at charges

boxplot(college_data$cases)

FindOutliersTietjenMooretest(college_data$cases, 4)

## $T
## [1] 0.7931992
## 
## $Talpha
##      50% 
## 0.977314

Lets check out bmi

boxplot(college_data$cases_2021)

FindOutliersTietjenMooretest(college_data$cases, 2)

## $T
## [1] 0.8789064
## 
## $Talpha
##       50% 
## 0.9876867

Probability Plots

library(ggplot2)
library(tigerstats)

## Loading required package: abd

## Loading required package: nlme

## 
## Attaching package: 'nlme'

## The following object is masked from 'package:dplyr':
## 
##     collapse

## Loading required package: lattice

## Loading required package: grid

## Loading required package: mosaic

## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2

## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.

## 
## Attaching package: 'mosaic'

## The following object is masked from 'package:Matrix':
## 
##     mean

## The following object is masked from 'package:plyr':
## 
##     count

## The following object is masked from 'package:scales':
## 
##     rescale

## The following object is masked from 'package:plotrix':
## 
##     rescale

## The following object is masked from 'package:plotly':
## 
##     do

## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally

## The following object is masked from 'package:ggplot2':
## 
##     stat

## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var

## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

## Welcome to tigerstats!
## To learn more about this package, consult its website:
##  http://homerhanumat.github.io/tigerstats

we will use the probability plot function and their output dnorm: density function of the normal distribution. using the density, it is possible to determine the probability of events. or for examples, you may wonder “what is the likelihood that a person has an IQ of exactly 1407 in this case, you would need to retrieve the density of the IQ distribution at values 140. The IQ distribution can be modeled wiht a mean of 100 and a standrad deviation of 15. the corresponding density is:

)) )

bmi.mean <- mean(college_data$cases)
bmi.sd <- sd(college_data$cases)

lets create a plot of our normal distribution

cases.dist <- dnorm(
  college_data$cases, 
  mean = mean(college_data$cases, na.rm = TRUE), 
  sd = sd(college_data$cases, na.rm = TRUE)
)


cases.df <- data.frame(
  "bmi" = college_data$cases, 
  "density" = cases.dist
)


ggplot(cases.df, aes(x = bmi, y = density)) +
  geom_point()

This gives us the probability of every single point occuring Now lets use the pnorm function for more info

cases.dist <- pnorm(
  college_data$cases, 
  mean = mean(college_data$cases, na.rm = TRUE), 
  sd = sd(college_data$cases, na.rm = TRUE)
)


cases.df <- data.frame(
  "bmi" = college_data$cases, 
  "Density" = cases.dist
)


ggplot(cases.df, aes(x = bmi, y = Density)) +
  geom_point()

What if we want to find the probability of the bmi being greater than 40 in our distribution?

pp_greater <- function(x) {
  paste(round(100 * pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE), 2), "%")
}

pp_greater(40)

## [1] "6.29 %"

what about the probability that a bmi is less than 40 in our population?

pp_less <- function(x) (
paste(round(100*(1*pnorm(x, mean = 30.66339, sd = 6.09818, lower.tail = FALSE)), 2), "%")
)

pp_less(40)

## [1] "6.29 %"

What if we want to find the area in between?

p_between <- pnorm(40, mean = 30.66339, sd = 6.09818) - pnorm(20, mean = 30.66339, sd = 6.09818)


print(p_between)

## [1] 0.8969428

library(ggplot2)

x_vals <- seq(10, 50, length.out = 300)  
y_vals <- dnorm(x_vals, mean = 30.66339, sd = 6.09818)  


df <- data.frame(x = x_vals, y = y_vals)


ggplot(df, aes(x, y)) +
  geom_line(color = "blue") +
  geom_area(data = subset(df, x >= 20 & x <= 40), aes(x, y), fill = "blue", alpha = 0.3) +
  ggtitle("Normal Distribution with Shaded Region (20 to 40)") +
  theme_minimal()

WHat if we want to know the quantities? Lets use the pnorm function. WE need to assume a normal distribution for this.

What bmi respresents the lowest 1% of the population?

qnorm(0.01, mean = 30.66339, sd = 6.09818, lower.tail = TRUE)

## [1] 16.4769

What if you wnat a random sampling of values within your distribution?

subset <- rnorm(50, mean = 30.66339, sd = 6.09818)
hist(subset)

subset2 <- rnorm(5000, mean = 30.66339, sd = 6.09818)
 
 hist(subset2)

Shapiro-wilk Test

So now we know how to generate a normal distribution, how do we tell if our samples came from a normal distribution

?shapiro.test

shapiro.test(college_data$cases[1:5])

## 
##  Shapiro-Wilk normality test
## 
## data:  college_data$cases[1:5]
## W = 0.60868, p-value = 0.0008162

You can see here, with a small sample size, we would reject the ull hypothesis that the samples came from a normal distribution. we can increase the power of the test by increasing the sample size.

shapiro.test(college_data$cases[1:1000])

## 
##  Shapiro-Wilk normality test
## 
## data:  college_data$cases[1:1000]
## W = 0.42554, p-value < 2.2e-16

Now lets check out age

shapiro.test(college_data$cases[1:1001])

## 
##  Shapiro-Wilk normality test
## 
## data:  college_data$cases[1:1001]
## W = 0.42538, p-value < 2.2e-16

And lastly bmi

shapiro.test(college_data$cases[1:1002])

## 
##  Shapiro-Wilk normality test
## 
## data:  college_data$cases[1:1002]
## W = 0.4252, p-value < 2.2e-16

TIme series data

First lets load our packages

library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

library(readxl)

Date - date of measurements time - time of measurement CO(GT) - average hourly c02 PT08, s1(C0) - tin oxide hourly average sensor response NMHC - average hourly non-metallix hydrocarbon concentration C6HC - average benzene concentration PT08.53(NMHC) - titania average hourly sensor response NQX - Average hourly NO2 concentration N02 - Average hourly N02 concentration T - temper RJH - relative humidity AH - Absolute humidity

str(college_data)

## 'data.frame':    1948 obs. of  9 variables:
##  $ date      : chr  "2021-05-26" "2021-05-26" "2021-05-26" "2021-05-26" ...
##  $ state     : chr  "Alabama" "Alabama" "Alabama" "Alabama" ...
##  $ county    : chr  "Madison" "Montgomery" "Limestone" "Lee" ...
##  $ city      : chr  "Huntsville" "Montgomery" "Athens" "Auburn" ...
##  $ ipeds_id  : chr  "100654" "100724" "100812" "100858" ...
##  $ college   : chr  "Alabama A&M University" "Alabama State University" "Athens State University" "Auburn University" ...
##  $ cases     : int  41 2 45 2742 220 4 263 137 49 76 ...
##  $ cases_2021: int  NA NA 10 567 80 NA 49 53 10 35 ...
##  $ notes     : chr  "" "" "" "" ...

library(tidyr)
library(dplyr)
library(lubridate)
library(hms)

## 
## Attaching package: 'hms'

## The following object is masked from 'package:lubridate':
## 
##     hms

library(ggplot2)

Lets det rid of the date in the time column

college_data$cases <- as_hms(college_data$cases)
glimpse(college_data)

## Rows: 1,948
## Columns: 9
## $ date       <chr> "2021-05-26", "2021-05-26", "2021-05-26", "2021-05-26", "20…
## $ state      <chr> "Alabama", "Alabama", "Alabama", "Alabama", "Alabama", "Ala…
## $ county     <chr> "Madison", "Montgomery", "Limestone", "Lee", "Montgomery", …
## $ city       <chr> "Huntsville", "Montgomery", "Athens", "Auburn", "Montgomery…
## $ ipeds_id   <chr> "100654", "100724", "100812", "100858", "100830", "102429",…
## $ college    <chr> "Alabama A&M University", "Alabama State University", "Athe…
## $ cases      <time> 00:00:41, 00:00:02, 00:00:45, 00:45:42, 00:03:40, 00:00:04…
## $ cases_2021 <int> NA, NA, 10, 567, 80, NA, 49, 53, 10, 35, 5, NA, 10, NA, 19,…
## $ notes      <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",…

’’’ Notice we have an outlier in our data

First we’ll look at the unrest_token funciton

Lets start by looking at an Emily Dickenson passage

text <- c("Because I could not stop from Death-", 
"He kindly stopped for me -",
"The Carriage held but just ourselves =",
"and Immortality")
text

## [1] "Because I could not stop from Death-"  
## [2] "He kindly stopped for me -"            
## [3] "The Carriage held but just ourselves ="
## [4] "and Immortality"

This is a typical character vector that we might want to analyze. In order to turn it into a tidytext dataset, we first need to put it into a dataframe.

library(dplyr)
text_df <- tibble(line = 1:4, text = text)

text_df

## # A tibble: 4 × 2
##    line text                                  
##   <int> <chr>                                 
## 1     1 Because I could not stop from Death-  
## 2     2 He kindly stopped for me -            
## 3     3 The Carriage held but just ourselves =
## 4     4 and Immortality

Reminder: A tibble is a modern class of data fram within R. It is available in the dplyr and tibble packages, that has a convenient print method, will not convert strongs to factors, and does not use row names. Tibbles are great for use with tidy tools.

Next we will use the ‘unrest_tokens’ function.

First we have the output column name that will be created as the text is unested into it

library(tidytext)

text_df %>%
  unnest_tokens(word, text)

## # A tibble: 20 × 2
##     line word       
##    <int> <chr>      
##  1     1 because    
##  2     1 i          
##  3     1 could      
##  4     1 not        
##  5     1 stop       
##  6     1 from       
##  7     1 death      
##  8     2 he         
##  9     2 kindly     
## 10     2 stopped    
## 11     2 for        
## 12     2 me         
## 13     3 the        
## 14     3 carriage   
## 15     3 held       
## 16     3 but        
## 17     3 just       
## 18     3 ourselves  
## 19     4 and        
## 20     4 immortality

Lets use the JaneAusten R package to analyze some Jane Austen texts. There are 6 books in this package.

library(janeaustenr)
library(dplyr)
library(stringr)

original_books <- austen_books() %>%
  group_by(book) %>%
  dplyr::mutate(linenumber = row_number(),
         chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]",
                                                 ignore_case = TRUE)))) %>%
  ungroup()
original_books

## # A tibble: 73,422 × 4
##    text                    book                linenumber chapter
##    <chr>                   <fct>                    <int>   <int>
##  1 "SENSE AND SENSIBILITY" Sense & Sensibility          1       0
##  2 ""                      Sense & Sensibility          2       0
##  3 "by Jane Austen"        Sense & Sensibility          3       0
##  4 ""                      Sense & Sensibility          4       0
##  5 "(1811)"                Sense & Sensibility          5       0
##  6 ""                      Sense & Sensibility          6       0
##  7 ""                      Sense & Sensibility          7       0
##  8 ""                      Sense & Sensibility          8       0
##  9 ""                      Sense & Sensibility          9       0
## 10 "CHAPTER 1"             Sense & Sensibility         10       1
## # ℹ 73,412 more rows

To work with this as a tidy dataset, we need to restructure it in the one-token-per-row format, which as we saw earlier is done with the unrest_tokens() function

library(tidytext)
tidy_books <- original_books %>%
  unnest_tokens(word, text)
tidy_books

## # A tibble: 725,055 × 4
##    book                linenumber chapter word       
##    <fct>                    <int>   <int> <chr>      
##  1 Sense & Sensibility          1       0 sense      
##  2 Sense & Sensibility          1       0 and        
##  3 Sense & Sensibility          1       0 sensibility
##  4 Sense & Sensibility          3       0 by         
##  5 Sense & Sensibility          3       0 jane       
##  6 Sense & Sensibility          3       0 austen     
##  7 Sense & Sensibility          5       0 1811       
##  8 Sense & Sensibility         10       1 chapter    
##  9 Sense & Sensibility         10       1 1          
## 10 Sense & Sensibility         13       1 the        
## # ℹ 725,045 more rows

This function uses the tokenizers package to separate each line of text in the original dataframe into tokens. The default tokenizing is for words, but other options including character, n-grass, sentences, lines, or paragraphs can be used. Now that the data is in a one-word-per-row format, we can manipulate it with tools like dplyr. Often in text analysis, we will want to remove stop words, stop words are words that are NOT USEFUL for an analysis. These include words like the, of, to, and, and so forth. We can remove words (kept in the tidytext dataset ‘stop_words’) with an anti_join().

data("stop_words")

tidy_books <- tidy_books %>%
  anti_join(stop_words)

## Joining with `by = join_by(word)`

The stop words dataset in the tidyset package contains stop words from three lexicons. We can use them all together, as we have here, or filter() to only use one set of stop words if thats more appropriate for your analysis.

tidy_books %>%
  count(word, sort = TRUE)

## # A tibble: 13,914 × 2
##    word       n
##    <chr>  <int>
##  1 miss    1855
##  2 time    1337
##  3 fanny    862
##  4 dear     822
##  5 lady     817
##  6 sir      806
##  7 day      797
##  8 emma     787
##  9 sister   727
## 10 house    699
## # ℹ 13,904 more rows

Because we’ve been using tidy tools, our word counts are stored in a tidy data frame. This allows us to pipe this directly into ggplot 2. For example, we can create a visualizaton of the most common words.

library(ggplot2)
tidy_books %>%
  count(word, sort = TRUE) %>%
  filter(n > 600) %>%
  dplyr::mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(y = NULL, x = "word count")

The gutenburgR package This package provides access to the public domain works from the gutenberg project (www.gutenburg.org). This package includes tools for both downloading books and a complete dataset of project gutenburg metadata that can be used to find works of interest. We will mostly use the function gutenburg_download().

word frequencies

Lets look at some biology texts, starting with Darwin

The voyage of the Beagle =944 On the origin of Species by means of natural selection = 1228 The expression of emotions in man and animals = 1227 The decent of man, and selection in relation to sex = 2300

We can access these worlds using the gutenberg_download() and the Project Gutenberg IDnumbers.

library(gutenbergr)

darwin <- gutenberg_download(944)

## Determining mirror for Project Gutenberg from https://www.gutenberg.org/robot/harvest

## Using mirror http://aleph.gutenberg.org

library(gutenbergr)

darwin <- gutenberg_download(2300)

Lets break these into tokens

tidy_darwin <- darwin %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words)

## Joining with `by = join_by(word)`

Lets check out what the most common darwin words are

tidy_darwin %>%
  count(word, sort =TRUE)

## # A tibble: 14,123 × 2
##    word          n
##    <chr>     <int>
##  1 male       1593
##  2 males      1274
##  3 female     1148
##  4 species    1097
##  5 sexes      1046
##  6 females     987
##  7 birds       936
##  8 sexual      745
##  9 animals     669
## 10 selection   621
## # ℹ 14,113 more rows

Now lets get some work from Thomas Hunt Morgan, who is credited with discovering chromosomes.

Regeneration = 57198 Thee genetic and operative evidence relating to secondary sexual characteristics = 57460 Evolution and Adaptation = 63540

gutenberg_metadata %>%
  filter(title == "Regeneration")

## # A tibble: 2 × 8
##   gutenberg_id title     author gutenberg_author_id language gutenberg_bookshelf
##          <int> <chr>     <chr>                <int> <chr>    <chr>              
## 1        19964 Regenera… Dye, …               45153 en       "Science Fiction"  
## 2        57198 Regenera… Morga…               34763 en       ""                 
## # ℹ 2 more variables: rights <chr>, has_text <lgl>

morgan <- gutenberg_download(57198)

morgan <- gutenberg_download(57460)

Lets tokenize

``{r} tidy_morgan <- morgan %>% unnest_tokens(word, text) %>% anti_join(stop_words)

What are the most common words?

``{r}
tidy_morgan %>%
  count(word, sort = TRUE)

Lastly lets look at Thomas Henry Haxley

Evidence as to mans place in nature - 2911

On the reception of the origin of Species - 2089

Evolution and Ethics, and other Essays- 2940

Science and Culture, and other essays = 52344

{r} huxley <- gutenberg_download(2931) ```{r} tidy_huxley <- huxley %>% unnest_tokens(word, text) %>% anti_join(stop_words)

``{r}
tidy_huxley %>%
  count(word, sort = TRUE)

Now, lets calculate the frequency for each word for the works of Darwin, Morgan and Haxley by binding the frames together

Now we need to change the table so that each author has its own row

Now lets plot

``{r} library(ggplot2) library(scales)

ggplot(frequency2, aes(x = Charles Darwin, y = Thomas Hunt Morgan, color = abs(Charles Darwin - Thomas Hunt Morgan))) + geom_abline(color = “gray40”, lty = 2) + geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) + geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) + scale_x_log10(labels = percent_format()) + scale_y_log10(labels = percent_format()) + scale_color_gradient(limits = c(0, 0.001), low = “darkslategray4”, high = “gray75”) + theme(legend.position = “none”) + labs(y = “Thomas Hunt Morgan”, x = “Charles Darwin”)

``{r}
  colnames(frequency2)

The sentiments datasets

There are a variety of methods and dictionaries that exists for evaluating the opinion or emotion of the text.

AFINN BING NRC

Bing categorizes words in a binary fashion into positive or negative NRC categorizes into positive, negative, anger, anticipation, disgust, fear, joy sadness, suprise, and trust AFFIN assigns a score between -5 and 5 with negative indicating negative sentiment, and 5 positive.

The function get_sentiments() allows us to get the specific sentiments lexicon with the measures for each one

library(tidytext)
install.packages("textdata")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

afinn <- get_sentiments("afinn")
afinn

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

Lets look at bing

bing <- get_sentiments("bing")

bing

## # A tibble: 6,786 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faces     negative 
##  2 abnormal    negative 
##  3 abolish     negative 
##  4 abominable  negative 
##  5 abominably  negative 
##  6 abominate   negative 
##  7 abomination negative 
##  8 abort       negative 
##  9 aborted     negative 
## 10 aborts      negative 
## # ℹ 6,776 more rows

nrc <- get_sentiments("nrc")

nrc

## # A tibble: 13,872 × 2
##    word        sentiment
##    <chr>       <chr>    
##  1 abacus      trust    
##  2 abandon     fear     
##  3 abandon     negative 
##  4 abandon     sadness  
##  5 abandoned   anger    
##  6 abandoned   fear     
##  7 abandoned   negative 
##  8 abandoned   sadness  
##  9 abandonment anger    
## 10 abandonment fear     
## # ℹ 13,862 more rows

These libraries were created wither using crowdsourcing or cloud computing/ai like amazon Mechanical Turk, or by labor of one of the authors, and then validated with crowdsourcing.

Lets look at the words with a joy score from nrc

library(gutenbergr)
library(dplyr)
library(stringr)

darwin <- gutenberg_download(944)
tidy_books <- darwin %>%
  group_by(gutenberg_id) %>%
  dplyr::mutate(linenumber = row_number(), chapter = cumsum(str_detect(text, regex("^chapter [\\divxlc]", ignore_case = TRUE)))) %>%
  ungroup() %>%
  unnest_tokens(word, text)
tidy_books

## # A tibble: 208,118 × 4
##    gutenberg_id linenumber chapter word   
##           <int>      <int>   <int> <chr>  
##  1          944          1       0 the    
##  2          944          1       0 voyage 
##  3          944          1       0 of     
##  4          944          1       0 the    
##  5          944          1       0 beagle 
##  6          944          1       0 by     
##  7          944          2       0 charles
##  8          944          2       0 darwin 
##  9          944          8       0 about  
## 10          944          8       0 the    
## # ℹ 208,108 more rows

Lets add the book name instead of GID

colnames(tidy_books)[1] <- "book"

tidy_books$book[tidy_books$book == 944] <- "The Voyage of the Beagle"
tidy_books

## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows

Now that we have a tidy format with one word per row, we are ready for sentiment analysis, Fist lets use NRC.

nrc_joy <- get_sentiments("nrc") %>%
  filter(sentiment == "joy")

tidy_books %>%
 filter(book == "The Voyage of the Beagle") %>%
  inner_join(nrc_joy) %>%
  count(word, sort = TRUE)

## Joining with `by = join_by(word)`

## # A tibble: 277 × 2
##    word           n
##    <chr>      <int>
##  1 found        301
##  2 good         161
##  3 remarkable   114
##  4 green         95
##  5 kind          92
##  6 tree          86
##  7 present       85
##  8 food          78
##  9 beautiful     61
## 10 elevation     60
## # ℹ 267 more rows

We can also examine how sentiment changes throughout a work.

library(tidyr)
library(dplyr)  

Charles_Darwin_sentiment <- tidy_books %>%
  inner_join(get_sentiments('bing')) %>%
  count(book, index = linenumber %/% 80, sentiment) %>% 
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%  
  dplyr::mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`

Now lets plot it

library(ggplot2)
ggplot(Charles_Darwin_sentiment, aes(index, sentiment, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free_x")

colnames(Charles_Darwin_sentiment)

## [1] "book"      "index"     "negative"  "positive"  "sentiment"

Lets compare the three sentiment dictions

There are seveal options for sentiment lexicons you might want some more inof on which is appropriate for your purpose. Here we will use all three of our dictionaries and examine how the sentiment changes across the arc of tvob

library(tidyr)
voyage <- tidy_books %>%
  filter(book == 'The Voyage of the Beagle')
voyage

## # A tibble: 208,118 × 4
##    book                     linenumber chapter word   
##    <chr>                         <int>   <int> <chr>  
##  1 The Voyage of the Beagle          1       0 the    
##  2 The Voyage of the Beagle          1       0 voyage 
##  3 The Voyage of the Beagle          1       0 of     
##  4 The Voyage of the Beagle          1       0 the    
##  5 The Voyage of the Beagle          1       0 beagle 
##  6 The Voyage of the Beagle          1       0 by     
##  7 The Voyage of the Beagle          2       0 charles
##  8 The Voyage of the Beagle          2       0 darwin 
##  9 The Voyage of the Beagle          8       0 about  
## 10 The Voyage of the Beagle          8       0 the    
## # ℹ 208,108 more rows

Lets again use interger division (%/%) to define larger sections of the text that span multiple lines, and we can use the same patterns with count(), pivot_wider(), to find the net sentiment in each of these section of text.

affin <- voyage %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(index = linenumber %/% 80) %>%
  dplyr::summarise(sentiment = sum(value)) %>%
  dplyr::mutate(method = "AFINN")

## Joining with `by = join_by(word)`

bing_and_nrc <- bind_rows(
  voyage %>%
    inner_join(get_sentiments("bing")) %>%
    dplyr::mutate(method = "Bing et al."),
  voyage %>%
    inner_join(get_sentiments("nrc") %>%
                 filter(sentiment %in% c("positive", "negative"))
               ) %>%
    dplyr::mutate(method = "NRC")) %>%
  count(method, index = linenumber %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment,
                values_from = n,
                values_fill = 0) %>%
  dplyr::mutate(sentiment = positive - negative)

## Joining with `by = join_by(word)`
## Joining with `by = join_by(word)`

## Warning in inner_join(., get_sentiments("nrc") %>% filter(sentiment %in% : Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1154 of `x` matches multiple rows in `y`.
## ℹ Row 4245 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

We can now estimate the new sentiment (positive - negative) in each chunk of the novel text for each lexicon (dictionary). Lets bind those all together and visualize with ggplot.

bind_rows(affin, bing_and_nrc) %>%
  ggplot(aes(index, sentiment, fill = method)) +
  geom_col(show.legned = FALSE) +
  facet_wrap(~method, ncol = 1, scales = "free_y")

## Warning in geom_col(show.legned = FALSE): Ignoring unknown parameters:
## `show.legned`

Lets look at the counts based on each dictionary

get_sentiments('nrc') %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   3316
## 2 positive   2308

get_sentiments('bing') %>%
  count(sentiment)

## # A tibble: 2 × 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

bing_word_counts <- tidy_books %>%
  inner_join(get_sentiments('bing')) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup ()

## Joining with `by = join_by(word)`

bing_word_counts

## # A tibble: 1,615 × 3
##    word       sentiment     n
##    <chr>      <chr>     <int>
##  1 great      positive    519
##  2 like       positive    366
##  3 well       positive    230
##  4 good       positive    161
##  5 wild       negative    118
##  6 remarkable positive    114
##  7 fine       positive    109
##  8 scarcely   negative     96
##  9 doubt      negative     80
## 10 broken     negative     74
## # ℹ 1,605 more rows

This can be shown visually, and we can pipe straight into ggplot2

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  dplyr::mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scale = "free_y") +
  labs(x = 'contribution to sentiment', y = NULL)

Lets spot an anomoly in the dataset.

custom_stop_words <- bind_rows(tibble(words = c("wild", "dark", "great", "like"), lexicon = c("custon")), stop_words)
custom_stop_words

## # A tibble: 1,153 × 3
##    words lexicon word     
##    <chr> <chr>   <chr>    
##  1 wild  custon  <NA>     
##  2 dark  custon  <NA>     
##  3 great custon  <NA>     
##  4 like  custon  <NA>     
##  5 <NA>  SMART   a        
##  6 <NA>  SMART   a's      
##  7 <NA>  SMART   able     
##  8 <NA>  SMART   about    
##  9 <NA>  SMART   above    
## 10 <NA>  SMART   according
## # ℹ 1,143 more rows

Word clouds we can see that tidy text mining and sentiment analysis workds well with ggplot2, but having our data in tidy format leads to other nice graphing techniques

lets use the wordcloud package!!

library(wordcloud)

## 
## Attaching package: 'wordcloud'

## The following object is masked from 'package:gplots':
## 
##     textplot

tidy_books %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 100))

## Joining with `by = join_by(word)`

## Warning in wordcloud(word, n, max.words = 100): country could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): feet could not be fit on page.
## It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): america could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): inhabitants could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): time could not be fit on page.
## It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): water could not be fit on page.
## It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): scarcely could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): numerous could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): round could not be fit on page.
## It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): people could not be fit on
## page. It will not be plotted.

## Warning in wordcloud(word, n, max.words = 100): chile could not be fit on page.
## It will not be plotted.

Lets also look at comparison.clouds(), which may require turning the datafrome into a matrix.

We can change to matrix using the acast() function

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tigerstats':
## 
##     tips

## The following object is masked from 'package:tidyr':
## 
##     smiths

tidy_books %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("gray20", "gray80"), max.words = 100)

## Joining with `by = join_by(word)`

Looking at units beyond words

Lots of useful work can be done by tokenizing at the word level, but sometimes its nice to look at different units in text. FOr examle, we can look beyond just unigrams

Ex. I am not having a good day ``{r} bingnegative <- get_sentiments(“bing”) %>% filter(sentiment == “negative”) wordcounts <- tidy_books %>% group_by(book, chapter) %>% summarize(words = n()) tidy_books %>% semi_join(bingnegative) %>% group_by(book, chapter) %>% summarize(negativewords = n()) %>% left_join(wordcounts, by = c(“book”, “chapter”)) %>% dplyr::mutate(ratio = negativewords/words) %>% filter(chapter > 0) %>% slice_max(ratio, n = 1) %>% ungroup()

  
So far we have only looked at single words, but many interseting (more accurate) analyses are based on the relationship between words.

Lets look at some methods of tidytext for calculating and visualizing word relationships

```r
library(dplyr)
library(tidytext)
library(gutenbergr)


darwin_books <- gutenberg_download(c(944))


darwin_books <- darwin_books %>%
  dplyr::mutate(book = ifelse(gutenberg_id == 944, "The Voyage of the Beagle", NA))


darwin_bigrams <- darwin_books %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)


darwin_bigrams

## # A tibble: 191,790 × 3
##    gutenberg_id book                     bigram        
##           <int> <chr>                    <chr>         
##  1          944 The Voyage of the Beagle the voyage    
##  2          944 The Voyage of the Beagle voyage of     
##  3          944 The Voyage of the Beagle of the        
##  4          944 The Voyage of the Beagle the beagle    
##  5          944 The Voyage of the Beagle beagle by     
##  6          944 The Voyage of the Beagle charles darwin
##  7          944 The Voyage of the Beagle <NA>          
##  8          944 The Voyage of the Beagle <NA>          
##  9          944 The Voyage of the Beagle <NA>          
## 10          944 The Voyage of the Beagle <NA>          
## # ℹ 191,780 more rows

This data is still in tidytext format, and is structured as one-token-per-row. Each token is a bigram. Counting and filtering n-grams

darwin_bigrams %>%
  count(bigram, sort = TRUE)

## # A tibble: 89,022 × 2
##    bigram       n
##    <chr>    <int>
##  1 of the    2787
##  2 <NA>      1378
##  3 in the    1244
##  4 on the     826
##  5 to the     780
##  6 of a       570
##  7 from the   558
##  8 it is      540
##  9 and the    470
## 10 by the     435
## # ℹ 89,012 more rows

Most of the common bigrams are stop-words. This can be a good time to use tidyr’s seperate command which splits a column into multiople based on a delimiter. THis will let us make a column for word one and word two.

library(tidyr)

bigrams_separated <- darwin_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
  filter(word1 %in% stop_words$word) %>%
  filter(word2 %in% stop_words$word)

bigrams_filtered

## # A tibble: 64,961 × 4
##    gutenberg_id book                     word1 word2
##           <int> <chr>                    <chr> <chr>
##  1          944 The Voyage of the Beagle of    the  
##  2          944 The Voyage of the Beagle about the  
##  3          944 The Voyage of the Beagle at    the  
##  4          944 The Voyage of the Beagle the   end  
##  5          944 The Voyage of the Beagle end   of   
##  6          944 The Voyage of the Beagle of    each 
##  7          944 The Voyage of the Beagle of    the  
##  8          944 The Voyage of the Beagle i     have 
##  9          944 The Voyage of the Beagle in    the  
## 10          944 The Voyage of the Beagle to    the  
## # ℹ 64,951 more rows

New bigram counts

bigram_counts <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")
bigram_counts

## # A tibble: 64,961 × 3
##    gutenberg_id book                     bigram   
##           <int> <chr>                    <chr>    
##  1          944 The Voyage of the Beagle of the   
##  2          944 The Voyage of the Beagle about the
##  3          944 The Voyage of the Beagle at the   
##  4          944 The Voyage of the Beagle the end  
##  5          944 The Voyage of the Beagle end of   
##  6          944 The Voyage of the Beagle of each  
##  7          944 The Voyage of the Beagle of the   
##  8          944 The Voyage of the Beagle i have   
##  9          944 The Voyage of the Beagle in the   
## 10          944 The Voyage of the Beagle to the   
## # ℹ 64,951 more rows

We may also be interested in trigrams, which are three word combos.

trigrams <- darwin_books %>%
  unnest_tokens(trigrams, text, token = "ngrams", n = 3) %>%
  separate(trigrams, c("word1", "word2", "word3"), sep = " ") %>%
  filter(word1 %in% stop_words$word,
         word2 %in% stop_words$word,
         word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

trigrams

## # A tibble: 19,175 × 4
##    word1 word2  word3     n
##    <chr> <chr>  <chr> <int>
##  1 one   of     the      99
##  2 part  of     the      94
##  3 some  of     the      78
##  4 parts of     the      63
##  5 it    is     a        59
##  6 in    the    same     52
##  7 must  have   been     47
##  8 that  of     the      47
##  9 the   number of       45
## 10 there is     a        44
## # ℹ 19,165 more rows

Lets analyze some bigrams

bigrams_filtered %>%
  filter(word1 == "selection") %>%
  count(book, word1, sort = TRUE)

## # A tibble: 0 × 3
## # ℹ 3 variables: book <chr>, word1 <chr>, n <int>

Lets again look at tf-idf across bigrams across Darwins works.

bigram_tf_idf <- bigram_counts %>%
  count(book, bigram) %>%
  bind_tf_idf(bigram, book, n) %>%
  arrange(desc(tf_idf))
bigram_tf_idf

## # A tibble: 12,482 × 6
##    book                     bigram              n        tf   idf tf_idf
##    <chr>                    <chr>           <int>     <dbl> <dbl>  <dbl>
##  1 The Voyage of the Beagle a better            6 0.0000924     0      0
##  2 The Voyage of the Beagle a brief             1 0.0000154     0      0
##  3 The Voyage of the Beagle a case              6 0.0000924     0      0
##  4 The Voyage of the Beagle a cause             3 0.0000462     0      0
##  5 The Voyage of the Beagle a certain          26 0.000400      0      0
##  6 The Voyage of the Beagle a clear             6 0.0000924     0      0
##  7 The Voyage of the Beagle a corresponding     2 0.0000308     0      0
##  8 The Voyage of the Beagle a course            1 0.0000154     0      0
##  9 The Voyage of the Beagle a different        13 0.000200      0      0
## 10 The Voyage of the Beagle a fact              6 0.0000924     0      0
## # ℹ 12,472 more rows

library(dplyr)
library(ggplot2)

bigram_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 10) %>%
  ungroup() %>%
  arrange(desc(tf_idf)) %>%  
  dplyr::mutate(bigram = reorder(bigram, tf_idf),
         book = as.factor(book)) %>% 
  ggplot(aes(tf_idf, bigram, fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf~idf of bigrams", y = NULL)

Using bigrams to provide context in sentiment analysis

bigrams_separated %>%
  filter(word1 == "not") %>%
  count(word1, word2, sort = TRUE)

## # A tibble: 350 × 3
##    word1 word2      n
##    <chr> <chr>  <int>
##  1 not   a         35
##  2 not   have      25
##  3 not   appear    21
##  4 not   be        21
##  5 not   to        21
##  6 not   very      20
##  7 not   know      18
##  8 not   so        18
##  9 not   at        17
## 10 not   one       17
## # ℹ 340 more rows

By doing sentiment analysis on bigrams, we can examine how often sentiment~associated words are preceded by a modifier like “not” or other negating words.

install.packages("textdata")

## Installing package into '/home/student/R/x86_64-pc-linux-gnu-library/4.3'
## (as 'lib' is unspecified)

AFINN <- get_sentiments("afinn")
AFINN

## # A tibble: 2,477 × 2
##    word       value
##    <chr>      <dbl>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # ℹ 2,467 more rows

WE can examine the most frequent words that were preceded by “not”, and associate with sentiment.

not_words <- bigrams_separated %>%
  filter(word1 == "not") %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word2, value, sort = TRUE)
not_words

## # A tibble: 46 × 3
##    word2     value     n
##    <chr>     <dbl> <int>
##  1 like          2     9
##  2 reach         1     5
##  3 doubt        -1     4
##  4 beautiful     3     2
##  5 difficult    -1     2
##  6 easy          1     2
##  7 great         3     2
##  8 help          2     2
##  9 advanced      1     1
## 10 affected     -1     1
## # ℹ 36 more rows

Lets visualize

library(ggplot2)
library(dplyr)

not_words %>%
  dplyr::mutate(contribution = n + value) %>%
  dplyr::mutate(word2 = reorder(word2, contribution)) %>%  
  arrange(desc(abs(contribution))) %>%
  head(20) %>%
  ggplot(aes(contribution, word2, fill = contribution > 0)) +
  geom_col(show.legend = FALSE) +
  labs(x = "Sentiment value + number of occurrences", y = "Words preceded by 'not'")

negation_words <- c("not", "no", "never", "non", "without")
negated_words <- bigrams_separated %>%
  filter(word1 %in% negation_words) %>%
  inner_join(AFINN, by = c(word2 = "word")) %>%
  count(word1, word2, value, sort = TRUE)
negated_words

## # A tibble: 86 × 4
##    word1   word2   value     n
##    <chr>   <chr>   <dbl> <int>
##  1 no      doubt      -1    38
##  2 no      great       3    10
##  3 not     like        2     9
##  4 without doubt      -1     7
##  5 not     reach       1     5
##  6 not     doubt      -1     4
##  7 never   failing    -2     3
##  8 never   fail       -2     2
##  9 never   failed     -2     2
## 10 never   forget     -1     2
## # ℹ 76 more rows

Lets visualize the negation words

negated_words %>%
  dplyr::mutate(contribution = n + value,
         word2 = reorder(paste(word2, word1, sep = "_"), contribution)) %>%
  group_by(word1) %>%
  slice_max(abs(contribution), n = 12, with_ties = FALSE) %>%
  ggplot(aes(word2, contribution, fill = n + value > 0)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~word1, scales = "free") +
  scale_x_discrete(labels = function(x) gsub("_.+$", "", x)) +
  xlab("words preceded by negation term") +
  ylab("sentiment value + # of occurences") +
  coord_flip()

visualize a network of bigrams with graph

library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:mosaic':
## 
##     compare

## The following objects are masked from 'package:lubridate':
## 
##     %--%, union

## The following object is masked from 'package:class':
## 
##     knn

## The following object is masked from 'package:plotly':
## 
##     groups

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigram_graph <- bigram_counts %>%
  filter(n>20) %>%
  graph_from_data_frame()
bigram_graph

## IGRAPH 0879a9d DN-- 186 489 -- 
## + attr: name (v/c), n (e/n)
## + edges from 0879a9d (vertex names):
##  [1] of    ->the   in    ->the   on    ->the   to    ->the   of    ->a    
##  [6] from  ->the   it    ->is    and   ->the   by    ->the   at    ->the  
## [11] that  ->the   in    ->a     with  ->the   have  ->been  it    ->was  
## [16] the   ->same  for   ->the   as    ->the   one   ->of    to    ->be   
## [21] a     ->few   is    ->a     with  ->a     i     ->was   by    ->a    
## [26] the   ->whole of    ->these of    ->this  i     ->have  part  ->of   
## [31] to    ->a     they  ->are   and   ->a     in    ->this  had   ->been 
## [36] during->the   there ->is    the   ->most  a     ->very  the   ->other
## + ... omitted several edges

library(ggraph)
set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link() +
  geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

WE can also add diretionality to this network

set.seed(1234)
a <- grid::arrow(type = "closed", length = unit(0.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE, arrow = a, end_cap = circle(0.7, 'inches')) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label=name), vjust = 1, hjust = 1) +
  theme_void()

A central question in text mining is how to quantify what a document is about. we can do this but looking at words that make up the document, and measuing term frequency.

There are a lot of words that may not be important, these are the stop words.

One way to remedy this is to look at incerse document frequency words, which decreases the weight for commonly used worsd and increases the weight for words that are not used very much.

Term frequency in Darwins works

library(dplyr)
library(tidytext)
library(gutenbergr)

book_words <- gutenberg_download(c(944))
colnames(book_words)[1] <- "book"
book_words$book[book_words$book ==944] <- "The Voyage of the Beagle"

Now lets disect

book_words <- book_words %>%
  unnest_tokens(word, text) %>%
  count(book, word, sort = TRUE)

book_words

## # A tibble: 12,551 × 3
##    book                     word      n
##    <chr>                    <chr> <int>
##  1 The Voyage of the Beagle the   16930
##  2 The Voyage of the Beagle of     9438
##  3 The Voyage of the Beagle and    5768
##  4 The Voyage of the Beagle a      5328
##  5 The Voyage of the Beagle in     4294
##  6 The Voyage of the Beagle to     4093
##  7 The Voyage of the Beagle is     2414
##  8 The Voyage of the Beagle it     1998
##  9 The Voyage of the Beagle that   1939
## 10 The Voyage of the Beagle on     1869
## # ℹ 12,541 more rows

book_words$n <- as.numeric(book_words$n)
total_words <- book_words %>%
  group_by(book) %>%
  summarize(total = sum(n))
book_words

## # A tibble: 12,551 × 3
##    book                     word      n
##    <chr>                    <chr> <dbl>
##  1 The Voyage of the Beagle the   16930
##  2 The Voyage of the Beagle of     9438
##  3 The Voyage of the Beagle and    5768
##  4 The Voyage of the Beagle a      5328
##  5 The Voyage of the Beagle in     4294
##  6 The Voyage of the Beagle to     4093
##  7 The Voyage of the Beagle is     2414
##  8 The Voyage of the Beagle it     1998
##  9 The Voyage of the Beagle that   1939
## 10 The Voyage of the Beagle on     1869
## # ℹ 12,541 more rows

``{r} book_words <- left_join(book_words, total_words)

book_words


You can see that the usual suspexts are the most common words, but dont tell us anything about what teh books topic is.

``{r}
library(ggplot2)

ggplot(book_words, aes(n/total, fill = book)) +
  geom_histogram(show.legend = FALSE) +
  xlim(NA, 0.0009) +
  facet_wrap(~book, ncol = 2, scales = "free_y")

Zipf law

the frequency that a word appears is inversly proportianl to its rank when predicting a topic

Lets apply Zipfs law to Darwin’s work

``{r} freq_by_rank <- book_words %>% group_by(book) %>% dplyr::mutate(rank = row_number(), ‘term frequency’ = n/total) %>% ungroup() freq_by_rank


``{r}
freq_by_rank <- book_words %>%
  group_by(book) %>%
  dplyr::mutate(rank = row_number(),
         'term frequency' = n/total) %>%
  ungroup()
freq_by_rank

Lets us TF - IDF to find words for each document by decreasing the weight for commonly used words and increasing the weight for words that are not used very in a collection of documents.

book_tf_idf <- book_words %>%
  bind_tf_idf(word, book, n)

book_tf_idf

## # A tibble: 12,551 × 6
##    book                     word      n      tf   idf tf_idf
##    <chr>                    <chr> <dbl>   <dbl> <dbl>  <dbl>
##  1 The Voyage of the Beagle the   16930 0.0813      0      0
##  2 The Voyage of the Beagle of     9438 0.0453      0      0
##  3 The Voyage of the Beagle and    5768 0.0277      0      0
##  4 The Voyage of the Beagle a      5328 0.0256      0      0
##  5 The Voyage of the Beagle in     4294 0.0206      0      0
##  6 The Voyage of the Beagle to     4093 0.0197      0      0
##  7 The Voyage of the Beagle is     2414 0.0116      0      0
##  8 The Voyage of the Beagle it     1998 0.00960     0      0
##  9 The Voyage of the Beagle that   1939 0.00932     0      0
## 10 The Voyage of the Beagle on     1869 0.00898     0      0
## # ℹ 12,541 more rows

Lets look at terms with high tf-idf in Darwins works

``{r} book_tf_idf %>% select(-total) %>% arrange(desc(tf_idf))


Lets look at a visulization for these high tf-idf words

``{r}
library(forcats)
book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend= FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)

library(ggplot2)
library(dplyr)
library(forcats)


book_tf_idf$tf_idf <- as.numeric(book_tf_idf$tf_idf)


book_tf_idf$book <- as.factor(book_tf_idf$book)


book_tf_idf %>%
  group_by(book) %>%
  slice_max(tf_idf, n = 15) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = book)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~book, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL) +
  theme_minimal()

Complete Final Project

2025-02-15