Markdown-Feb17th

R Markdown

setup

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(extrafont)

## Registering fonts with R

library(skimr)
library(gplots)

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(corrplot)

## corrplot 0.84 loaded

setwd('C:/Users/megha/OneDrive/Desktop/workshop/')

mytheme <- theme(
  panel.border = element_blank(),
  panel.grid.major = element_line(
    color = 'grey60',
    linetype = 'dashed',
    size = 0.5),
  panel.grid.minor = element_blank(),
  panel.background = element_rect(colour = "white", fill = "white"),
  axis.line = element_line(colour = "white"),
  text = element_text(family = "Palatino Linotype", color = 'grey5'),
  axis.text.x = element_text(colour = "grey5", size = 10),
  axis.text.y = element_text(colour = "grey5", size = 10, ),
  axis.ticks = element_line(colour = "grey5"),title = element_text(
    colour = 'grey5',
    size = 10,
    vjust = -4
  )
)

read-in files using tidyverse

raw <- read_csv('thePerfectdata_raw.csv')

## 
## -- Column specification --------------------------------------------------------
## cols(
##   SampleID = col_character(),
##   Strain = col_character(),
##   Replicate = col_double(),
##   Trial = col_character(),
##   Exposed_CFU = col_double(),
##   NoExposure_CFU = col_double()
## )

clean <- read_csv('thePerfectdata_clean.csv')

## 
## -- Column specification --------------------------------------------------------
## cols(
##   SampleID = col_character(),
##   Strain = col_character(),
##   Replicate = col_double(),
##   Trial = col_double(),
##   Exposed_CFU = col_double(),
##   NoExposure_CFU = col_double()
## )

Summarize and clean

base R

summary(raw)

##    SampleID            Strain            Replicate    Trial          
##  Length:48          Length:48          Min.   :1   Length:48         
##  Class :character   Class :character   1st Qu.:1   Class :character  
##  Mode  :character   Mode  :character   Median :2   Mode  :character  
##                                        Mean   :2                     
##                                        3rd Qu.:3                     
##                                        Max.   :3                     
##                                                                      
##   Exposed_CFU        NoExposure_CFU     
##  Min.   :0.000e+00   Min.   :1.000e+00  
##  1st Qu.:2.950e+02   1st Qu.:3.110e+02  
##  Median :1.800e+04   Median :9.900e+03  
##  Mean   :1.305e+09   Mean   :1.334e+09  
##  3rd Qu.:9.734e+05   3rd Qu.:2.800e+04  
##  Max.   :1.000e+10   Max.   :1.000e+10  
##  NA's   :2           NA's   :3

table(raw$Strain)

## 
##   mut_PD  mutP_Da    mutpd    mutPD  mutPD_a   mutPDa wildtype       wt 
##        1        2        2       15        4        6        3        3 
##       WT 
##       12

tidyverse

skim(raw)

Data summary
Name	raw
Number of rows	48
Number of columns	6
_______________________
Column type frequency:
character	3
numeric	3
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
SampleID	1	7	8	48
Strain	1	2	8	9
Trial	1	1	4	9

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Replicate	0	1.00	2	8.300000e-01	1	1	2.0	3	3e+00	▇▁▇▁▇
Exposed_CFU	2	0.96	1304631625	3.404915e+09	0	295	17999.5	973385	1e+10	▇▁▁▁▁
NoExposure_CFU	3	0.94	1334008371	3.437492e+09	1	311	9900.0	28000	1e+10	▇▁▁▁▁

raw %>%
  dplyr::group_by(Strain) %>%
  skim()

Data summary
Name	Piped data
Number of rows	48
Number of columns	6
_______________________
Column type frequency:
character	2
numeric	3
________________________
Group variables	Strain

Variable type: character

skim_variable	Strain	complete_rate	min	max	n_unique
SampleID	mut_PD	1	8	8	1
SampleID	mutP_Da	1	8	8	2
SampleID	mutpd	1	7	8	2
SampleID	mutPD	1	7	8	15
SampleID	mutPD_a	1	7	8	4
SampleID	mutPDa	1	8	8	6
SampleID	wildtype	1	7	8	3
SampleID	wt	1	7	8	3
SampleID	WT	1	7	8	12
Trial	mut_PD	1	1	1	1
Trial	mutP_Da	1	1	1	1
Trial	mutpd	1	1	4	2
Trial	mutPD	1	1	3	7
Trial	mutPD_a	1	1	3	2
Trial	mutPDa	1	1	1	3
Trial	wildtype	1	1	1	3
Trial	wt	1	1	1	3
Trial	WT	1	1	1	6

Variable type: numeric

skim_variable	Strain	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
Replicate	mut_PD	0	1.00	3.000000e+00	NA	3	3.00	3.0	3.000000e+00	3	▁▁▇▁▁
Replicate	mutP_Da	0	1.00	2.500000e+00	7.100000e-01	2	2.25	2.5	2.750000e+00	3	▇▁▁▁▇
Replicate	mutpd	0	1.00	1.500000e+00	7.100000e-01	1	1.25	1.5	1.750000e+00	2	▇▁▁▁▇
Replicate	mutPD	0	1.00	2.000000e+00	8.500000e-01	1	1.00	2.0	3.000000e+00	3	▇▁▇▁▇
Replicate	mutPD_a	0	1.00	1.750000e+00	9.600000e-01	1	1.00	1.5	2.250000e+00	3	▇▁▃▁▃
Replicate	mutPDa	0	1.00	2.000000e+00	8.900000e-01	1	1.25	2.0	2.750000e+00	3	▇▁▇▁▇
Replicate	wildtype	0	1.00	1.670000e+00	5.800000e-01	1	1.50	2.0	2.000000e+00	2	▃▁▁▁▇
Replicate	wt	0	1.00	2.670000e+00	5.800000e-01	2	2.50	3.0	3.000000e+00	3	▃▁▁▁▇
Replicate	WT	0	1.00	1.920000e+00	9.000000e-01	1	1.00	2.0	3.000000e+00	3	▇▁▅▁▆
Exposed_CFU	mut_PD	0	1.00	1.259000e+03	NA	1259	1259.00	1259.0	1.259000e+03	1259	▁▁▇▁▁
Exposed_CFU	mutP_Da	0	1.00	9.809750e+05	2.474900e+02	980800	980887.50	980975.0	9.810625e+05	981150	▇▁▁▁▇
Exposed_CFU	mutpd	0	1.00	1.400000e+04	5.656850e+03	10000	12000.00	14000.0	1.600000e+04	18000	▇▁▁▁▇
Exposed_CFU	mutPD	2	0.87	2.307701e+09	4.385285e+09	0	11000.00	17000.0	2.300000e+04	9999999999	▇▁▁▁▂
Exposed_CFU	mutPD_a	0	1.00	8.000875e+05	9.994180e+04	750000	750000.00	750175.0	8.002625e+05	950000	▇▁▁▁▂
Exposed_CFU	mutPDa	0	1.00	4.853592e+05	5.317298e+05	35	75.00	475052.5	9.731000e+05	981150	▇▁▁▁▇
Exposed_CFU	wildtype	0	1.00	2.502133e+05	4.328280e+05	100	320.00	540.0	3.752700e+05	750000	▇▁▁▁▃
Exposed_CFU	wt	0	1.00	1.706700e+02	1.453100e+02	3	126.00	249.0	2.545000e+02	260	▃▁▁▁▇
Exposed_CFU	WT	0	1.00	2.500340e+09	4.522465e+09	30	157.50	505570.0	2.501569e+09	9999999999	▇▁▁▁▂
NoExposure_CFU	mut_PD	0	1.00	1.107700e+04	NA	11077	11077.00	11077.0	1.107700e+04	11077	▁▁▇▁▁
NoExposure_CFU	mutP_Da	0	1.00	1.205000e+02	1.344000e+01	111	115.75	120.5	1.252500e+02	130	▇▁▁▁▇
NoExposure_CFU	mutpd	0	1.00	1.950000e+04	1.202082e+04	11000	15250.00	19500.0	2.375000e+04	28000	▇▁▁▁▇
NoExposure_CFU	mutPD	3	0.80	2.500015e+09	4.522661e+09	6577	15750.00	24588.5	2.500022e+09	9999999999	▇▁▁▁▂
NoExposure_CFU	mutPD_a	0	1.00	1.215250e+03	2.193570e+03	11	40.25	175.0	1.350000e+03	4500	▇▁▁▁▂
NoExposure_CFU	mutPDa	0	1.00	5.000791e+06	8.366033e+06	3	115.75	2315.0	7.501125e+06	20000000	▇▁▂▁▂
NoExposure_CFU	wildtype	0	1.00	2.547800e+04	3.626276e+04	34	4717.00	9400.0	3.820000e+04	67000	▇▁▁▁▃
NoExposure_CFU	wt	0	1.00	6.133670e+03	5.356980e+03	1	4250.50	8500.0	9.200000e+03	9900	▃▁▁▁▇
NoExposure_CFU	WT	0	1.00	2.500004e+09	4.522668e+09	2	3582.50	7500.0	2.500007e+09	9999999999	▇▁▁▁▂

Qualitative

ggplot(raw, aes(Exposed_CFU)) + geom_histogram(bins = 15) + mytheme

## Warning: Removed 2 rows containing non-finite values (stat_bin).

ggplot(raw, aes(log(Exposed_CFU))) + geom_histogram(bins = 15) + mytheme

## Warning: Removed 3 rows containing non-finite values (stat_bin).

raw %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                     names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                  position = 'dodge')+
  facet_wrap(~Trial) + mytheme

## Warning: Removed 6 rows containing non-finite values (stat_bin).

raw %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                     names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                 position = 'dodge')+
  facet_wrap(~Trial) + mytheme

## Warning: Removed 6 rows containing non-finite values (stat_bin).

Quantative

clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                       names_to = 'study', values_to = 'CFU')

## # A tibble: 72 x 6
##    SampleID Strain Replicate Trial study            CFU
##    <chr>    <chr>      <dbl> <dbl> <chr>          <dbl>
##  1 Sample1  WT             1     1 Exposed_CFU      100
##  2 Sample1  WT             1     1 NoExposure_CFU 67000
##  3 Sample2  WT             2     1 Exposed_CFU      249
##  4 Sample2  WT             2     1 NoExposure_CFU  8500
##  5 Sample3  WT             3     1 Exposed_CFU       75
##  6 Sample3  WT             3     1 NoExposure_CFU 10000
##  7 Sample4  mutPD          1     1 Exposed_CFU    10000
##  8 Sample4  mutPD          1     1 NoExposure_CFU 11000
##  9 Sample5  mutPD          2     1 Exposed_CFU    11900
## 10 Sample5  mutPD          2     1 NoExposure_CFU 15000
## # ... with 62 more rows

clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                       names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                      position = 'dodge')+
  facet_wrap(~Trial) + mytheme

t.test

clean_q <- clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                                  names_to = 'study', values_to = 'CFU')

t.test(CFU ~ study, data = clean_q)

## 
##  Welch Two Sample t-test
## 
## data:  CFU by study
## t = 1.8121, df = 69.776, p-value = 0.07428
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17897.59 373320.54
## sample estimates:
##    mean in group Exposed_CFU mean in group NoExposure_CFU 
##                     299923.6                     122212.1

t.test(log(CFU) ~ study, data = clean_q)

## 
##  Welch Two Sample t-test
## 
## data:  log(CFU) by study
## t = 1.1484, df = 66.569, p-value = 0.2549
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.633364  2.349046
## sample estimates:
##    mean in group Exposed_CFU mean in group NoExposure_CFU 
##                     9.421546                     8.563705

ANOVA and ad-hoc Tukeys HSD

aov_test <- aov(log(CFU) ~ study, data = clean_q)
summary(aov_test)

##             Df Sum Sq Mean Sq F value Pr(>F)
## study        1   13.2   13.25   1.319  0.255
## Residuals   70  703.1   10.04

TukeyHSD(aov_test)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(CFU) ~ study, data = clean_q)
## 
## $study
##                                  diff       lwr      upr     p adj
## NoExposure_CFU-Exposed_CFU -0.8578411 -2.347695 0.632013 0.2547233

aov_test <- aov(log(CFU) ~ study + Strain + study*Strain, data = clean_q)
summary(aov_test)

##              Df Sum Sq Mean Sq F value   Pr(>F)    
## study         1   13.2   13.25   6.604 0.012439 *  
## Strain        2   35.5   17.74   8.845 0.000395 ***
## study:Strain  2  535.2  267.62 133.429  < 2e-16 ***
## Residuals    66  132.4    2.01                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

TukeyHSD(aov_test)

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(CFU) ~ study + Strain + study * Strain, data = clean_q)
## 
## $study
##                                  diff       lwr        upr     p adj
## NoExposure_CFU-Exposed_CFU -0.8578411 -1.524312 -0.1913702 0.0124394
## 
## $Strain
##                    diff        lwr        upr     p adj
## mutPDa-mutPD  0.2120216 -0.7682327  1.1922760 0.8625496
## WT-mutPD     -1.3717902 -2.3520446 -0.3915358 0.0037224
## WT-mutPDa    -1.5838118 -2.5640662 -0.6035574 0.0007191
## 
## $`study:Strain`
##                                                   diff        lwr       upr
## NoExposure_CFU:mutPD-Exposed_CFU:mutPD      0.72527496 -0.9717201  2.422270
## Exposed_CFU:mutPDa-Exposed_CFU:mutPD        4.66753018  2.9705351  6.364525
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPD    -3.51821192 -5.2152070 -1.821217
## Exposed_CFU:WT-Exposed_CFU:mutPD           -3.45262467 -5.1496198 -1.755630
## NoExposure_CFU:WT-Exposed_CFU:mutPD         1.43431924 -0.2626759  3.131314
## Exposed_CFU:mutPDa-NoExposure_CFU:mutPD     3.94225522  2.2452601  5.639250
## NoExposure_CFU:mutPDa-NoExposure_CFU:mutPD -4.24348688 -5.9404820 -2.546492
## Exposed_CFU:WT-NoExposure_CFU:mutPD        -4.17789963 -5.8748947 -2.480905
## NoExposure_CFU:WT-NoExposure_CFU:mutPD      0.70904428 -0.9879508  2.406039
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPDa   -8.18574210 -9.8827372 -6.488747
## Exposed_CFU:WT-Exposed_CFU:mutPDa          -8.12015485 -9.8171499 -6.423160
## NoExposure_CFU:WT-Exposed_CFU:mutPDa       -3.23321094 -4.9302060 -1.536216
## Exposed_CFU:WT-NoExposure_CFU:mutPDa        0.06558725 -1.6314078  1.762582
## NoExposure_CFU:WT-NoExposure_CFU:mutPDa     4.95253116  3.2555361  6.649526
## NoExposure_CFU:WT-Exposed_CFU:WT            4.88694391  3.1899488  6.583939
##                                                p adj
## NoExposure_CFU:mutPD-Exposed_CFU:mutPD     0.8081742
## Exposed_CFU:mutPDa-Exposed_CFU:mutPD       0.0000000
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPD    0.0000010
## Exposed_CFU:WT-Exposed_CFU:mutPD           0.0000015
## NoExposure_CFU:WT-Exposed_CFU:mutPD        0.1448298
## Exposed_CFU:mutPDa-NoExposure_CFU:mutPD    0.0000001
## NoExposure_CFU:mutPDa-NoExposure_CFU:mutPD 0.0000000
## Exposed_CFU:WT-NoExposure_CFU:mutPD        0.0000000
## NoExposure_CFU:WT-NoExposure_CFU:mutPD     0.8224164
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPDa   0.0000000
## Exposed_CFU:WT-Exposed_CFU:mutPDa          0.0000000
## NoExposure_CFU:WT-Exposed_CFU:mutPDa       0.0000067
## Exposed_CFU:WT-NoExposure_CFU:mutPDa       0.9999972
## NoExposure_CFU:WT-NoExposure_CFU:mutPDa    0.0000000
## NoExposure_CFU:WT-Exposed_CFU:WT           0.0000000

heatmaps

heatmap(as.matrix(clean[5:6]))

heatmap.2(as.matrix(clean[5:6]))

cor(iris[1:4], method = 'pearson', use = 'pairwise.complete.obs')-> all

corrplot(all, method = 'circle')

heatmap basics for ggplot

iris %>% pivot_longer(cols = -Species) %>%
  ggplot(., aes(x = Species, y = name, fill = value)) +
  geom_tile() +
  ylab('measurement')

### as a correlation plot

all %>% as_tibble %>% pivot_longer(cols = everything()) %>%
ggplot(., aes(x = name, y = name, fill = value)) +
  geom_tile() +
  ylab('measurement')

Markdown-Feb17th

Meg Hartwick

2/17/2021

R Markdown

setup

read-in files using tidyverse

Summarize and clean

base R

tidyverse

Qualitative

Quantative

t.test

ANOVA and ad-hoc Tukeys HSD

heatmaps

heatmap basics for ggplot