R Markdown

setup

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.4
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(extrafont)
## Registering fonts with R
library(skimr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(corrplot)
## corrplot 0.84 loaded
setwd('C:/Users/megha/OneDrive/Desktop/workshop/')
mytheme <- theme(
  panel.border = element_blank(),
  panel.grid.major = element_line(
    color = 'grey60',
    linetype = 'dashed',
    size = 0.5),
  panel.grid.minor = element_blank(),
  panel.background = element_rect(colour = "white", fill = "white"),
  axis.line = element_line(colour = "white"),
  text = element_text(family = "Palatino Linotype", color = 'grey5'),
  axis.text.x = element_text(colour = "grey5", size = 10),
  axis.text.y = element_text(colour = "grey5", size = 10, ),
  axis.ticks = element_line(colour = "grey5"),title = element_text(
    colour = 'grey5',
    size = 10,
    vjust = -4
  )
)

read-in files using tidyverse

raw <- read_csv('thePerfectdata_raw.csv')
## 
## -- Column specification --------------------------------------------------------
## cols(
##   SampleID = col_character(),
##   Strain = col_character(),
##   Replicate = col_double(),
##   Trial = col_character(),
##   Exposed_CFU = col_double(),
##   NoExposure_CFU = col_double()
## )
clean <- read_csv('thePerfectdata_clean.csv')
## 
## -- Column specification --------------------------------------------------------
## cols(
##   SampleID = col_character(),
##   Strain = col_character(),
##   Replicate = col_double(),
##   Trial = col_double(),
##   Exposed_CFU = col_double(),
##   NoExposure_CFU = col_double()
## )

Summarize and clean

base R

summary(raw)
##    SampleID            Strain            Replicate    Trial          
##  Length:48          Length:48          Min.   :1   Length:48         
##  Class :character   Class :character   1st Qu.:1   Class :character  
##  Mode  :character   Mode  :character   Median :2   Mode  :character  
##                                        Mean   :2                     
##                                        3rd Qu.:3                     
##                                        Max.   :3                     
##                                                                      
##   Exposed_CFU        NoExposure_CFU     
##  Min.   :0.000e+00   Min.   :1.000e+00  
##  1st Qu.:2.950e+02   1st Qu.:3.110e+02  
##  Median :1.800e+04   Median :9.900e+03  
##  Mean   :1.305e+09   Mean   :1.334e+09  
##  3rd Qu.:9.734e+05   3rd Qu.:2.800e+04  
##  Max.   :1.000e+10   Max.   :1.000e+10  
##  NA's   :2           NA's   :3
table(raw$Strain)
## 
##   mut_PD  mutP_Da    mutpd    mutPD  mutPD_a   mutPDa wildtype       wt 
##        1        2        2       15        4        6        3        3 
##       WT 
##       12

tidyverse

skim(raw)
Data summary
Name raw
Number of rows 48
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
SampleID 0 1 7 8 0 48 0
Strain 0 1 2 8 0 9 0
Trial 0 1 1 4 0 9 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Replicate 0 1.00 2 8.300000e-01 1 1 2.0 3 3e+00 ▇▁▇▁▇
Exposed_CFU 2 0.96 1304631625 3.404915e+09 0 295 17999.5 973385 1e+10 ▇▁▁▁▁
NoExposure_CFU 3 0.94 1334008371 3.437492e+09 1 311 9900.0 28000 1e+10 ▇▁▁▁▁
raw %>%
  dplyr::group_by(Strain) %>%
  skim()
Data summary
Name Piped data
Number of rows 48
Number of columns 6
_______________________
Column type frequency:
character 2
numeric 3
________________________
Group variables Strain

Variable type: character

skim_variable Strain n_missing complete_rate min max empty n_unique whitespace
SampleID mut_PD 0 1 8 8 0 1 0
SampleID mutP_Da 0 1 8 8 0 2 0
SampleID mutpd 0 1 7 8 0 2 0
SampleID mutPD 0 1 7 8 0 15 0
SampleID mutPD_a 0 1 7 8 0 4 0
SampleID mutPDa 0 1 8 8 0 6 0
SampleID wildtype 0 1 7 8 0 3 0
SampleID wt 0 1 7 8 0 3 0
SampleID WT 0 1 7 8 0 12 0
Trial mut_PD 0 1 1 1 0 1 0
Trial mutP_Da 0 1 1 1 0 1 0
Trial mutpd 0 1 1 4 0 2 0
Trial mutPD 0 1 1 3 0 7 0
Trial mutPD_a 0 1 1 3 0 2 0
Trial mutPDa 0 1 1 1 0 3 0
Trial wildtype 0 1 1 1 0 3 0
Trial wt 0 1 1 1 0 3 0
Trial WT 0 1 1 1 0 6 0

Variable type: numeric

skim_variable Strain n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Replicate mut_PD 0 1.00 3.000000e+00 NA 3 3.00 3.0 3.000000e+00 3 ▁▁▇▁▁
Replicate mutP_Da 0 1.00 2.500000e+00 7.100000e-01 2 2.25 2.5 2.750000e+00 3 ▇▁▁▁▇
Replicate mutpd 0 1.00 1.500000e+00 7.100000e-01 1 1.25 1.5 1.750000e+00 2 ▇▁▁▁▇
Replicate mutPD 0 1.00 2.000000e+00 8.500000e-01 1 1.00 2.0 3.000000e+00 3 ▇▁▇▁▇
Replicate mutPD_a 0 1.00 1.750000e+00 9.600000e-01 1 1.00 1.5 2.250000e+00 3 ▇▁▃▁▃
Replicate mutPDa 0 1.00 2.000000e+00 8.900000e-01 1 1.25 2.0 2.750000e+00 3 ▇▁▇▁▇
Replicate wildtype 0 1.00 1.670000e+00 5.800000e-01 1 1.50 2.0 2.000000e+00 2 ▃▁▁▁▇
Replicate wt 0 1.00 2.670000e+00 5.800000e-01 2 2.50 3.0 3.000000e+00 3 ▃▁▁▁▇
Replicate WT 0 1.00 1.920000e+00 9.000000e-01 1 1.00 2.0 3.000000e+00 3 ▇▁▅▁▆
Exposed_CFU mut_PD 0 1.00 1.259000e+03 NA 1259 1259.00 1259.0 1.259000e+03 1259 ▁▁▇▁▁
Exposed_CFU mutP_Da 0 1.00 9.809750e+05 2.474900e+02 980800 980887.50 980975.0 9.810625e+05 981150 ▇▁▁▁▇
Exposed_CFU mutpd 0 1.00 1.400000e+04 5.656850e+03 10000 12000.00 14000.0 1.600000e+04 18000 ▇▁▁▁▇
Exposed_CFU mutPD 2 0.87 2.307701e+09 4.385285e+09 0 11000.00 17000.0 2.300000e+04 9999999999 ▇▁▁▁▂
Exposed_CFU mutPD_a 0 1.00 8.000875e+05 9.994180e+04 750000 750000.00 750175.0 8.002625e+05 950000 ▇▁▁▁▂
Exposed_CFU mutPDa 0 1.00 4.853592e+05 5.317298e+05 35 75.00 475052.5 9.731000e+05 981150 ▇▁▁▁▇
Exposed_CFU wildtype 0 1.00 2.502133e+05 4.328280e+05 100 320.00 540.0 3.752700e+05 750000 ▇▁▁▁▃
Exposed_CFU wt 0 1.00 1.706700e+02 1.453100e+02 3 126.00 249.0 2.545000e+02 260 ▃▁▁▁▇
Exposed_CFU WT 0 1.00 2.500340e+09 4.522465e+09 30 157.50 505570.0 2.501569e+09 9999999999 ▇▁▁▁▂
NoExposure_CFU mut_PD 0 1.00 1.107700e+04 NA 11077 11077.00 11077.0 1.107700e+04 11077 ▁▁▇▁▁
NoExposure_CFU mutP_Da 0 1.00 1.205000e+02 1.344000e+01 111 115.75 120.5 1.252500e+02 130 ▇▁▁▁▇
NoExposure_CFU mutpd 0 1.00 1.950000e+04 1.202082e+04 11000 15250.00 19500.0 2.375000e+04 28000 ▇▁▁▁▇
NoExposure_CFU mutPD 3 0.80 2.500015e+09 4.522661e+09 6577 15750.00 24588.5 2.500022e+09 9999999999 ▇▁▁▁▂
NoExposure_CFU mutPD_a 0 1.00 1.215250e+03 2.193570e+03 11 40.25 175.0 1.350000e+03 4500 ▇▁▁▁▂
NoExposure_CFU mutPDa 0 1.00 5.000791e+06 8.366033e+06 3 115.75 2315.0 7.501125e+06 20000000 ▇▁▂▁▂
NoExposure_CFU wildtype 0 1.00 2.547800e+04 3.626276e+04 34 4717.00 9400.0 3.820000e+04 67000 ▇▁▁▁▃
NoExposure_CFU wt 0 1.00 6.133670e+03 5.356980e+03 1 4250.50 8500.0 9.200000e+03 9900 ▃▁▁▁▇
NoExposure_CFU WT 0 1.00 2.500004e+09 4.522668e+09 2 3582.50 7500.0 2.500007e+09 9999999999 ▇▁▁▁▂

Qualitative

ggplot(raw, aes(Exposed_CFU)) + geom_histogram(bins = 15) + mytheme
## Warning: Removed 2 rows containing non-finite values (stat_bin).

ggplot(raw, aes(log(Exposed_CFU))) + geom_histogram(bins = 15) + mytheme
## Warning: Removed 3 rows containing non-finite values (stat_bin).

raw %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                     names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                  position = 'dodge')+
  facet_wrap(~Trial) + mytheme
## Warning: Removed 6 rows containing non-finite values (stat_bin).

raw %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                     names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                 position = 'dodge')+
  facet_wrap(~Trial) + mytheme
## Warning: Removed 6 rows containing non-finite values (stat_bin).

Quantative

clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                       names_to = 'study', values_to = 'CFU')
## # A tibble: 72 x 6
##    SampleID Strain Replicate Trial study            CFU
##    <chr>    <chr>      <dbl> <dbl> <chr>          <dbl>
##  1 Sample1  WT             1     1 Exposed_CFU      100
##  2 Sample1  WT             1     1 NoExposure_CFU 67000
##  3 Sample2  WT             2     1 Exposed_CFU      249
##  4 Sample2  WT             2     1 NoExposure_CFU  8500
##  5 Sample3  WT             3     1 Exposed_CFU       75
##  6 Sample3  WT             3     1 NoExposure_CFU 10000
##  7 Sample4  mutPD          1     1 Exposed_CFU    10000
##  8 Sample4  mutPD          1     1 NoExposure_CFU 11000
##  9 Sample5  mutPD          2     1 Exposed_CFU    11900
## 10 Sample5  mutPD          2     1 NoExposure_CFU 15000
## # ... with 62 more rows
clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                       names_to = 'study', values_to = 'CFU') %>%
  ggplot(.,aes(log(CFU), fill = Strain), alpha = 0.2) + geom_histogram(bins = 10,
                                                                      position = 'dodge')+
  facet_wrap(~Trial) + mytheme

t.test

clean_q <- clean %>% pivot_longer(cols = c(Exposed_CFU,NoExposure_CFU),
                                  names_to = 'study', values_to = 'CFU')
t.test(CFU ~ study, data = clean_q)
## 
##  Welch Two Sample t-test
## 
## data:  CFU by study
## t = 1.8121, df = 69.776, p-value = 0.07428
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -17897.59 373320.54
## sample estimates:
##    mean in group Exposed_CFU mean in group NoExposure_CFU 
##                     299923.6                     122212.1
t.test(log(CFU) ~ study, data = clean_q)
## 
##  Welch Two Sample t-test
## 
## data:  log(CFU) by study
## t = 1.1484, df = 66.569, p-value = 0.2549
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.633364  2.349046
## sample estimates:
##    mean in group Exposed_CFU mean in group NoExposure_CFU 
##                     9.421546                     8.563705

ANOVA and ad-hoc Tukeys HSD

aov_test <- aov(log(CFU) ~ study, data = clean_q)
summary(aov_test)
##             Df Sum Sq Mean Sq F value Pr(>F)
## study        1   13.2   13.25   1.319  0.255
## Residuals   70  703.1   10.04
TukeyHSD(aov_test)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(CFU) ~ study, data = clean_q)
## 
## $study
##                                  diff       lwr      upr     p adj
## NoExposure_CFU-Exposed_CFU -0.8578411 -2.347695 0.632013 0.2547233
aov_test <- aov(log(CFU) ~ study + Strain + study*Strain, data = clean_q)
summary(aov_test)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## study         1   13.2   13.25   6.604 0.012439 *  
## Strain        2   35.5   17.74   8.845 0.000395 ***
## study:Strain  2  535.2  267.62 133.429  < 2e-16 ***
## Residuals    66  132.4    2.01                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(aov_test)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = log(CFU) ~ study + Strain + study * Strain, data = clean_q)
## 
## $study
##                                  diff       lwr        upr     p adj
## NoExposure_CFU-Exposed_CFU -0.8578411 -1.524312 -0.1913702 0.0124394
## 
## $Strain
##                    diff        lwr        upr     p adj
## mutPDa-mutPD  0.2120216 -0.7682327  1.1922760 0.8625496
## WT-mutPD     -1.3717902 -2.3520446 -0.3915358 0.0037224
## WT-mutPDa    -1.5838118 -2.5640662 -0.6035574 0.0007191
## 
## $`study:Strain`
##                                                   diff        lwr       upr
## NoExposure_CFU:mutPD-Exposed_CFU:mutPD      0.72527496 -0.9717201  2.422270
## Exposed_CFU:mutPDa-Exposed_CFU:mutPD        4.66753018  2.9705351  6.364525
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPD    -3.51821192 -5.2152070 -1.821217
## Exposed_CFU:WT-Exposed_CFU:mutPD           -3.45262467 -5.1496198 -1.755630
## NoExposure_CFU:WT-Exposed_CFU:mutPD         1.43431924 -0.2626759  3.131314
## Exposed_CFU:mutPDa-NoExposure_CFU:mutPD     3.94225522  2.2452601  5.639250
## NoExposure_CFU:mutPDa-NoExposure_CFU:mutPD -4.24348688 -5.9404820 -2.546492
## Exposed_CFU:WT-NoExposure_CFU:mutPD        -4.17789963 -5.8748947 -2.480905
## NoExposure_CFU:WT-NoExposure_CFU:mutPD      0.70904428 -0.9879508  2.406039
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPDa   -8.18574210 -9.8827372 -6.488747
## Exposed_CFU:WT-Exposed_CFU:mutPDa          -8.12015485 -9.8171499 -6.423160
## NoExposure_CFU:WT-Exposed_CFU:mutPDa       -3.23321094 -4.9302060 -1.536216
## Exposed_CFU:WT-NoExposure_CFU:mutPDa        0.06558725 -1.6314078  1.762582
## NoExposure_CFU:WT-NoExposure_CFU:mutPDa     4.95253116  3.2555361  6.649526
## NoExposure_CFU:WT-Exposed_CFU:WT            4.88694391  3.1899488  6.583939
##                                                p adj
## NoExposure_CFU:mutPD-Exposed_CFU:mutPD     0.8081742
## Exposed_CFU:mutPDa-Exposed_CFU:mutPD       0.0000000
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPD    0.0000010
## Exposed_CFU:WT-Exposed_CFU:mutPD           0.0000015
## NoExposure_CFU:WT-Exposed_CFU:mutPD        0.1448298
## Exposed_CFU:mutPDa-NoExposure_CFU:mutPD    0.0000001
## NoExposure_CFU:mutPDa-NoExposure_CFU:mutPD 0.0000000
## Exposed_CFU:WT-NoExposure_CFU:mutPD        0.0000000
## NoExposure_CFU:WT-NoExposure_CFU:mutPD     0.8224164
## NoExposure_CFU:mutPDa-Exposed_CFU:mutPDa   0.0000000
## Exposed_CFU:WT-Exposed_CFU:mutPDa          0.0000000
## NoExposure_CFU:WT-Exposed_CFU:mutPDa       0.0000067
## Exposed_CFU:WT-NoExposure_CFU:mutPDa       0.9999972
## NoExposure_CFU:WT-NoExposure_CFU:mutPDa    0.0000000
## NoExposure_CFU:WT-Exposed_CFU:WT           0.0000000

heatmaps

heatmap(as.matrix(clean[5:6]))

heatmap.2(as.matrix(clean[5:6]))

cor(iris[1:4], method = 'pearson', use = 'pairwise.complete.obs')-> all

corrplot(all, method = 'circle')

heatmap basics for ggplot

iris %>% pivot_longer(cols = -Species) %>%
  ggplot(., aes(x = Species, y = name, fill = value)) +
  geom_tile() +
  ylab('measurement')

### as a correlation plot

all %>% as_tibble %>% pivot_longer(cols = everything()) %>%
ggplot(., aes(x = name, y = name, fill = value)) +
  geom_tile() +
  ylab('measurement')