The first thing we want to do is load or activate the packages we will use in this script.

Load (activate) your packages

library(ggplot2)
library(readxl)
library(tidyverse) # you may not have this installed yet. Use install.packages("tidyverse"). You'll get an error message if you try to activate (load) a package you have not installed.

## -- Attaching packages -------------------- tidyverse 1.2.1 --

## v tibble  2.1.3     v purrr   0.3.2
## v tidyr   0.8.3     v dplyr   0.8.3
## v readr   1.3.1     v stringr 1.4.0
## v tibble  2.1.3     v forcats 0.4.0

## -- Conflicts ----------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Load the datasets

# Pick the potato spreadsheet in a file dialog and read it in
potato.dat <- read_xlsx(path = file.choose())

2. explore the data set

Use summary() and histograms to get a sense of the data

# Potato data: list the column names
colnames(potato.dat)

## [1] "Group_ID"          "Potato ID"         "Treatment"        
## [4] "Initial_length_mm" "Initial_mass_g"    "Final_length_mm"  
## [7] "Final_mass_g"      "Change_mass_g"     "Change_length_mm"

# Per-column summary of the potato data
summary(object = potato.dat)

##    Group_ID           Potato ID      Treatment         Initial_length_mm
##  Length:72          Min.   : 1.00   Length:72          Min.   :50.50    
##  Class :character   1st Qu.: 3.75   Class :character   1st Qu.:60.00    
##  Mode  :character   Median : 6.50   Mode  :character   Median :60.00    
##                     Mean   : 6.50                      Mean   :58.42    
##                     3rd Qu.: 9.25                      3rd Qu.:60.00    
##                     Max.   :12.00                      Max.   :60.00    
##                                                                         
##  Initial_mass_g  Final_length_mm  Final_mass_g   Change_mass_g     
##  Min.   :5.100   Min.   :50.30   Min.   :4.700   Min.   :-1.20000  
##  1st Qu.:6.000   1st Qu.:59.00   1st Qu.:6.000   1st Qu.: 0.00000  
##  Median :6.300   Median :60.50   Median :6.400   Median : 0.15000  
##  Mean   :6.235   Mean   :59.31   Mean   :6.276   Mean   : 0.04167  
##  3rd Qu.:6.500   3rd Qu.:62.00   3rd Qu.:6.725   3rd Qu.: 0.40000  
##  Max.   :6.900   Max.   :65.00   Max.   :7.200   Max.   : 1.80000  
##                                                                    
##  Change_length_mm 
##  Min.   :-1.0000  
##  1st Qu.: 0.0000  
##  Median : 0.2000  
##  Mean   : 0.9859  
##  3rd Qu.: 2.0000  
##  Max.   : 5.0000  
##  NA's   :1

# Histogram of final length, bars filled by Treatment
ggplot(data = potato.dat,
       mapping = aes(fill = Treatment, x = Final_length_mm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#I set fill = Treatment to color the bars by our Treatments 

# Histogram of final mass, bars filled by Treatment
ggplot(data = potato.dat,
       mapping = aes(fill = Treatment, x = Final_mass_g)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Use subtraction to see the difference between initial and final length.
# Sign convention: Initial - Final is negative when the potato got longer.

ggplot(potato.dat,
       aes(fill = Treatment,
           x = Initial_length_mm - Final_length_mm)) +
  geom_histogram() +
  # The difference is mapped to x (y is the bin count), so the label
  # belongs on the x axis.
  scale_x_continuous(name = "Change in length (mm)") +
  geom_vline(aes(xintercept = 0),
             size = 1.5, color = "black") +
  scale_fill_manual(values = c("Deepskyblue",
                               "Darkseagreen1",
                               "Darkseagreen2",
                               "Darkseagreen3"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Here I use scale_x_continuous with the argument name = "x axis label" to set the label to change in length (mm). Then I used geom_vline to add a vertical line at x = 0.
# Then I used scale_fill_manual with the argument values to supply specific color names; the colors match the order of the treatments (alphabetical, because Treatment is a character column). You can check the order with sort(unique(potato.dat$Treatment))

# Histogram of the mass difference (Initial - Final) per potato, filled by
# Treatment. Initial - Final is negative when the potato gained mass.
ggplot(potato.dat,
       aes(fill = Treatment,
           x = Initial_mass_g - Final_mass_g)) +
  geom_histogram() +
  # The difference is mapped to x (y is the bin count), so the label
  # belongs on the x axis.
  scale_x_continuous(name = "Change in mass (g)") +
  geom_vline(aes(xintercept = 0),
             size = 1.5, color = "black") +
  scale_fill_manual(values = c("Deepskyblue",
                               "Darkseagreen1",
                               "Darkseagreen2",
                               "Darkseagreen3"))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Making a table of means and standard deviations using the package tidyverse

Tidyverse is a great collection of packages that implements functions to aid in the manipulation of tables and data within R

# Per-treatment mean and standard deviation of the (Initial - Final)
# difference for length and for mass.
# Column names are kept unchanged because the plotting code further down
# refers to them; note that the "Final.*" columns hold the mean difference,
# not the mean final measurement.
summary.stat.table <- potato.dat %>%
  group_by(Treatment) %>%
  summarise(
    Final.length.mm = mean(Initial_length_mm - Final_length_mm),
    Stat.dev.lenght = sd(Initial_length_mm - Final_length_mm, na.rm = TRUE),
    Final.mass.g = mean(Initial_mass_g - Final_mass_g),
    # Standard deviation of the mass difference. This used to call mean() and
    # subtract the freshly created Final.mass.g summary column instead of the
    # raw Final_mass_g measurements, so it was not a standard deviation.
    Stat.dev.mass.g = sd(Initial_mass_g - Final_mass_g, na.rm = TRUE)
  )

#Here I used tidyverse, which is a new package we will go over next class. It introduces a new operator known as a pipe, %>%. A pipe lets you chain functions together, so the output of one function is passed on to the next. In this case I created a new object and piped in the original dataset, then I grouped by Treatment and calculated the mean and standard deviation of the difference between initial and final length and mass

# Show the summary table
print(summary.stat.table)

## # A tibble: 4 x 5
##   Treatment    Final.length.mm Stat.dev.lenght Final.mass.g Stat.dev.mass.g
##   <chr>                  <dbl>           <dbl>        <dbl>           <dbl>
## 1 Water                -1.79              1.68       -0.444            6.52
## 2 Water + 10 ~         -0.0889            1.23        0.278            5.97
## 3 Water + 2 N~         -1.06              1.86       -0.233            6.47
## 4 Water + 6 N~         -0.639             1.28        0.233            6.16

Now I am going to show you how to plot the means with error bars

# Length: one point per treatment at the mean difference, with error bars
# reaching one standard deviation below and above it, plus a line at zero
ggplot(data = summary.stat.table,
       mapping = aes(x = Treatment, y = Final.length.mm, color = Treatment)) +
  geom_point(size = 3) +
  geom_errorbar(
    mapping = aes(ymin = Final.length.mm - Stat.dev.lenght,
                  ymax = Final.length.mm + Stat.dev.lenght),
    position = "dodge"
  ) +
  geom_hline(mapping = aes(yintercept = 0), size = 1.5)

#Here I used geom_errorbar() to create the graphing element of error bars. In the aesthetic I had to set ymin and ymax (the lower and upper bounds of the error bars). I do this by subtracting the standard deviation from the mean (ymin) and adding it to the mean (ymax).
#Here I used geom_hline to create a horizontal line at zero

#Can you reproduce this for mass?

Now let’s move on to the Dye data

# Pick the dye spreadsheet in a file dialog, then summarise every column
dye.dat <- read_xlsx(path = file.choose())

summary(object = dye.dat)

##     Activity       Group_ID         Dye_number_name    Concentration_mM
##  Min.   :1.000   Length:280         Length:280         Min.   :0.300   
##  1st Qu.:1.000   Class :character   Class :character   1st Qu.:1.000   
##  Median :2.000   Mode  :character   Mode  :character   Median :1.000   
##  Mean   :1.857                                         Mean   :1.186   
##  3rd Qu.:3.000                                         3rd Qu.:1.000   
##  Max.   :3.000                                         Max.   :3.000   
##  Agarose_concentration      Time    Distance_from_Interface_mm
##  Min.   :0.150         Min.   : 0   Min.   :  0.000           
##  1st Qu.:0.175         1st Qu.:20   1st Qu.:  1.465           
##  Median :0.175         Median :45   Median :  3.700           
##  Mean   :0.175         Mean   :45   Mean   :  8.068           
##  3rd Qu.:0.175         3rd Qu.:70   3rd Qu.:  7.715           
##  Max.   :0.200         Max.   :90   Max.   :147.000

#notice that the Distance from interface seems like it has an outlier 


# Distance against time for all data: one row of panels per dye,
# one column of panels per activity
ggplot(data = dye.dat,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Dye_number_name)) +
  geom_point() +
  facet_grid(rows = vars(Dye_number_name), cols = vars(Activity))

#This graph is a lot, but we are looking at all the data across all activities. I used the function facet_grid to separate out columns by part 1, part 2 and part 3 and by Dye color in rows. Notice that we have an outlier of 150mm

# Histogram of distances with the same dye-by-activity panel layout
ggplot(data = dye.dat,
       mapping = aes(fill = Dye_number_name,
                     x = Distance_from_Interface_mm)) +
  geom_histogram() +
  facet_grid(rows = vars(Dye_number_name), cols = vars(Activity))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Let's break it out by Activity

part1 <- filter(dye.dat, Activity == 1)

# Here I used filter() from the tidyverse to keep only the Activity 1 rows
# of our dye data and stored them in part1

ggplot(data = part1,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Dye_number_name)) +
  geom_point(size = 2) +
  facet_grid(cols = vars(Group_ID))

#Note that in the last group the points are kinda wonky and regress in time, which should not happen. In group 2 there is a dramatic pull for Methylene blue towards the end which might be an error.

# Activity 2 rows of the raw dye data
part2 <- dye.dat %>%
  filter(Activity == 2)

# Control series: Methyl Red at 1 mM, restricted to Activity 1.
# Without the Activity filter the control also picks up the Activity 3 rows,
# which are Methyl Red at 1 mM too but at other agarose concentrations
# (see the printed dye.averages table), so it would not be a clean baseline.
control.dye <- dye.dat %>%
  filter(Activity == 1,
         Concentration_mM == 1,
         Dye_number_name == "Dye_1_Methyl_Red")

# Activity 2 points colored by concentration, with the control overlaid,
# one panel per group
ggplot(part2,
       aes(x = Time,
           y = Distance_from_Interface_mm,
           color = as.factor(Concentration_mM))) +
  geom_point(size = 2) +
  geom_point(data = control.dye) +
  facet_grid(. ~ Group_ID)

#Here I added the control dye using geom_point. Note that there is an error in group 1. In the data they have 1.47, 1.47, 147 in a sequence which probably means that they missed a decimal point 


# Activity 3 rows of the raw dye data
part3 <- filter(dye.dat, Activity == 3)

# Activity 3 points colored by agarose concentration, with the control.dye
# points overlaid, one panel per group
ggplot(data = part3,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Agarose_concentration)) +
  geom_point(size = 2) +
  geom_point(data = control.dye) +
  facet_grid(cols = vars(Group_ID))

comments: Note that I share this with you as a learning experience so that you can see what common errors happen and how your work translates into data. I opted to keep the names so that you can see how you did and reflect on what errors potentially influenced your group. This is an exercise in building skills in working carefully and in how to spot potential errors in a dataset. Anyone, everyone, makes mistakes, the important thing is learning how to work carefully and how to spot mistakes in the dataset.

# Load the version of the dye data without the error
dye.dat2 <- read_xlsx(path = file.choose())

# Exclude group 4 from the dataset (dropped here by its Group_ID, "RRG")

dye.dat3 <- filter(dye.dat2, Group_ID != "RRG")

# Here I use the operator !=, which means "is not equal to", to create a dataset of all groups whose Group_ID is not RRG.

# Now I will use tidyverse to average between groups

# Average the measured distance across student groups for every combination
# of dye, time point, activity, dye concentration and agarose concentration.
# Only the numeric measurement is averaged: summarize_all(mean) also tried to
# average the character column Group_ID, which returned NA and raised one
# warning per group. ungroup() leaves a plain (ungrouped) table behind.
dye.averages <- dye.dat3 %>%
  group_by(Dye_number_name, Time, Activity,
           Concentration_mM, Agarose_concentration) %>%
  summarise(Distance_from_Interface_mm = mean(Distance_from_Interface_mm)) %>%
  ungroup()

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

## Warning in mean.default(Group_ID): argument is not numeric or logical:
## returning NA

# Show the averaged table
print(dye.averages)

## # A tibble: 70 x 7
## # Groups:   Dye_number_name, Time, Activity, Concentration_mM [60]
##    Dye_number_name  Time Activity Concentration_mM Agarose_concent~
##    <chr>           <dbl>    <dbl>            <dbl>            <dbl>
##  1 Dye_1_Methyl_R~     0        1              1              0.175
##  2 Dye_1_Methyl_R~     0        2              0.3            0.175
##  3 Dye_1_Methyl_R~     0        2              3              0.175
##  4 Dye_1_Methyl_R~     0        3              1              0.15 
##  5 Dye_1_Methyl_R~     0        3              1              0.2  
##  6 Dye_1_Methyl_R~    10        1              1              0.175
##  7 Dye_1_Methyl_R~    10        2              0.3            0.175
##  8 Dye_1_Methyl_R~    10        2              3              0.175
##  9 Dye_1_Methyl_R~    10        3              1              0.15 
## 10 Dye_1_Methyl_R~    10        3              1              0.2  
## # ... with 60 more rows, and 2 more variables: Group_ID <dbl>,
## #   Distance_from_Interface_mm <dbl>

# Averaged distance against time, colored by dye, one panel per activity
ggplot(data = dye.averages,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Dye_number_name)) +
  geom_point(size = 2) +
  facet_grid(cols = vars(Activity))

# Separate by parts

part1 <- filter(dye.averages, Activity == 1)


# Here I used filter() from the tidyverse to keep only the Activity 1 rows
# of the averaged dye data and stored them in part1

ggplot(data = part1,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Dye_number_name)) +
  geom_point(size = 2)

# Activity 2 rows of the averaged dye data
part2 <- dye.averages %>%
  filter(Activity == 2)

# Control series: Methyl Red at 1 mM, restricted to Activity 1.
# Without the Activity filter the control also picks up the Activity 3 rows
# (Methyl Red, 1 mM, agarose 0.15 and 0.2 in the printed dye.averages table),
# and the control line below would zig-zag between three series.
control.dye <- dye.averages %>%
  filter(Activity == 1,
         Concentration_mM == 1,
         Dye_number_name == "Dye_1_Methyl_Red")

# Activity 2 averages colored by concentration, with the control as a line
ggplot(part2,
       aes(x = Time,
           y = Distance_from_Interface_mm,
           color = as.factor(Concentration_mM))) +
  geom_point(size = 2) +
  geom_line(data = control.dye)

# Activity 3 rows of the averaged dye data
part3 <- filter(dye.averages, Activity == 3)

# Activity 3 averages colored by agarose concentration, with the control.dye
# points overlaid
ggplot(data = part3,
       mapping = aes(x = Time,
                     y = Distance_from_Interface_mm,
                     color = Agarose_concentration)) +
  geom_point(size = 2) +
  geom_point(data = control.dye)

New things on this script:

loading data using the file.choose() function
coloring by a categorical variable using ggplot
Using tidyverse to create a summary table of mean and standard deviation per groups
using the summary table to plot mean and error bars using ggplot
Using tidyverse to filter data

diffusion with students

KevinAviles

February 5, 2020