SCH6245 Clinical Bioinformatics in R Markdown

0.1 Load dependencies (provided in session information)

0.2 Data preparation

0.2.1 Data retrieved from https://github.com/AlyceRussell/DataManagement_ReproducibleResearch/tree/master/data

# Load the Excel spreadsheet dataset.
# First list the sheet names in the workbook; the path is given relative to
# the current working directory.
excel_sheets("../data/surveys_data.xlsx")

## [1] "surveys"      "plot_info"    "species_info"

# Print the working directory that the relative "../data/..." paths resolve against
getwd()

## [1] "C:/Users/ladki/Desktop/a SCH6245 Clinical Bio/ClinicalBioinformaticsInR/documents"

# Read the three sheets of the workbook into separate tibbles,
# addressing each sheet by its position in the file.
survey_data  <- read_excel(path = "../data/surveys_data.xlsx", sheet = 1)  # sheet "surveys"
plot_info    <- read_excel(path = "../data/surveys_data.xlsx", sheet = 2)  # sheet "plot_info"
species_info <- read_excel(path = "../data/surveys_data.xlsx", sheet = 3)  # sheet "species_info"

# Inspect survey_data to plan the cleaning steps: str() lists each variable
# name after '$' followed by its storage type.
# survey_data has 10 variables, with sex stored as character.
str(survey_data)

## tibble [35,549 x 10] (S3: tbl_df/tbl/data.frame)
##  $ record_id      : num [1:35549] 1 2 3 4 5 6 7 8 9 10 ...
##  $ month          : num [1:35549] 7 7 7 7 7 7 7 7 7 7 ...
##  $ day            : num [1:35549] 16 16 16 16 16 16 16 16 16 16 ...
##  $ year           : num [1:35549] 1977 1977 1977 1977 1977 ...
##  $ plot_id        : num [1:35549] 2 3 2 7 3 1 2 1 1 6 ...
##  $ species_id     : chr [1:35549] "NL" "NL" "DM" "DM" ...
##  $ sex            : chr [1:35549] "M" "M" "F" "M" ...
##  $ hindfoot_length: num [1:35549] 32 33 37 36 35 14 NA 37 34 20 ...
##  $ weight         : num [1:35549] NA NA NA NA NA NA NA NA NA NA ...
##  $ date           : POSIXct[1:35549], format: "1977-07-16" "1977-07-16" ...

# Recode sex into a labelled factor: "M" -> "Male", "F" -> "Female".
# Any other value (including NA) is not among the levels and becomes NA.
survey_data$sex <- factor(survey_data$sex,
                          levels = c("M", "F"),
                          labels = c("Male", "Female"))

# Before labelling plot_id, explore the structure of plot_info to decide the steps required.
# plot_info has 2 variables: plot_id (numeric) and plot_type (character).
str(plot_info)

## tibble [24 x 2] (S3: tbl_df/tbl/data.frame)
##  $ plot_id  : num [1:24] 1 2 3 4 5 6 7 8 9 10 ...
##  $ plot_type: chr [1:24] "Spectab exclosure" "Control" "Long-term Krat Exclosure" "Control" ...

# View the first few rows (head() shows 6 rows by default)
head(plot_info)

## # A tibble: 6 x 2
##   plot_id plot_type                
##     <dbl> <chr>                    
## 1       1 Spectab exclosure        
## 2       2 Control                  
## 3       3 Long-term Krat Exclosure 
## 4       4 Control                  
## 5       5 Rodent Exclosure         
## 6       6 Short-term Krat Exclosure

# Replace the numeric plot_id in survey_data with the matching plot_type text
# from plot_info (sheet 2). Several plot ids share the same plot_type, so the
# repeated labels collapse into one factor level per plot type.
survey_data$plot_id <- with(
  plot_info,
  factor(survey_data$plot_id, levels = plot_id, labels = plot_type)
)

# Explore the structure of species_info to decide the steps required.
# species_info has 4 variables, all character: species_id, genus, species, taxa.
str(species_info)

## tibble [54 x 4] (S3: tbl_df/tbl/data.frame)
##  $ species_id: chr [1:54] "AB" "AH" "AS" "BA" ...
##  $ genus     : chr [1:54] "Amphispiza" "Ammospermophilus" "Ammodramus" "Baiomys" ...
##  $ species   : chr [1:54] "bilineata" "harrisi" "savannarum" "taylori" ...
##  $ taxa      : chr [1:54] "Bird" "Rodent" "Bird" "Rodent" ...

# Combine genus and species into a single label, stored as a new column
# species_lab of species_info. paste() joins its arguments with one space
# by default, giving "<genus> <species>".
species_info$species_lab <- with(species_info, paste(genus, species))

# Preview the result. Rows 1-10 are sliced, but head() prints only the
# first 6 of them.
head(species_info[1:10, ])

## # A tibble: 6 x 5
##   species_id genus            species         taxa   species_lab                
##   <chr>      <chr>            <chr>           <chr>  <chr>                      
## 1 AB         Amphispiza       bilineata       Bird   Amphispiza bilineata       
## 2 AH         Ammospermophilus harrisi         Rodent Ammospermophilus harrisi   
## 3 AS         Ammodramus       savannarum      Bird   Ammodramus savannarum      
## 4 BA         Baiomys          taylori         Rodent Baiomys taylori            
## 5 CB         Campylorhynchus  brunneicapillus Bird   Campylorhynchus brunneicap~
## 6 CM         Calamospiza      melanocorys     Bird   Calamospiza melanocorys

# Clean whitespace in species_lab so the labels are safe to use as a join key:
#   1. collapse every internal run of whitespace to a single space
#   2. strip leading AND trailing whitespace
# (Previously only trailing whitespace was removed, although the intent was to
# handle leading, trailing and in-between whitespace.)
species_info$species_lab <- trimws(
  gsub("[[:space:]]+", " ", species_info$species_lab),
  which = "both"
)

# Preview the cleaned labels. Rows 1-10 are sliced, but head() prints only
# the first 6 of them.
head(species_info[1:10, ])

## # A tibble: 6 x 5
##   species_id genus            species         taxa   species_lab                
##   <chr>      <chr>            <chr>           <chr>  <chr>                      
## 1 AB         Amphispiza       bilineata       Bird   Amphispiza bilineata       
## 2 AH         Ammospermophilus harrisi         Rodent Ammospermophilus harrisi   
## 3 AS         Ammodramus       savannarum      Bird   Ammodramus savannarum      
## 4 BA         Baiomys          taylori         Rodent Baiomys taylori            
## 5 CB         Campylorhynchus  brunneicapillus Bird   Campylorhynchus brunneicap~
## 6 CM         Calamospiza      melanocorys     Bird   Calamospiza melanocorys

# Relabel species_id in survey_data: the codes in species_info$species_id are
# the factor levels and the matching species_lab strings are the labels.
survey_data$species_id <- with(
  species_info,
  factor(survey_data$species_id, levels = species_id, labels = species_lab)
)

# Attach genus and taxa to every survey record by joining survey_data (x) to
# species_info (y) on the species label.
# merge() keeps only rows with a match in both tables (all = FALSE), so
# records without a matching species label are dropped: str() reports
# 35,549 observations before this step and 34,786 after it.
survey_data <- merge(
  x = survey_data,
  y = species_info[, c("genus", "taxa", "species_lab")],
  by.x = "species_id",
  by.y = "species_lab"
)


# Check the structure of the main data after labelling and merging
str(survey_data)

## 'data.frame':    34786 obs. of  12 variables:
##  $ species_id     : Factor w/ 54 levels "Amphispiza bilineata",..: 3 3 2 2 2 2 2 2 2 2 ...
##  $ record_id      : num  18932 20588 27074 12824 22059 ...
##  $ month          : num  8 1 10 5 2 4 5 10 10 12 ...
##  $ day            : num  7 24 26 28 4 21 15 24 25 14 ...
##  $ year           : num  1991 1993 1997 1987 1995 ...
##  $ plot_id        : Factor w/ 5 levels "Spectab exclosure",..: 3 2 2 5 2 1 4 3 5 4 ...
##  $ sex            : Factor w/ 2 levels "Male","Female": NA NA NA NA NA NA NA NA NA NA ...
##  $ hindfoot_length: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ weight         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ date           : POSIXct, format: "1991-08-07" "1993-01-24" ...
##  $ genus          : chr  "Ammodramus" "Ammodramus" "Ammospermophilus" "Ammospermophilus" ...
##  $ taxa           : chr  "Bird" "Bird" "Rodent" "Rodent" ...

# Show the first 6 rows of the merged data
head(survey_data)

##                 species_id record_id month day year                   plot_id
## 1    Ammodramus savannarum     18932     8   7 1991  Long-term Krat Exclosure
## 2    Ammodramus savannarum     20588     1  24 1993                   Control
## 3 Ammospermophilus harrisi     27074    10  26 1997                   Control
## 4 Ammospermophilus harrisi     12824     5  28 1987 Short-term Krat Exclosure
## 5 Ammospermophilus harrisi     22059     2   4 1995                   Control
## 6 Ammospermophilus harrisi     18618     4  21 1991         Spectab exclosure
##    sex hindfoot_length weight       date            genus   taxa
## 1 <NA>              NA     NA 1991-08-07       Ammodramus   Bird
## 2 <NA>              NA     NA 1993-01-24       Ammodramus   Bird
## 3 <NA>              NA     NA 1997-10-26 Ammospermophilus Rodent
## 4 <NA>              NA     NA 1987-05-28 Ammospermophilus Rodent
## 5 <NA>              NA     NA 1995-02-04 Ammospermophilus Rodent
## 6 <NA>              NA     NA 1991-04-21 Ammospermophilus Rodent

# Drop species_id factor levels that no longer occur in survey_data
survey_data$species_id <- droplevels(survey_data$species_id)

# Flag missing values with 0/1 indicator columns:
#   sexmiss    = 1 when sex is NA, otherwise 0
#   weightmiss = 1 when weight is NA, otherwise 0
#   anymiss    = 1 when sex or weight (or both) is NA, otherwise 0
survey_data$sexmiss    <- as.numeric(is.na(survey_data$sex))
survey_data$weightmiss <- as.numeric(is.na(survey_data$weight))
survey_data$anymiss    <- pmax(survey_data$sexmiss, survey_data$weightmiss)

# Sanity check (rows = sexmiss, columns = anymiss): every record with missing
# sex must also have anymiss = 1, so the sexmiss = 1 / anymiss = 0 cell must be 0.
# The anymiss = 1 column adds up to the number of records that will be excluded.
table(survey_data$sexmiss, survey_data$anymiss)

##    
##         0     1
##   0 32182   856
##   1     0  1748

# Count records by anymiss: 1 means at least one of sex or weight is missing,
# 0 means both values are present.
table(survey_data$anymiss)

## 
##     0     1 
## 32182  2604

# Build the analysis data set from complete records only: keep the rows whose
# anymiss indicator is not 1 (i.e. is 0) and store them in survey_data2.
survey_data2 <- survey_data[survey_data$anymiss != 1, ]

# Dimensions are reported as rows x columns
dim(survey_data2)

## [1] 32182    15

# Structure of the filtered data, including each column's type (factor, numeric, etc.)
str(survey_data2)

## 'data.frame':    32182 obs. of  15 variables:
##  $ species_id     : Factor w/ 48 levels "Amphispiza bilineata",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ record_id      : num  17486 19121 19249 17374 18252 ...
##  $ month          : num  4 10 11 4 1 3 12 11 3 5 ...
##  $ day            : num  26 10 13 24 12 7 15 14 14 25 ...
##  $ year           : num  1990 1991 1991 1990 1991 ...
##  $ plot_id        : Factor w/ 5 levels "Spectab exclosure",..: 3 2 3 1 3 3 3 3 3 3 ...
##  $ sex            : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 2 2 2 2 2 ...
##  $ hindfoot_length: num  14 13 12 14 14 14 14 16 14 14 ...
##  $ weight         : num  7 6 8 7 9 9 8 9 8 10 ...
##  $ date           : POSIXct, format: "1990-04-26" "1991-10-10" ...
##  $ genus          : chr  "Baiomys" "Baiomys" "Baiomys" "Baiomys" ...
##  $ taxa           : chr  "Rodent" "Rodent" "Rodent" "Rodent" ...
##  $ sexmiss        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ weightmiss     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ anymiss        : num  0 0 0 0 0 0 0 0 0 0 ...

# The same data viewed as its first 6 rows instead of a structure summary
head(survey_data2)

##          species_id record_id month day year                  plot_id    sex
## 743 Baiomys taylori     17486     4  26 1990 Long-term Krat Exclosure   Male
## 744 Baiomys taylori     19121    10  10 1991                  Control   Male
## 745 Baiomys taylori     19249    11  13 1991 Long-term Krat Exclosure   Male
## 746 Baiomys taylori     17374     4  24 1990        Spectab exclosure   Male
## 747 Baiomys taylori     18252     1  12 1991 Long-term Krat Exclosure Female
## 748 Baiomys taylori     19775     3   7 1992 Long-term Krat Exclosure Female
##     hindfoot_length weight       date   genus   taxa sexmiss weightmiss anymiss
## 743              14      7 1990-04-26 Baiomys Rodent       0          0       0
## 744              13      6 1991-10-10 Baiomys Rodent       0          0       0
## 745              12      8 1991-11-13 Baiomys Rodent       0          0       0
## 746              14      7 1990-04-24 Baiomys Rodent       0          0       0
## 747              14      9 1991-01-12 Baiomys Rodent       0          0       0
## 748              14      9 1992-03-07 Baiomys Rodent       0          0       0

# Print the column names with their positions; used below to identify the
# columns to exclude.
names(survey_data2)

##  [1] "species_id"      "record_id"       "month"           "day"            
##  [5] "year"            "plot_id"         "sex"             "hindfoot_length"
##  [9] "weight"          "date"            "genus"           "taxa"           
## [13] "sexmiss"         "weightmiss"      "anymiss"

# Remove the three helper indicator columns (sexmiss, weightmiss, anymiss),
# which occupy positions 13-15, keeping the 12 original variables.
survey_data2 <- survey_data2[, !names(survey_data2) %in% c("sexmiss", "weightmiss", "anymiss")]

0.3 Manipulate and summarise the data using dplyr

# Derive a numeric season code from the month number:
#   0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov
# The lookup vector holds the code for months 1 to 12 in order; match() returns
# NA for any value that is not a month number, so those rows get NA.
survey_data2$season <- c(0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 0)[match(survey_data2$month, 1:12)]
# check the number of records per season code
table(survey_data2$season)

## 
##    0    1    2    3 
## 7401 8947 7779 8055

# Convert the numeric season codes (0-3) into a labelled factor.
# NOTE(review): code 0 is labelled "Summer", code 2 "Winter" - this looks like
# the southern-hemisphere convention; confirm it suits the study site.
survey_data2$season <- factor(
  survey_data2$season,
  levels = 0:3,                                           # the codes present in the data
  labels = c("Summer", "Autumn", "Winter", "Spring")      # label for each code, in order
)
# check that the labels line up with the counts seen for the numeric codes
table(survey_data2$season)

## 
## Summer Autumn Winter Spring 
##   7401   8947   7779   8055

0.4 Exploratory Data Analysis

# summary table

# counts: number of records in each season, returned as a plain data.frame
# rather than a tibble
summSeason <- survey_data2 %>%
  count(season) %>%
  as.data.frame()
# percentage of all records that fall in each season (1 decimal place);
# the denominator is the total number of rows in survey_data2
summSeason$p <- round(summSeason$n / nrow(survey_data2) * 100, 1)

# Count records by sex within each season and express every count as a
# percentage of that season's total (rounded to 1 decimal place).
# The percentages are computed per season group instead of from hard-coded row
# positions, so they remain correct if a season/sex combination is absent.
# Result columns: season, sex, n, p (plain data.frame, ordered by season then sex).
summSex <- survey_data2 %>%
  count(season, sex) %>%
  group_by(season) %>%
  mutate(p = round(n / sum(n) * 100, 1)) %>%
  ungroup() %>%
  as.data.frame()

# Count records by species within each season and express every count as a
# percentage of that season's total (rounded to 3 decimal places).
# The percentages are computed per season group; the number of species seen
# differs between seasons, so fixed row ranges would be tied to one exact data set.
# Result columns: season, species_id, n, p (plain data.frame).
summSpecies <- survey_data2 %>%
  count(season, species_id) %>%
  group_by(season) %>%
  mutate(p = round(n / sum(n) * 100, 3)) %>%
  ungroup() %>%
  as.data.frame()


# Seasonal mean of the numeric measurements (hindfoot_length, weight),
# ignoring NAs and rounded to 3 decimal places.
# Rounding is applied to the columns in place so the result stays a
# data.frame (columns: season, hindfoot_length, weight) instead of being
# flattened into a bare list.
summMeans <- survey_data2 %>%
  select(season, hindfoot_length, weight) %>%
  group_by(season) %>%
  summarise(across(.cols = everything(), ~ mean(.x, na.rm = TRUE))) %>%
  mutate(across(c(hindfoot_length, weight), ~ round(.x, 3))) %>%
  as.data.frame()


# Seasonal standard deviation of the same measurements, with the same NA
# handling, rounding and output shape as summMeans.
summSD <- survey_data2 %>%
  select(season, hindfoot_length, weight) %>%
  group_by(season) %>%
  summarise(across(.cols = everything(), ~ sd(.x, na.rm = TRUE))) %>%
  mutate(across(c(hindfoot_length, weight), ~ round(.x, 3))) %>%
  as.data.frame()

# Plot type (exclosure treatment, i.e. location) by season - does the mix of
# plots sampled differ over the year?
# Counts are converted to percentages of each season's total (1 decimal
# place), computed per season group rather than from hard-coded row ranges.
# Result columns: season, plot_id, n, p (plain data.frame).
summPlots <- survey_data2 %>%
  count(season, plot_id) %>%
  group_by(season) %>%
  mutate(p = round(n / sum(n) * 100, 1)) %>%
  ungroup() %>%
  as.data.frame()

# Build the display table t1: one row per reported measure (12 rows) and one
# text column per season, ready to be passed to kable().
#
# Values for the sex and plot rows are looked up by level NAME, not by row
# position. The sex factor's level order is Male, Female, so positional
# indexing under the row labels "Female", "Male" attached each count to the
# wrong sex. A level that is absent from a season shows as "NA (NA%)".
#
# Reads summSeason, summSex, summMeans, summSD and summPlots; the helpers are
# kept inside local() so they do not leak into the workspace.
t1 <- local({
  # cell text "n (p%)"
  n_pct <- function(n, p) paste0(n, " (", p, "%)")
  # cell text "mean (SD)", both rounded to 2 decimal places
  mean_sd <- function(m, s) paste0(round(m, 2), " (", round(s, 2), ")")

  # data values of the level rows, in the order they are displayed
  sex_rows  <- c("Female", "Male")
  plot_rows <- c("Spectab exclosure", "Control", "Long-term Krat Exclosure",
                 "Rodent Exclosure", "Short-term Krat Exclosure")

  # assemble the 12 cells of one season column
  season_col <- function(s) {
    i_n  <- match(s, summSeason$season)
    i_m  <- match(s, summMeans$season)
    i_sd <- match(s, summSD$season)
    sex_s  <- summSex[which(summSex$season == s), ]
    plot_s <- summPlots[which(summPlots$season == s), ]
    i_sex  <- match(sex_rows, sex_s$sex)
    i_plot <- match(plot_rows, plot_s$plot_id)

    c(n_pct(summSeason$n[i_n], summSeason$p[i_n]),
      "",  # blank cell on the Sex title row
      n_pct(sex_s$n[i_sex], sex_s$p[i_sex]),
      mean_sd(summMeans$hindfoot_length[i_m], summSD$hindfoot_length[i_sd]),
      mean_sd(summMeans$weight[i_m], summSD$weight[i_sd]),
      "",  # blank cell on the Plot Information title row
      n_pct(plot_s$n[i_plot], plot_s$p[i_plot]))
  }

  data.frame(vars = c(paste0("**Overall Count** ", " *n (%)*"),
                      paste0("**Sex** ", " *n (%)*"),
                      "*Female*", "*Male*",
                      paste0("**Hindfoot Length** ", "mm", " *mean (SD)*"),
                      paste0("**Weight** ", "g", " *mean (SD)*"),
                      paste0("**Plot Information** ", "*n (%)*"),
                      "*Spectab Exclosure*", "*Control*", "*Long-term Krat Exclosure*",
                      "*Rodent Exclosure*", "*Short-term Krat Exclosure*"),
             summer = season_col("Summer"),
             autumn = season_col("Autumn"),
             winter = season_col("Winter"),
             spring = season_col("Spring"),
             stringsAsFactors = FALSE)
})

0.5 Tabulate with kable and kableExtra

# Render t1 as a styled table with kable + kableExtra.
# Logical arguments are spelled TRUE (T can be reassigned), and the body rows
# are addressed via nrow(t1) instead of a hard-coded 1:12.
# NOTE(review): `type` does not appear to be a documented kable() argument and
# is presumably just forwarded through `...` - confirm whether it is needed.
kable(t1,
      caption = "**Summary of study variables by season**",  # table caption
      col.names = c("", "Summer", "Autumn", "Winter", "Spring"),  # "" leaves the first column untitled
      align = "lcccc",
      type = "") %>%
  column_spec(1, width_min = "5cm", border_right = TRUE) %>%
  column_spec(2:5, width_min = "3cm") %>%
  row_spec(0, bold = TRUE, color = "ivory", background = "#666666") %>%  # header row
  row_spec(seq_len(nrow(t1)), color = "black", background = "white") %>%  # all body rows
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  add_indent(c(3, 4, 8:12)) %>%  # indent the level rows under Sex and Plot Information
  footnote(general = "*% are within groups for all non-missing values*") %>%  # add footnote
  add_header_above(c(" " = 1, "Season" = 4), bold = TRUE, color = "black",
                   background = "white", include_empty = TRUE)

**Summary of study variables by season**

	Summer	Autumn	Winter	Spring
Overall Count n (%)	7401 (23%)	8947 (27.8%)	7779 (24.2%)	8055 (25%)
Sex n (%)
Female	3981 (53.8%)	4833 (54%)	3897 (50.1%)	4168 (51.7%)
Male	3420 (46.2%)	4114 (46%)	3882 (49.9%)	3887 (48.3%)
Hindfoot Length mm mean (SD)	29.03 (9.56)	29.74 (9.89)	28.91 (9.2)	29.12 (9.43)
Weight g mean (SD)	40.77 (34.39)	44.72 (36.74)	40.93 (35.95)	43.55 (38.45)
Plot Information n (%)
Spectab Exclusure	885 (12%)	1092 (12.2%)	840 (10.8%)	888 (11%)
Control	3329 (45%)	4070 (45.5%)	3494 (44.9%)	3718 (46.2%)
Long-term Krat Exclosure	1057 (14.3%)	1247 (13.9%)	1215 (15.6%)	1157 (14.4%)
Rodent Exclosure	986 (13.3%)	1048 (11.7%)	869 (11.2%)	894 (11.1%)
Short-term Krat Exclusure	1144 (15.5%)	1490 (16.7%)	1361 (17.5%)	1398 (17.4%)
Note:
% are within groups for all non-missing values

0.6 Data visualisation with ggplot2

# Drop species_id levels that have no records in survey_data2, and store
# genus as a factor for plotting.
survey_data2$species_id <- droplevels(survey_data2$species_id)
survey_data2$genus <- factor(survey_data2$genus)

# p1: number of samples collected per year, one line per genus.
# The data are first reduced to one count (n) per year/genus combination.
p1 <- survey_data2 %>%
  count(year, genus) %>%
  ggplot(aes(x = year, y = n, colour = genus)) +
  geom_line(size = 1.2) +
  labs(x = "Year", y = "Number of Samples Collected (by genus)") +
  theme_bw() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 90)
  )

# p2: weight against hindfoot length, one panel per plot type, points
# coloured by genus (legend hidden).
p2 <- ggplot(
  survey_data2,
  aes(x = hindfoot_length, y = weight, colour = as.factor(genus))
) +
  geom_point(alpha = 0.5, na.rm = TRUE) +
  facet_grid(. ~ plot_id) +
  scale_colour_discrete(name = "Genus") +
  labs(x = "Hindfoot Length (mm)", y = "Weight (g)") +
  theme_bw() +
  theme(legend.position = "none")
  
# p3: distribution of hindfoot length by sex - a density-scaled histogram
# with a density curve overlaid, one facet row per sex.
# after_stat(density) replaces the superseded ..density.. notation.
# NOTE: p3 is created here but is not printed in the code shown in this report.
p3 <- ggplot(survey_data2, aes(x = hindfoot_length, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", bins = 50, alpha = 0.5) +
  geom_density(size = 1, alpha = 0.4) +
  theme_bw() +
  facet_grid(sex ~ .) +
  theme(legend.position = "none") +
  xlab("Hindfoot Length (mm)") +
  ylab("Density")

# p3a: hindfoot length by genus - boxplots with the individual (jittered)
# observations drawn on top, flipped so genus names read horizontally.
# NOTE(review): weight = 1.5 is passed to geom_boxplot(); if a thicker outline
# was intended this may not be the right argument - confirm.
# NOTE: p3a is created here but is not printed in the code shown in this report.
p3a <- ggplot(
  survey_data2,
  aes(x = as.factor(genus), y = hindfoot_length, colour = as.factor(genus))
) +
  geom_boxplot(fill = "white", weight = 1.5) +
  geom_point(position = "jitter", alpha = 0.1) +
  labs(x = "Genus", y = "Hindfoot Length (mm)") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  ) +
  coord_flip()

# p4: distribution of weight by sex - a density-scaled histogram with a
# density curve overlaid, one facet row per sex.
# after_stat(density) replaces the superseded ..density.. notation.
# NOTE: p4 is created here but is not printed in the code shown in this report.
p4 <- ggplot(survey_data2, aes(x = weight, fill = sex)) +
  geom_histogram(aes(y = after_stat(density)), colour = "black", bins = 50, alpha = 0.5) +
  geom_density(size = 1, alpha = 0.4) +
  theme_bw() +
  facet_grid(sex ~ .) +
  theme(legend.position = "none") +
  xlab("Weight (g)") +
  ylab("Density")

# p4a: weight by genus - boxplots with the individual (jittered) observations
# drawn on top, flipped so genus names read horizontally.
# NOTE(review): weight = 1.5 is passed to geom_boxplot(); if a thicker outline
# was intended this may not be the right argument - confirm.
# NOTE: p4a is created here but is not printed in the code shown in this report.
p4a <- ggplot(
  survey_data2,
  aes(x = as.factor(genus), y = weight, colour = as.factor(genus))
) +
  geom_boxplot(fill = "white", weight = 1.5) +
  geom_point(position = "jitter", alpha = 0.1) +
  labs(x = "Genus", y = "Weight (g)") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  ) +
  coord_flip()

# Stack p2 (panel "a") above p1 (panel "b") in a single two-row figure.
# A heights = c(1, 1.5) argument could be added to change the row proportions.
ggarrange(plotlist = list(p2, p1), labels = c("a", "b"), nrow = 2)

# Weight by sex: boxplots with the individual (jittered) observations drawn
# on top, flipped so the sex labels sit on the vertical axis.
# NOTE(review): weight = 1.5 is passed to geom_boxplot(); if a thicker outline
# was intended this may not be the right argument - confirm.
ggplot(
  survey_data2,
  aes(x = as.factor(sex), y = weight, colour = as.factor(sex))
) +
  geom_boxplot(fill = "white", weight = 1.5) +
  geom_point(position = "jitter", alpha = 0.1) +
  labs(x = "Sex", y = "Weight (g)") +
  theme_bw() +
  theme(
    legend.position = "none",
    axis.text.x = element_text(angle = 90)
  ) +
  coord_flip()

# Distribution of records over the survey years: density-scaled histogram
# (50 bins) with a density curve overlaid.
# after_stat(density) replaces the superseded ..density.. notation.
ggplot(survey_data2, aes(x = year)) +
  labs(title = "Density per Year") +
  geom_histogram(aes(y = after_stat(density)), bins = 50,
                 color = "palegreen3", fill = "palegreen3") +
  geom_density(alpha = .2, fill = "antiquewhite3", color = "black") +
  theme_bw()

0.7 Session information

# Record the R version, platform and attached/loaded package versions used for this report
sessionInfo()

## R version 4.1.2 (2021-11-01)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19043)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_Australia.1252  LC_CTYPE=English_Australia.1252   
## [3] LC_MONETARY=English_Australia.1252 LC_NUMERIC=C                      
## [5] LC_TIME=English_Australia.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggpubr_0.4.0     kableExtra_1.3.4 readxl_1.3.1     forcats_0.5.1   
##  [5] stringr_1.4.0    dplyr_1.0.7      purrr_0.3.4      readr_2.0.1     
##  [9] tidyr_1.1.3      tibble_3.1.4     ggplot2_3.3.5    tidyverse_1.3.1 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.7        svglite_2.1.0     lubridate_1.7.10  assertthat_0.2.1 
##  [5] digest_0.6.27     utf8_1.2.2        R6_2.5.1          cellranger_1.1.0 
##  [9] backports_1.2.1   reprex_2.0.1      evaluate_0.14     highr_0.9        
## [13] httr_1.4.2        pillar_1.6.2      rlang_0.4.11      rstudioapi_0.13  
## [17] car_3.0-12        jquerylib_0.1.4   rmarkdown_2.11    labeling_0.4.2   
## [21] webshot_0.5.2     munsell_0.5.0     broom_0.7.9       compiler_4.1.2   
## [25] modelr_0.1.8      xfun_0.25         pkgconfig_2.0.3   systemfonts_1.0.2
## [29] htmltools_0.5.2   tidyselect_1.1.1  fansi_0.5.0       viridisLite_0.4.0
## [33] crayon_1.4.1      tzdb_0.1.2        dbplyr_2.1.1      withr_2.4.3      
## [37] grid_4.1.2        jsonlite_1.7.2    gtable_0.3.0      lifecycle_1.0.0  
## [41] DBI_1.1.1         magrittr_2.0.1    scales_1.1.1      carData_3.0-5    
## [45] cli_3.0.1         stringi_1.7.4     farver_2.1.0      ggsignif_0.6.3   
## [49] fs_1.5.0          xml2_1.3.2        bslib_0.3.0       ellipsis_0.3.2   
## [53] generics_0.1.0    vctrs_0.3.8       cowplot_1.1.1     tools_4.1.2      
## [57] glue_1.4.2        hms_1.1.0         abind_1.4-5       fastmap_1.1.0    
## [61] yaml_2.2.1        colorspace_2.0-2  rstatix_0.7.0     rvest_1.0.1      
## [65] knitr_1.34        haven_2.4.3       sass_0.4.0

SCH6245 Clinical Bioinformatics in R Markdown

Artika Kirby

26/02/2022

0.1 Load dependencies (provided in session information)

0.2 Data preparation

0.2.1 Data retrieved from https://github.com/AlyceRussell/DataManagement_ReproducibleResearch/tree/master/data

0.3 Manipulate and summarise the data using dplyr

0.4 Exploratory Data Analysis

0.5 Tabulate with kable and kableExtra

0.6 Data visualisation with ggplot2

0.7 Session information