#Notes from MResBEC Introduction to R (October 14th 2020)
# RStudio's cheat sheets are a handy reference. Here's a link: https://rstudio.com/resources/cheatsheets/
# So is Google's R style guide (ignore anything super complicated!). Here's another link: https://google.github.io/styleguide/Rguide.html
# First download the data (uncomment to run once; the 'data' folder must exist)
# download.file(
#   "https://ndownloader.figshare.com/files/2292169",
#   "data/portal_data_joined.csv"
# )
# Then read the CSV, keeping text columns as plain character vectors
surveys <- read.csv(
  "data/portal_data_joined.csv",
  stringsAsFactors = FALSE
)
# Inspect the structure of the data: column names, types and first few values
str(surveys)
## 'data.frame': 34786 obs. of 13 variables:
## $ record_id : int 1 72 224 266 349 363 435 506 588 661 ...
## $ month : int 7 8 9 10 11 11 12 1 2 3 ...
## $ day : int 16 19 13 16 12 12 10 8 18 11 ...
## $ year : int 1977 1977 1977 1977 1977 1977 1977 1978 1978 1978 ...
## $ plot_id : int 2 2 2 2 2 2 2 2 2 2 ...
## $ species_id : chr "NL" "NL" "NL" "NL" ...
## $ sex : chr "M" "M" "" "" ...
## $ hindfoot_length: int 32 31 NA NA NA NA NA NA NA NA ...
## $ weight : int NA NA NA NA NA NA NA NA 218 NA ...
## $ genus : chr "Neotoma" "Neotoma" "Neotoma" "Neotoma" ...
## $ species : chr "albigula" "albigula" "albigula" "albigula" ...
## $ taxa : chr "Rodent" "Rodent" "Rodent" "Rodent" ...
## $ plot_type : chr "Control" "Control" "Control" "Control" ...
# Can also load it and have R convert characters to factors (I find this annoying).
# Since R 4.0.0 read.csv() no longer converts by default, so request it
# explicitly; on older versions of R this is the same as the default.
surveys <- read.csv("data/portal_data_joined.csv", stringsAsFactors = TRUE)
# See the difference: the text columns are now reported as Factor
str(surveys)
## 'data.frame': 34786 obs. of 13 variables:
## $ record_id : int 1 72 224 266 349 363 435 506 588 661 ...
## $ month : int 7 8 9 10 11 11 12 1 2 3 ...
## $ day : int 16 19 13 16 12 12 10 8 18 11 ...
## $ year : int 1977 1977 1977 1977 1977 1977 1977 1978 1978 1978 ...
## $ plot_id : int 2 2 2 2 2 2 2 2 2 2 ...
## $ species_id : Factor w/ 48 levels "AB","AH","AS",..: 16 16 16 16 16 16 16 16 16 16 ...
## $ sex : Factor w/ 3 levels "","F","M": 3 3 1 1 1 1 1 1 3 1 ...
## $ hindfoot_length: int 32 31 NA NA NA NA NA NA NA NA ...
## $ weight : int NA NA NA NA NA NA NA NA 218 NA ...
## $ genus : Factor w/ 26 levels "Ammodramus","Ammospermophilus",..: 13 13 13 13 13 13 13 13 13 13 ...
## $ species : Factor w/ 40 levels "albigula","audubonii",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ taxa : Factor w/ 4 levels "Bird","Rabbit",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ plot_type : Factor w/ 5 levels "Control","Long-term Krat Exclosure",..: 1 1 1 1 1 1 1 1 1 1 ...
# How to force something to be a factor: as.factor() converts the column and
# the result is assigned back with `<-` (the idiomatic assignment operator)
surveys$species <- as.factor(surveys$species)
# Have a look at the first few rows
head(surveys)
## record_id month day year plot_id species_id sex hindfoot_length weight
## 1 1 7 16 1977 2 NL M 32 NA
## 2 72 8 19 1977 2 NL M 31 NA
## 3 224 9 13 1977 2 NL NA NA
## 4 266 10 16 1977 2 NL NA NA
## 5 349 11 12 1977 2 NL NA NA
## 6 363 11 12 1977 2 NL NA NA
## genus species taxa plot_type
## 1 Neotoma albigula Rodent Control
## 2 Neotoma albigula Rodent Control
## 3 Neotoma albigula Rodent Control
## 4 Neotoma albigula Rodent Control
## 5 Neotoma albigula Rodent Control
## 6 Neotoma albigula Rodent Control
# And here are some pretty plots...
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Build the 'complete' survey data frame: keep only the rows that have a
# species_id, a weight, a hindfoot_length and a sex recorded
surveys_complete <- surveys %>%
  filter(
    species_id != "",         # drop rows with an empty species_id
    !is.na(weight),           # drop rows with no weight
    !is.na(hindfoot_length),  # drop rows with no hindfoot_length
    sex != ""                 # drop rows with an empty sex
  )
# Count the complete records per species and keep the ids of the species
# that have at least 50 of them
species_counts <- surveys_complete %>%
  dplyr::group_by(species_id) %>%
  tally() %>%
  filter(n >= 50) %>%
  select(species_id)

# Restrict the complete data to those well-sampled species
surveys_complete <- filter(
  surveys_complete,
  species_id %in% species_counts$species_id
)
# Scatter plot of hindfoot length against weight: one translucent point per
# record, coloured by species. Uses the filtered 'surveys_complete' built above,
# so no rows have to be dropped for missing values.
ggplot(surveys_complete, aes(x = weight, y = hindfoot_length)) +
  geom_point(alpha = 0.1, aes(color = species_id))
# Note: plotting the unfiltered 'surveys' instead gives
# "Warning: Removed 4048 rows containing missing values (geom_point)."