Introduction:
Part 1 had you follow my instructions for loading and working with
data. This part will have you dive into a VERY useful package in R
called “dplyr”. This package allows you to easily manipulate and rework
data. To complete this lab, you will be following the instructions on
this website:
https://datacarpentry.org/R-genomics/04-dplyr.html
***You will need to bring the “metadata.csv” file into R (found on
Canvas). NOTE: in order for the code you use later on to work, you NEED
to name your data frame “metadata”.
Follow this guide step-by-step. This will be as simple as copying and
pasting code from the site. You WILL need to perform the challenge
outlined!
Place all of your code for this assignment below and knit this to an
HTML file to save and upload to Canvas.
# Install dplyr (only if missing), load the package, and read in the file
# Installing unconditionally re-downloads dplyr on every run/knit and can
# fail in a non-interactive knit when no CRAN mirror is configured, so the
# install step runs only when the package is not already available.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)

# The later code refers to this data frame by the name `metadata`.
metadata <- read.csv("Ecoli_metadata.csv")
# Common dplyr verbs
# select(): keep only the listed columns of `metadata`.
select(
  metadata,
  sample, clade, cit, genome_size
)
# filter(): keep only the rows of `metadata` where cit equals "plus".
filter(
  metadata,
  cit == "plus"
)
# Chaining select and filter with pipes
# Send `metadata` through filter() first, keeping the rows where cit equals
# "plus", and then through select(), keeping the sample, generation, and
# clade columns.
metadata %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

# Store the same reduced table under a new name, then display it.
meta_citplus <- metadata %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)

meta_citplus
# Challenge
# Using pipes, subset the data to include rows where the clade is "Cit+".
# Retain columns sample, cit, and genome_size.
filtered_data <- metadata %>%
  filter(clade == "Cit+") %>%
  select(sample, cit, genome_size)

# Display the result: an assignment on its own prints nothing, so without
# this line the challenge answer would not appear in the knitted report.
filtered_data
# mutate(): adding new columns
# Add a column with the genome size in bp (genome_size multiplied by 1e6).
metadata %>%
  mutate(genome_bp = genome_size * 1e6)

# The full table may run off the screen; pipe the result into head() to see
# only the first few rows.
metadata %>%
  mutate(genome_bp = genome_size * 1e6) %>%
  head()

# Same as above, but drop the rows whose clade is NA before taking head().
metadata %>%
  mutate(genome_bp = genome_size * 1e6) %>%
  filter(!is.na(clade)) %>%
  head()
# Split-apply-combine data analysis
# Split the data into groups, apply an analysis to each group, and then
# combine the per-group results into a single table.

# Count the rows in each cit group.
metadata %>%
  group_by(cit) %>%
  summarize(n())

# Mean genome size per cit group; na.rm = TRUE ignores NA values.
metadata %>%
  group_by(cit) %>%
  summarize(mean_size = mean(genome_size, na.rm = TRUE))
# Group by multiple columns
# Mean genome size for every cit/clade combination. `.groups = "drop"`
# (dplyr >= 1.0.0) returns an ungrouped result; without it the output stays
# grouped by cit and summarize() emits a message about the remaining grouping.
metadata %>%
  group_by(cit, clade) %>%
  summarize(
    mean_size = mean(genome_size, na.rm = TRUE),
    .groups = "drop"
  )

# Discard rows with missing information: drop summary rows whose clade is NA.
metadata %>%
  group_by(cit, clade) %>%
  summarize(
    mean_size = mean(genome_size, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  filter(!is.na(clade))

# Summarize multiple variables at the same time.
metadata %>%
  group_by(cit, clade) %>%
  summarize(
    mean_size = mean(genome_size, na.rm = TRUE),
    min_generation = min(generation),
    .groups = "drop"
  )
LS0tCnRpdGxlOiAiQklONTEwIFVOSVQgMTogTGFiIDEgUEFSVCBJSSIKTmFtZTogQWx5c29uIEJhcnNhbG91CnN1YnRpdGxlOiBMb2FkaW5nIGRwbHlyIGFuZCBtYW5pcHVsYXRpbmcgZGF0YQpvdXRwdXQ6CiAgaHRtbF9ub3RlYm9vazogZGVmYXVsdAogIGh0bWxfZG9jdW1lbnQ6CiAgICBkZl9wcmludDogcGFnZWQKICBwZGZfZG9jdW1lbnQ6IGRlZmF1bHQKLS0tCgojIyBJbnRyb2R1Y3Rpb246ClBhcnQgMSBoYWQgeW91IGZvbGxvdyBteSBpbnN0cnVjdGlvbnMgZm9yIGxvYWRpbmcgYW5kIHdvcmtpbmcgd2l0aCBkYXRhLiBUaGlzIHBhcnQgd2lsbCBoYXZlIHlvdSBkaXZlIGludG8gYSBWRVJZIHVzZWZ1bCBwYWNrYWdlIGluIFIgY2FsbGVkICJkcGx5ciIsIHRoaXMgcGFja2FnZSBhbGxvd3MgeW91IHRvIGVhc2lseSBtYW5pcHVsYXRlIGFuZCByZXdvcmsgZGF0YS4gVG8gY29tcGxldGUgdGhpcyBsYWIsIHlvdSB3aWxsIGJlIGZvbGxvd2luZyB0aGUgaW5zdHJ1Y3Rpb25zIG9uIHRoaXMgd2Vic2l0ZToKCmh0dHBzOi8vZGF0YWNhcnBlbnRyeS5vcmcvUi1nZW5vbWljcy8wNC1kcGx5ci5odG1sCgoqKipZb3Ugd2lsbCBuZWVkIHRvIGJyaW5nIHRoZSAibWV0YWRhdGEuY3N2IiBmaWxlIGludG8gUiAoZm91bmQgb24gY2FudmFzKQpOT1RFOiBpbiBvcmRlciBmb3IgdGhlIGNvZGVzIHlvdSB1c2UgbGF0ZXIgb24gdG8gd29yaywgeW91IE5FRUQgdG8gbmFtZSB5b3VyIGRhdGFmcmFtZSAibWV0YWRhdGEiCgpGb2xsb3cgdGhpcyBndWlkZSBzdGVwLWJ5LXN0ZXAuIFRoaXMgd2lsbCBiZSBhcyBzaW1wbGUgYXMgY29weWluZyBhbmQgcGFzdGluZyBjb2RlIGZyb20gdGhlIHNpdGUuIFlvdSBXSUxMIG5lZWQgdG8gcGVyZm9ybSB0aGUgY2hhbGxlbmdlIG91dGxpbmVkIQoKUGxhY2UgYWxsIG9mIHlvdXIgY29kZSBmb3IgdGhpcyBhc3NpZ25tZW50IGJlbG93IGFuZCBrbml0IHRoaXMgdG8gYW4gSFRNTCB0byBzYXZlIGFuZCB1cGxvYWQgdG8gY2FudmFzLgoKI0luc3RhbGwgZHBseXIsIGxvYWQgdGhlIHBhY2thZ2UsIGFuZCByZWFkIGluIGZpbGUgIApgYGB7cn0KaW5zdGFsbC5wYWNrYWdlcygiZHBseXIiKSAjIyBpbnN0YWxsCmxpYnJhcnkoImRwbHlyIikgICAgICAgICAgIyMgbG9hZAptZXRhZGF0YTwtIHJlYWQuY3N2KCdFY29saV9tZXRhZGF0YS5jc3YnKQpgYGAKCiNQZXJmb3JtIGNvbW1vbiBkcGx5ciBmdW5jdGlvbnMKYGBge3J9CiMjc2VsZWN0IGRhdGEgY29sdW1ucyAKc2VsZWN0KG1ldGFkYXRhLCBzYW1wbGUsIGNsYWRlLCBjaXQsIGdlbm9tZV9zaXplKQojI1RvIGNob29zZSByb3dzIApmaWx0ZXIobWV0YWRhdGEsIGNpdCA9PSAicGx1cyIpCmBgYAojVXNlIHBpcGVzIHRvIHNlbGVjdCBhbmQgZmlsdGVyIApgYGB7cn0KI1VzZSBwaXBlcyB0byBzZW5kIHRoZSBtZXRhZGF0YSBkYXRhIHNldCBmaXJzdCB0aHJvdWdoIGZpbHRlciwgdG8ga2VlcCByb3dzIHdoZXJlICdjaXQnIHdhcyBlcXVhbCB0byAncGx1cycsIGFuZCB0aGVuIHRo
cm91Z2ggc2VsZWN0IHRvIGtlZXAgdGhlIHNhbXBsZSBhbmQgZ2VuZXJhdGlvbiBhbmQgY2xhZGUgY29sdW1ucyAKbWV0YWRhdGEgJT4lCiAgZmlsdGVyKGNpdCA9PSAicGx1cyIpICU+JQogIHNlbGVjdChzYW1wbGUsIGdlbmVyYXRpb24sIGNsYWRlKQojQ3JlYXRlIGEgbmV3IG9iamVjdCB3aXRoIHRoaXMgc21hbGxlciB2ZXJzaW9uIG9mIHRoZSBkYXRhIHdlIGNvdWxkIGRvIHNvIGJ5IGFzc2lnbmluZyBpdCBhIG5ldyBuYW1lCm1ldGFfY2l0cGx1cyA8LSBtZXRhZGF0YSAlPiUKICBmaWx0ZXIoY2l0ID09ICJwbHVzIikgJT4lCiAgc2VsZWN0KHNhbXBsZSwgZ2VuZXJhdGlvbiwgY2xhZGUpCgptZXRhX2NpdHBsdXMKYGBgCgojQ2hhbGxlbmdlIApgYGB7cn0KI1VzaW5nIHBpcGVzLCBzdWJzZXQgdGhlIGRhdGEgdG8gaW5jbHVkZSByb3dzIHdoZXJlIHRoZSBjbGFkZSBpcyDigJhDaXQr4oCZLiBSZXRhaW4gY29sdW1ucyBzYW1wbGUsIGNpdCwgYW5kIGdlbm9tZV9zaXplLgpmaWx0ZXJlZF9kYXRhIDwtIG1ldGFkYXRhICU+JQogIGZpbHRlcihjbGFkZSA9PSAnQ2l0KycpICU+JQogIHNlbGVjdChzYW1wbGUsIGNpdCwgZ2Vub21lX3NpemUpCmBgYAoKI011dGF0ZQpgYGB7cn0KI1RvIGNyZWF0ZSBhIG5ldyBjb2x1bW4gb2YgZ2Vub21lIHNpemUgaW4gYnA6Cm1ldGFkYXRhICU+JQogIG11dGF0ZShnZW5vbWVfYnAgPSBnZW5vbWVfc2l6ZSAqMWU2KQojSWYgdGhpcyBydW5zIG9mZiB5b3VyIHNjcmVlbiBhbmQgeW91IGp1c3Qgd2FudCB0byBzZWUgdGhlIGZpcnN0IGZldyByb3dzLCB5b3UgY2FuIHVzZSBhIHBpcGUgdG8gdmlldyB0aGUgaGVhZCgpIG9mIHRoZSBkYXRhCm1ldGFkYXRhICU+JQogIG11dGF0ZShnZW5vbWVfYnAgPSBnZW5vbWVfc2l6ZSAqMWU2KSAlPiUKICBoZWFkCiNSZW1vdmUgcm93cyB0aGF0IGhhdmUgTkEgdmFsdWVzIGZvciBjbGFkZSAKbWV0YWRhdGEgJT4lCiAgbXV0YXRlKGdlbm9tZV9icCA9IGdlbm9tZV9zaXplICoxZTYpICU+JQogIGZpbHRlcighaXMubmEoY2xhZGUpKSAlPiUKICBoZWFkCmBgYAoKI1NwbGl0LWFwcGx5LWNvbWJpbmUgZGF0YSBhbmFseXNpcyAKYGBge3J9CiNTcGxpdCB0aGUgZGF0YSBpbnRvIGdyb3VwcywgYXBwbHkgc29tZSBhbmFseXNpcyB0byBlYWNoIGdyb3VwLCBhbmQgdGhlbiBjb21iaW5lIHRoZSByZXN1bHRzCm1ldGFkYXRhICU+JQogIGdyb3VwX2J5KGNpdCkgJT4lCiAgc3VtbWFyaXplKG4oKSkKI0lnbm9yZSBOQQptZXRhZGF0YSAlPiUKICBncm91cF9ieShjaXQpICU+JQogIHN1bW1hcml6ZShtZWFuX3NpemUgPSBtZWFuKGdlbm9tZV9zaXplLCBuYS5ybSA9IFRSVUUpKQojR3JvdXAgYnkgbXVsdGlwbGUgY29sdW1ucwptZXRhZGF0YSAlPiUKICBncm91cF9ieShjaXQsIGNsYWRlKSAlPiUKICBzdW1tYXJpemUobWVhbl9zaXplID0gbWVhbihnZW5vbWVfc2l6ZSwgbmEucm0gPSBUUlVFKSkKI0Rpc2NhcmQgcm93cyB3aXRoIG1pc3NpbmcgaW5mb3JtYXRpb24gCm1ldGFkYXRhICU+
JQogIGdyb3VwX2J5KGNpdCwgY2xhZGUpICU+JQogIHN1bW1hcml6ZShtZWFuX3NpemUgPSBtZWFuKGdlbm9tZV9zaXplLCBuYS5ybSA9IFRSVUUpKSAlPiUKICBmaWx0ZXIoIWlzLm5hKGNsYWRlKSkKI1N1bW1hcml6ZSBtdWx0aXBsZSB2YXJhaWJsZSBhdCB0aGUgc2FtZSB0aW1lIAptZXRhZGF0YSAlPiUKICBncm91cF9ieShjaXQsIGNsYWRlKSAlPiUKICBzdW1tYXJpemUobWVhbl9zaXplID0gbWVhbihnZW5vbWVfc2l6ZSwgbmEucm0gPSBUUlVFKSwKICAgICAgICAgICAgbWluX2dlbmVyYXRpb24gPSBtaW4oZ2VuZXJhdGlvbikpCmBgYA==