#PART B) The rest of this lab exercise (Part B) is best performed directly within the RMarkdown document (Part_B_Pokemon.Rmd) in our 3520 RStudio project space. Simply open the file here in the BIO-3520 Posit Cloud space and you can implement the exercise directly. We will use RMarkdown for 2 reasons: 1) it provides a handy notebook format where individual “chunks” of code can be run separately, and 2) the notebook, along with all the resulting data, can be formatted easily for export. To run each code chunk, just press the play button and you’ll see the result below. #To see more about RMarkdown, visit: https://rmarkdown.rstudio.com/ #A lot of the capabilities of R require the use of packages installed within the R environment. Packages are available in the CRAN repository but can easily be installed within RStudio. So you can hit the ground running, all of the required packages have been installed for you. However, to use a package it needs to be loaded into the workspace.
library(rmarkdown)
## Default knitr options applied to every chunk of the notebook:
## show the code (echo) and suppress messages and warnings in the output.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  warning = FALSE,
  collapse = TRUE
)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
## Draw 15 colours from the viridis colour scheme
mycolors <- viridis(15)
## Two-colour palette: the 5th and 2nd of those 15 colours
felix_cols <- mycolors[c(5, 2)]
## Four-colour palette: the 15th, 10th, 8th, and 2nd of those 15 colours
felix_4cols <- mycolors[c(15, 10, 8, 2)]
## Two pairs of "simple" colours that we can compare to our nice viridis colours
plain_cols1 <- c("blue", "gray")
plain_cols2 <- c("red", "gray")
## A 21-step palette that ramps through the three listed colours
pats_cols <- colorRampPalette(
  colors = c("#FDE725FF", "white", "#440154FF")
)(n = 21)
## A 10-step palette that ramps from white to blue
leos_cols <- colorRampPalette(colors = c("white", "blue"))(n = 10)
## Join the two words into one string, separated by a single space
x <- paste("Hello", "World", sep = " ")
## Typing the name of an object prints its value
x
## [1] "Hello World"
#——————————————————————————————————————-#
## Load the pokemon dataset from a csv file
pokemon <- read_csv("pokemon_gen_1.csv")
## Click on "pokemon" in the environment console (top right panel) to see what it looks like
## head() previews the first few rows
pokemon %>% head()
## # A tibble: 6 × 7
## pokedex_number name sp_attack sp_defense speed type weight_kg
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 Bulbasaur 65 65 45 grass 6.9
## 2 2 Ivysaur 80 80 60 grass 13
## 3 3 Venusaur 122 120 80 grass 100
## 4 4 Charmander 60 50 65 fire 8.5
## 5 5 Charmeleon 80 65 80 fire 19
## 6 6 Charizard 159 115 100 fire 90.5
## summary() reports some characteristics of every column in the dataset
pokemon %>% summary()
## pokedex_number name sp_attack sp_defense
## Min. : 1.0 Length:151 Min. : 10.0 Min. : 20.00
## 1st Qu.: 38.5 Class :character 1st Qu.: 45.0 1st Qu.: 49.00
## Median : 76.0 Mode :character Median : 65.0 Median : 65.00
## Mean : 76.0 Mean : 69.4 Mean : 67.74
## 3rd Qu.:113.5 3rd Qu.: 90.0 3rd Qu.: 85.00
## Max. :151.0 Max. :194.0 Max. :130.00
##
## speed type weight_kg
## Min. : 15.00 Length:151 Min. : 0.10
## 1st Qu.: 45.00 Class :character 1st Qu.: 9.50
## Median : 70.00 Mode :character Median : 30.00
## Mean : 70.15 Mean : 45.83
## 3rd Qu.: 90.00 3rd Qu.: 59.00
## Max. :150.00 Max. :460.00
## NA's :18
## what is the name of the pokemon character at the top of the list? What type is it?
## how many characters are in the dataset? what is their mean speed?
## Count how many pokemon there are of each type
pokemon %>%
  group_by(type) %>%
  summarise(length(type))
## # A tibble: 15 × 2
## type `length(type)`
## <chr> <int>
## 1 bug 12
## 2 dragon 3
## 3 electric 9
## 4 fairy 2
## 5 fighting 7
## 6 fire 12
## 7 ghost 3
## 8 grass 12
## 9 ground 8
## 10 ice 2
## 11 normal 22
## 12 poison 14
## 13 psychic 8
## 14 rock 9
## 15 water 28
## 1. First, we used the pipe operator (%>%) to pass pokemon into the group_by() function
## 2. Second, we used group_by() to group together observations (pokemon) with the same value in the "type" column (water, rock, psychic, etc). Take a look at the pokemon table to understand what it's doing here.
## 3. Third, we used summarise() to group together the types, and then used the length() function on the type column to see how many pokemon of each type there were!!
## Mean sp_attack for each pokemon type
pokemon %>%
  group_by(type) %>%
  summarise(mean(sp_attack))
## # A tibble: 15 × 2
## type `mean(sp_attack)`
## <chr> <dbl>
## 1 bug 45.8
## 2 dragon 73.3
## 3 electric 91.7
## 4 fairy 77.5
## 5 fighting 45
## 6 fire 88.8
## 7 ghost 128.
## 8 grass 89.8
## 9 ground 35.6
## 10 ice 105
## 11 normal 55.6
## 12 poison 57.1
## 13 psychic 114.
## 14 rock 61.7
## 15 water 68.0
## Mean sp_attack for each pokemon type
## NOTE(review): this computes the same summary as the chunk before it;
## confirm whether a different column (e.g. speed) was intended here.
pokemon %>%
  group_by(type) %>%
  summarise(mean(sp_attack))
## # A tibble: 15 × 2
## type `mean(sp_attack)`
## <chr> <dbl>
## 1 bug 45.8
## 2 dragon 73.3
## 3 electric 91.7
## 4 fairy 77.5
## 5 fighting 45
## 6 fire 88.8
## 7 ghost 128.
## 8 grass 89.8
## 9 ground 35.6
## 10 ice 105
## 11 normal 55.6
## 12 poison 57.1
## 13 psychic 114.
## 14 rock 61.7
## 15 water 68.0
## select() allows us to pick only certain columns from the dataset
## filter() allows us to pick only certain rows from the dataset, based on the value in a column
## Here, we first select the name, type, and sp_attack columns
## Then, we filter so that we only keep grass type pokemon with sp_attack higher than 20.
## Finally, we put this data into a new table (pokemon_trimmed_grass)
## Keep three columns, then keep only the rows for grass types whose
## sp_attack is above 20; the result is stored as a new table
pokemon_trimmed_grass <- pokemon %>%
  select(name, type, sp_attack) %>%
  filter(sp_attack > 20, type == "grass")
## Preview the first few rows of the new table
head(pokemon_trimmed_grass)
## # A tibble: 6 × 3
## name type sp_attack
## <chr> <chr> <dbl>
## 1 Bulbasaur grass 65
## 2 Ivysaur grass 80
## 3 Venusaur grass 122
## 4 Oddish grass 75
## 5 Gloom grass 85
## 6 Vileplume grass 110
## mutate() is a function that makes new columns based on columns that we already have
## 1. The code below first makes a new column, attack_defense, that describes the attack/defense ratio
## 2. Then, we make another new column that is the log2 of the attack_defense column, called log_AD
## 3. Finally, it makes another column that describes each character as light or heavy, based on whether their weight_kg column is higher or lower than 40kg
## 4. We put all of these new columns, along with our previous ones, into a new table called "pokemon_descriptors"
## Note: we remove the NA values from the weight_kg column using filter(!is.na(weight_kg)). The "!" before "is.na()" means "not", so we filter OUT the rows with NA values rather than keeping them. If we let them stay in our table, ifelse() would not raise an error, but it would return NA for those rows, so some pokemon would be labelled neither "heavy" nor "light" in the heavy_light column.
pokemon_descriptors <- pokemon %>%
  ## drop the rows that have no weight recorded
  filter(!is.na(weight_kg)) %>%
  mutate(
    ## attack/defense ratio
    attack_defense = sp_attack / sp_defense,
    ## log2 of that ratio
    log_AD = log2(attack_defense),
    ## "heavy" when weight_kg is above 40, otherwise "light"
    heavy_light = ifelse(weight_kg > 40, "heavy", "light")
  )
## Preview the table with its three new columns
head(pokemon_descriptors)
## # A tibble: 6 × 10
## pokedex_number name sp_attack sp_defense speed type weight_kg attack_defense
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 Bulb… 65 65 45 grass 6.9 1
## 2 2 Ivys… 80 80 60 grass 13 1
## 3 3 Venu… 122 120 80 grass 100 1.02
## 4 4 Char… 60 50 65 fire 8.5 1.2
## 5 5 Char… 80 65 80 fire 19 1.23
## 6 6 Char… 159 115 100 fire 90.5 1.38
## # ℹ 2 more variables: log_AD <dbl>, heavy_light <chr>
## Next, we can again take a look at some summaries
## For every type / weight-class combination: how many pokemon there are,
## their mean weight, and their mean attack/defense ratio
pokemon_descriptors %>%
  group_by(type, heavy_light) %>%
  summarise(
    n = n(),
    mean_wt = mean(weight_kg),
    mean_AD = mean(attack_defense)
  )
## # A tibble: 28 × 5
## # Groups: type [15]
## type heavy_light n mean_wt mean_AD
## <chr> <chr> <int> <dbl> <dbl>
## 1 bug heavy 2 55.5 0.705
## 2 bug light 10 16.5 0.881
## 3 dragon heavy 1 210 1
## 4 dragon light 2 9.9 1
## 5 electric heavy 3 59.7 1.37
## 6 electric light 5 15.4 1.20
## 7 fairy light 2 23.8 0.989
## 8 fighting heavy 4 75.1 0.559
## 9 fighting light 3 26.5 0.878
## 10 fire heavy 5 89 1.26
## # ℹ 18 more rows
## take a look at the data. how many light versus heavy bug types are there? What is the mean weight for each of these types?
## Drop rows that are missing speed or weight, then add log_SW,
## the log2 of the speed/weight ratio
pokemon2 <- pokemon %>%
  filter(!is.na(speed), !is.na(weight_kg)) %>%
  mutate(log_SW = log2(speed / weight_kg))
## Keep name, type, and log_SW for the fire and poison types only
fire_poison <- pokemon2 %>%
  select(name, type, log_SW) %>%
  filter(type %in% c("fire", "poison"))
head(fire_poison)
## # A tibble: 6 × 3
## name type log_SW
## <chr> <chr> <dbl>
## 1 Charmander fire 2.93
## 2 Charmeleon fire 2.07
## 3 Charizard fire 0.144
## 4 Ekans poison 2.99
## 5 Arbok poison 0.300
## 6 Nidoran♀ poison 2.55
## Column-by-column summary of the fire/poison table
fire_poison %>% summary()
## name type log_SW
## Length:22 Length:22 Min. :-0.7063
## Class :character Class :character 1st Qu.: 0.4876
## Mode :character Mode :character Median : 1.5352
## Mean : 1.5715
## 3rd Qu.: 2.5311
## Max. : 5.1293
## Mean log_SW for each of the two types
fire_poison %>%
  group_by(type) %>%
  summarise(mean_logSW = mean(log_SW))
## # A tibble: 2 × 2
## type mean_logSW
## <chr> <dbl>
## 1 fire 1.09
## 2 poison 1.98
## Stack the four numeric columns into a long table: the "variable" column
## holds the original column name and the "value" column holds the number
pokemon_long <- pokemon %>%
  pivot_longer(
    cols = c(sp_attack, sp_defense, speed, weight_kg),
    names_to = "variable",
    values_to = "value"
  )
head(pokemon_long)
## # A tibble: 6 × 5
## pokedex_number name type variable value
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 Bulbasaur grass sp_attack 65
## 2 1 Bulbasaur grass sp_defense 65
## 3 1 Bulbasaur grass speed 45
## 4 1 Bulbasaur grass weight_kg 6.9
## 5 2 Ivysaur grass sp_attack 80
## 6 2 Ivysaur grass sp_defense 80
## See how all of the variables are now contained in a single column?
## We can also reverse this to go back to the shorter version if we want to.
## This pretty much produces the same thing as our original "pokemon" table, but it's useful if for some reason we didn't have access to it.
## This (re)creates a shorter version of the data, with one column per variable.
## The call can be adjusted to only show specific parts of the long form data.
pokemon_wider <- pivot_wider(
  pokemon_long,
  names_from = variable,
  values_from = value
)
head(pokemon_wider)
## # A tibble: 6 × 7
## pokedex_number name type sp_attack sp_defense speed weight_kg
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Bulbasaur grass 65 65 45 6.9
## 2 2 Ivysaur grass 80 80 60 13
## 3 3 Venusaur grass 122 120 80 100
## 4 4 Charmander fire 60 50 65 8.5
## 5 5 Charmeleon fire 80 65 80 19
## 6 6 Charizard fire 159 115 100 90.5
## Pull the speed measurements out of the long table: keep the "speed" rows,
## drop the variable and pokedex_number columns, and rename value to speed
pokemon_speed <- pokemon_long %>%
  filter(variable == "speed") %>%
  select(-c(variable, pokedex_number)) %>%
  rename(speed = value)
#——————————————————————————————————————-#
## ggplot is a library in R that allows for simple plotting
## 1. First, we initialize the ggplot object using "ggplot()"
## 2. Second, we use the aesthetics mapping, aes(), to tell ggplot that we want "speed" to be on the x-axis and "sp_attack" to be on the y-axis
## 3. Finally, we use "+" to add to our ggplot object a "geom_point()". This is the most important part, as it actually adds the scatter plot to our figure. There are many different types of "geoms" that we can use, such as geom_line(), geom_bar(), etc.
## Basic scatter plot: speed on the x-axis, sp_attack on the y-axis
ggplot(pokemon, aes(x = speed, y = sp_attack)) +
  geom_point(alpha = 0.7, color = "royalblue")
## Map the point colour to the pokemon type, so that different types are
## represented by different colours (taken from mycolors)
ggplot(pokemon, aes(x = speed, y = sp_attack, color = type)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## Also map the point size to each pokemon's weight!! Rows with NA in the
## weight column are filtered out first so there are no warnings about them.
pokemon %>%
  filter(!is.na(weight_kg)) %>%
  ggplot(aes(x = speed, y = sp_attack, color = type, size = weight_kg)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## We can also make the figure an object in the environment (here called pokemon_scatter_1) and then use ggplotly (a package) to create an interactive plotly widget to explore the data
## Store the figure as an object instead of printing it straight away.
## The extra "description" aesthetic carries each pokemon's name
## (presumably so that it shows up when hovering over a point).
pokemon_scatter_1 <- pokemon %>%
  filter(!is.na(weight_kg)) %>%
  ggplot(
    aes(
      x = speed,
      y = sp_attack,
      color = type,
      size = weight_kg,
      description = name
    )
  ) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## Turn the stored figure into an interactive plotly widget
ggplotly(pokemon_scatter_1)
#Hover over some points. Who is fast with very little attack? Who is slowest? What types are they?
## Second scatter plot: weight on the x-axis, sp_defense on the y-axis,
## coloured by type and sized by speed. Rows without a weight are removed.
pokemon_scatter_2 <- pokemon %>%
  filter(!is.na(weight_kg)) %>%
  ggplot(
    aes(
      x = weight_kg,
      y = sp_defense,
      color = type,
      size = speed,
      description = name
    )
  ) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## Print the stored figure as a regular (static) ggplot first.
## The widget used to be rendered twice in a row; the static plot is shown
## here instead so that the two outputs are actually different.
pokemon_scatter_2
## and here it is as an interactive ggplotly widget
ggplotly(pokemon_scatter_2)
## pheatmap uses a matrix of values, so we need to turn our data table into a "matrix" using the as.matrix() function
## Pick the three numeric trait columns and convert them to a matrix
pokemon_mat1 <- as.matrix(select(pokemon, sp_attack, sp_defense, speed))
## then, we use this matrix to make a heatmap (no clustering of rows or columns)
pheatmap(
  pokemon_mat1,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## NOTE: you may need to use the Zoom button in the plot tab to see the whole heatmap
## The scale was very wide (out of 151) so should be converted to a relative intensity (out of 1).
## This code will do that, and make a new matrix, then make a heatmap.
## Divide each trait by its column maximum so every value is relative
## (out of 1), keep only the relative columns, and convert to a matrix
pokemon_mat2 <- pokemon %>%
  mutate(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  select(rel_attack, rel_defense, rel_speed) %>%
  as.matrix()
## Heatmap of the relative values, again without clustering
pheatmap(
  pokemon_mat2,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## That matrix didn't tell us a huge amount about the data though. Let's incorporate some of the characters and their types.
## To make it simple, let's just get the grass and fire type
## Keep only the grass and fire types
pokemon_fire_grass <- pokemon %>%
  filter(type %in% c("grass", "fire"))
## now lets make a matrix of the relative trait values; each trait is
## divided by its maximum within this fire/grass subset
pokemon_matFG <- pokemon_fire_grass %>%
  mutate(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  select(rel_attack, rel_defense, rel_speed) %>%
  as.matrix()
## use the pokemon names as the row names of the matrix
rownames(pokemon_matFG) <- pokemon_fire_grass[["name"]]
## now we are ready to make an unclustered heat map
## Unclustered heatmap: rows stay in the order of the matrix
pheatmap(
  pokemon_matFG,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 6,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 7
)
## Optionally, we can cluster the rows. This is very helpful to see patterns
## in the data. By default pheatmap uses hierarchical clustering.
## This is a clustered heatmap: the only change is cluster_rows = TRUE.
pheatmap(
  pokemon_matFG,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 6,
  cluster_cols = FALSE,
  cluster_rows = TRUE,
  legend = TRUE,
  fontsize = 7
)
## take a look. do they cluster by type?
#——————————————————————————————————————-#
##For this type of data, the long form is the most appropriate
## Boxplot of speed for every pokemon type
pokemon_long %>%
  filter(variable == "speed") %>%
  ggplot(aes(x = type, y = value)) +
  geom_boxplot()
## We could also compare traits to one another across a couple different
## types: one box per variable, filled with the felix_4cols colours
pokemon_long %>%
  filter(type %in% c("grass", "water", "fire")) %>%
  ggplot(aes(x = type, y = value, fill = variable)) +
  geom_boxplot() +
  scale_fill_manual(values = felix_4cols)
## Note: the scale_fill_manual() line above is what adds some nicer colors (felix_4cols) to this plot
## We could also just compare a few characters (barplot is fine since there is only one value per group)
## Side-by-side bars for three characters, one bar per variable
pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_manual(values = felix_4cols)
## Since these values represent a range of different measurements, it's not
## appropriate to plot them on the same axis. For that we use facet_wrap():
## each variable gets its own panel with its own y-axis scale.
pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~variable, scales = "free_y") +
  scale_fill_manual(values = felix_4cols)
## Boxplots of every variable by type, one panel per variable, each panel
## with its own y-axis scale. The boxes are filled by type, so the legend
## identifies the types.
pokemon_long %>%
  ggplot(aes(y = value, x = type, fill = type)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free_y") +
  theme(
    ## draw the x-axis labels in white
    axis.text.x = element_text(colour = "white"),
    ## zero-length x-axis ticks; the element name is "axis.ticks.length.x"
    ## (it was misspelled as "axis.tricks.length.x", so it was never applied)
    axis.ticks.length.x = unit(0, "cm")
  )
#hi heathyr! it worked :)
##WANT TO KNOW MORE ABOUT BOX PLOTS: look here https://waterdata.usgs.gov/blog/boxplots/ #and here: #https://ggplot2.tidyverse.org/reference/geom_boxplot.html ## If you click “Knit” in the panel above, your browser will open up an easy-to-read webpage that you can use to help you complete part C.
#——————————————————————————————————————-# ##That’s it!! I hope you had some fun! This Training set exposed you to lots of chunks of code, some statistical operations, & plotting features.
##This should prepare you for doing Part C. Next step is to try the COVID.Rmd and then choose your own data set in Part C_Template.Rmd. We will host a help session on this.