library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
library(Rcpp)
# Take 15 colours from the viridis colour scheme
mycolors <- viridis(15)
# Two-colour palette: the 5th and 2nd of those 15 colours
felix_cols <- mycolors[c(5, 2)]
# Four-colour palette: the 15th, 10th, 8th, and 2nd of those 15 colours
felix_4cols <- mycolors[c(15, 10, 8, 2)]
# Two pairs of "simple" colours that we can compare to the viridis colours
plain_cols1 <- c("blue", "gray")
plain_cols2 <- c("red", "gray")
# 21-step palette running from viridis yellow, through white, to viridis purple
pats_cols <- colorRampPalette(
  colors = c("#FDE725FF", "white", "#440154FF")
)(21)
# 10-step palette running from white to blue
leos_cols <- colorRampPalette(colors = c("white", "blue"))(10)
# Join two words into one string; paste() puts a single space between them
x <- paste("Hello", "World", sep = " ")
x
## [1] "Hello World"
#——————————————————————————————————————-#
# Load the pokemon dataset from a csv file
pokemon <- read_csv("pokemon_gen_1.csv")
## click on "pokemon" in the environment console (top right panel) to see what it looks like
## head() previews the first few rows (six by default)
head(pokemon, n = 6)
## # A tibble: 6 × 7
## pokedex_number name sp_attack sp_defense speed type weight_kg
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 Bulbasaur 65 65 45 grass 6.9
## 2 2 Ivysaur 80 80 60 grass 13
## 3 3 Venusaur 122 120 80 grass 100
## 4 4 Charmander 60 50 65 fire 8.5
## 5 5 Charmeleon 80 65 80 fire 19
## 6 6 Charizard 159 115 100 fire 90.5
## use the summary function to determine some characteristics of the dataset
pokemon %>% summary()
## pokedex_number name sp_attack sp_defense
## Min. : 1.0 Length:151 Min. : 10.0 Min. : 20.00
## 1st Qu.: 38.5 Class :character 1st Qu.: 45.0 1st Qu.: 49.00
## Median : 76.0 Mode :character Median : 65.0 Median : 65.00
## Mean : 76.0 Mean : 69.4 Mean : 67.74
## 3rd Qu.:113.5 3rd Qu.: 90.0 3rd Qu.: 85.00
## Max. :151.0 Max. :194.0 Max. :130.00
##
## speed type weight_kg
## Min. : 15.00 Length:151 Min. : 0.10
## 1st Qu.: 45.00 Class :character 1st Qu.: 9.50
## Median : 70.00 Mode :character Median : 30.00
## Mean : 70.15 Mean : 45.83
## 3rd Qu.: 90.00 3rd Qu.: 59.00
## Max. :150.00 Max. :460.00
## NA's :18
## what is the name of the pokemon character at the top of the list? What type is it?
## how many characters are in the dataset? what is their mean speed?
# Count how many pokemon there are of each type
pokemon %>%
  group_by(type) %>%
  summarise(`length(type)` = length(type))
## # A tibble: 15 × 2
## type `length(type)`
## <chr> <int>
## 1 bug 12
## 2 dragon 3
## 3 electric 9
## 4 fairy 2
## 5 fighting 7
## 6 fire 12
## 7 ghost 3
## 8 grass 12
## 9 ground 8
## 10 ice 2
## 11 normal 22
## 12 poison 14
## 13 psychic 8
## 14 rock 9
## 15 water 28
## 1. First, we used the pipe operator (%>%) to pass pokemon into the group_by() function
## 2. Second, we used group_by() to group together observations (pokemon) with the same value in the "type" column (water, rock, psychic, etc). Take a look at the pokemon table to understand what it's doing here.
## 3. Third, we used summarise() to collapse each group into a single row, using the length() function on the type column to count how many pokemon of each type there were!!
# Average special attack for each pokemon type
pokemon %>%
  group_by(type) %>%
  summarise(`mean(sp_attack)` = mean(sp_attack))
## # A tibble: 15 × 2
## type `mean(sp_attack)`
## <chr> <dbl>
## 1 bug 45.8
## 2 dragon 73.3
## 3 electric 91.7
## 4 fairy 77.5
## 5 fighting 45
## 6 fire 88.8
## 7 ghost 128.
## 8 grass 89.8
## 9 ground 35.6
## 10 ice 105
## 11 normal 55.6
## 12 poison 57.1
## 13 psychic 114.
## 14 rock 61.7
## 15 water 68.0
## select() allows us to pick only certain columns from the dataset
## filter() allows us to pick only certain rows from the dataset, based on the value in a column
## Here, we first select the name, type, and sp_attack columns
## Then, we filter so that we only keep grass type pokemon with sp_attack higher than 20.
## Finally, we put this data into a new table (pokemon_trimmed_grass)
# Keep three columns, then keep only the grass-type rows with sp_attack above 20.
# Conditions separated by a comma in filter() are combined with AND.
pokemon_trimmed_grass <- pokemon %>%
  select(name, type, sp_attack) %>%
  filter(type == "grass", sp_attack > 20)
head(pokemon_trimmed_grass)
## # A tibble: 6 × 3
## name type sp_attack
## <chr> <chr> <dbl>
## 1 Bulbasaur grass 65
## 2 Ivysaur grass 80
## 3 Venusaur grass 122
## 4 Oddish grass 75
## 5 Gloom grass 85
## 6 Vileplume grass 110
## mutate() is a function that makes new columns based on columns that we already have
## 1. The code below first makes a new column, attack_defense, that describes the attack/defense ratio
## 2. Then, we make another new column that is the natural log of the attack_defense column, called log_AD (R's log() uses base e by default; use log2() if you want a base-2 log instead)
## 3. Finally, it makes another column that describes each character as light or heavy, based on whether their weight_kg column is higher or lower than 40kg
## 4. We put all of these new columns, along with our previous ones, into a new table called "pokemon_descriptors"
## Note: we remove the NA values from the weight_kg column using filter(!is.na(weight_kg)). The "!" before "is.na()" is like a negative sign, meaning that we filter OUT the NA values, rather than keeping them. If we let them stay in our table, we would not get an error, but ifelse() would return NA for those pokemon, so the heavy_light column would contain NA values that carry through into later summaries.
# Drop pokemon without a recorded weight, then derive three new columns.
# Columns created inside one mutate() can be used by later arguments of the
# same call, so log_AD can refer to attack_defense.
pokemon_descriptors <- pokemon %>%
  filter(!is.na(weight_kg)) %>%
  mutate(
    attack_defense = sp_attack / sp_defense,                # attack/defense ratio
    log_AD = log(attack_defense),                           # natural log of the ratio
    heavy_light = ifelse(weight_kg > 40, "heavy", "light")  # above 40 kg counts as heavy
  )
head(pokemon_descriptors)
## # A tibble: 6 × 10
## pokedex_number name sp_attack sp_defense speed type weight_kg attack_defense
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 Bulb… 65 65 45 grass 6.9 1
## 2 2 Ivys… 80 80 60 grass 13 1
## 3 3 Venu… 122 120 80 grass 100 1.02
## 4 4 Char… 60 50 65 fire 8.5 1.2
## 5 5 Char… 80 65 80 fire 19 1.23
## 6 6 Char… 159 115 100 fire 90.5 1.38
## # … with 2 more variables: log_AD <dbl>, heavy_light <chr>
## Next, we can again take a look at some summaries
# For every type / weight-class combination: number of pokemon, mean weight,
# and mean attack/defense ratio
pokemon_descriptors %>%
  group_by(type, heavy_light) %>%
  summarise(
    n = n(),
    mean_wt = mean(weight_kg),
    mean_AD = mean(attack_defense)
  )
## # A tibble: 28 × 5
## # Groups: type [15]
## type heavy_light n mean_wt mean_AD
## <chr> <chr> <int> <dbl> <dbl>
## 1 bug heavy 2 55.5 0.705
## 2 bug light 10 16.5 0.881
## 3 dragon heavy 1 210 1
## 4 dragon light 2 9.9 1
## 5 electric heavy 3 59.7 1.37
## 6 electric light 5 15.4 1.20
## 7 fairy light 2 23.8 0.989
## 8 fighting heavy 4 75.1 0.559
## 9 fighting light 3 26.5 0.878
## 10 fire heavy 5 89 1.26
## # … with 18 more rows
## take a look at the data. how many light versus heavy bug types are there? What is the mean weight for each of these types?
# Add the natural log of the speed-to-weight ratio, skipping pokemon with no weight
pokemon2 <- pokemon %>%
  filter(!is.na(weight_kg)) %>%
  mutate(log_SW = log(speed / weight_kg))
# Keep only the fire and poison types, with just the columns of interest.
# NOTE: "fire_posion" is a misspelling of "fire_poison"; the name is left as-is
# so that any other code referring to it keeps working.
fire_posion <- pokemon2 %>%
  select(name, type, log_SW) %>%
  filter(type %in% c("fire", "poison"))
fire_posion
## # A tibble: 22 × 3
## name type log_SW
## <chr> <chr> <dbl>
## 1 Charmander fire 2.03
## 2 Charmeleon fire 1.44
## 3 Charizard fire 0.0998
## 4 Ekans poison 2.08
## 5 Arbok poison 0.208
## 6 Nidoran♀ poison 1.77
## 7 Nidorina poison 1.03
## 8 Nidoqueen poison 0.236
## 9 Nidoran♂ poison 1.71
## 10 Nidorino poison 1.20
## # … with 12 more rows
# Reshape to long form: the four measurement columns are stacked into a
# "variable" column (which measurement) and a "value" column (its number)
pokemon_long <- pokemon %>%
  pivot_longer(
    cols = c(sp_attack, sp_defense, speed, weight_kg),
    names_to = "variable"
  )
head(pokemon_long)
## # A tibble: 6 × 5
## pokedex_number name type variable value
## <dbl> <chr> <chr> <chr> <dbl>
## 1 1 Bulbasaur grass sp_attack 65
## 2 1 Bulbasaur grass sp_defense 65
## 3 1 Bulbasaur grass speed 45
## 4 1 Bulbasaur grass weight_kg 6.9
## 5 2 Ivysaur grass sp_attack 80
## 6 2 Ivysaur grass sp_defense 80
## See how all of the variables are now contained in a single column?
## We can also reverse this to go back to the shorter version if we want to.
## This pretty much produces the same thing as our original "pokemon" table, but it's useful if for some reason we didn't have access to it.
## this (re)creates a shorter version of the data: each distinct entry in
## "variable" becomes its own column, filled from "value". The arguments can be
## adjusted to only show specific parts of the long form data.
pokemon_wider <- pivot_wider(
  pokemon_long,
  names_from = variable,
  values_from = value
)
head(pokemon_wider)
## # A tibble: 6 × 7
## pokedex_number name type sp_attack sp_defense speed weight_kg
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 Bulbasaur grass 65 65 45 6.9
## 2 2 Ivysaur grass 80 80 60 13
## 3 3 Venusaur grass 122 120 80 100
## 4 4 Charmander fire 60 50 65 8.5
## 5 5 Charmeleon fire 80 65 80 19
## 6 6 Charizard fire 159 115 100 90.5
# Pull just the speed measurements out of the long table, drop the columns we
# no longer need, and give the value column a meaningful name
pokemon_speed <- pokemon_long %>%
  filter(variable == "speed") %>%
  select(-variable, -pokedex_number) %>%
  rename(speed = value)
pokemon_speed
## # A tibble: 151 × 3
## name type speed
## <chr> <chr> <dbl>
## 1 Bulbasaur grass 45
## 2 Ivysaur grass 60
## 3 Venusaur grass 80
## 4 Charmander fire 65
## 5 Charmeleon fire 80
## 6 Charizard fire 100
## 7 Squirtle water 43
## 8 Wartortle water 58
## 9 Blastoise water 78
## 10 Caterpie bug 45
## # … with 141 more rows
#——————————————————————————————————————-#
## ggplot is a library in R that allows for simple plotting
## 1. First, we initialize the ggplot object using "ggplot()"
## 2. Second, we use the aesthetics mapping, aes(), to tell ggplot that we want "speed" to be on the x-axis and "sp_attack" to be on the y-axis
## 3. Finally, we use "+" to add to our ggplot object a "geom_point()". This is the most important part, as it actually adds the scatter plot to our figure. There are many different types of "geoms" that we can use, such as geom_line(), geom_bar(), etc.
# Basic scatter plot: speed on the x-axis, sp_attack on the y-axis
ggplot(pokemon, aes(x = speed, y = sp_attack)) +
  geom_point(alpha = 0.7, color = "royalblue")
## We can change the properties of each point so that different pokemon types are represented by different colours
ggplot(pokemon, aes(x = speed, y = sp_attack, color = type)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## We can also make it so that the size of each pokemon's point corresponds to their weight!!
ggplot(pokemon, aes(x = speed, y = sp_attack, color = type, size = weight_kg)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
##HF: We can also make the figure an object in the environment (here called pokemon_scatter_1) and then use ggplotly (a package) to create an interactive plotly widget to explore the data
# Store the plot as an object instead of drawing it straight away.
# The extra "description" aesthetic maps each pokemon's name
# (presumably so it shows up in the interactive tooltip - confirm in the widget).
pokemon_scatter_1 <- ggplot(
  pokemon,
  aes(x = speed, y = sp_attack, color = type, size = weight_kg, description = name)
) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
# Turn the stored ggplot object into an interactive plotly widget
ggplotly(pokemon_scatter_1)
# Weight against special defense, coloured by type and sized by speed
ggplot(pokemon, aes(x = weight_kg, y = sp_defense, color = type, size = speed)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## and here it is as an interactive ggplotly widget
pokemon_scatter_2 <- ggplot(
  pokemon,
  aes(x = weight_kg, y = sp_defense, color = type, size = speed, description = name)
) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
ggplotly(pokemon_scatter_2)
##WANT TO KNOW MORE ABOUT BOX PLOTS: look here https://waterdata.usgs.gov/blog/boxplots/ #and here: #https://ggplot2.tidyverse.org/reference/geom_boxplot.html
## pheatmap uses a matrix of values, so we need to turn our data table into a "matrix" using the as.matrix() function
# Pick the three trait columns and convert them to a numeric matrix
pokemon_mat1 <- as.matrix(select(pokemon, sp_attack, sp_defense, speed))
## then, we use this matrix to make a heatmap (rows and columns are left unclustered)
pheatmap(
  pokemon_mat1,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## NOTE: you may need to use the Zoom button in the plot tab to see the whole heatmap
## The scale was very wide so should be converted to a relative intensity (out of 1).
## This code will do that, and make a new matrix, then make a heatmap.
# Express each trait relative to its maximum in the dataset (so values are out of 1).
# Only the rel_* columns are kept, so no column pre-selection is needed.
pokemon_mat2 <- pokemon %>%
  mutate(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  select(rel_attack, rel_defense, rel_speed) %>%
  as.matrix()
pheatmap(
  pokemon_mat2,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## That matrix didn't tell us a huge amount about the data though. Let's incorporate some of the characters and their types.
## To make it simple, let's just get the grass and fire type
# Keep only the grass and fire types so the annotated heatmap stays readable
pokemon_fire_grass <- pokemon %>% filter(type %in% c("grass", "fire"))
# Build the relative-intensity matrix from the grass/fire subset (not the full
# table), scaling each trait by its maximum within that subset
pokemon_mat3 <- pokemon_fire_grass %>%
  mutate(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  select(rel_attack, rel_defense, rel_speed) %>%
  as.matrix()
rownames(pokemon_mat3) <- pokemon_fire_grass$name ## put the names as matrix rownames
## The annotations need to be a data frame whose rownames match the matrix
## rownames, so that each heatmap row is linked to its pokemon type.
## It is built directly with data.frame() because setting rownames on a tibble
## is deprecated.
ann <- data.frame(
  type = pokemon_fire_grass$type,
  row.names = pokemon_fire_grass$name
)
## this is an unclustered heatmap
pheatmap(
  pokemon_mat3,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 6,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 7,
  annotation_row = ann
)
## Optionally, we can cluster the rows. This is very helpful to see patterns in the data. By default pheatmap uses hierarchical clustering.
## this is a clustered heatmap
pheatmap(
  pokemon_mat3,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 6,
  cluster_cols = FALSE,
  cluster_rows = TRUE,
  legend = TRUE,
  fontsize = 7,
  annotation_row = ann
)
## take a look. do they cluster by type?
##For this type of data, the long form is the most appropriate
# Distribution of speed for each pokemon type, as one box per type
pokemon_long %>%
  filter(variable == "speed") %>%
  ggplot(aes(x = type, y = value)) +
  geom_boxplot()
## We could also compare traits to one another across a couple different types
# One box per trait within each of three types; the four traits are coloured
# with the four-colour viridis palette
pokemon_long %>%
  filter(type %in% c("grass", "water", "fire")) %>%
  ggplot(aes(x = type, y = value, fill = variable)) +
  geom_boxplot() +
  scale_fill_manual(values = felix_4cols)
## (the scale_fill_manual() line in the plot above is what adds the nicer colors)
## We could also just compare a few characters (barplot is fine since there is only one value per group)
# Side-by-side bars of every trait for three chosen characters.
# geom_col() draws bars whose heights are the values in the data.
pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = variable)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = felix_4cols)
## Since these values represent a range of different measurements, it's not appropriate to plot them on the same axis. For that we would want to use facet
# Same three characters, but each trait gets its own panel with its own
# y-axis scale; bars are coloured by pokemon type
pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = type)) +
  geom_col(position = "dodge") +
  facet_wrap(~variable, scales = "free_y") +
  scale_fill_manual(values = felix_4cols)
# Box plots of every trait for every type, one panel per trait.
# The x-axis labels are drawn in white and the x tick marks given zero length,
# leaving the fill legend to identify each type.
pokemon_long %>%
  select(-name) %>%
  ggplot(aes(x = type, y = value, fill = type)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free_y") +
  theme(
    axis.text.x = element_text(colour = "white"),
    axis.ticks.length.x = unit(0, "cm")
  )
##That’s it!! I hope you had fun. This should prepare you for doing part C. You’ll have 2 weeks to do that.