library(rmarkdown)
knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning=FALSE,collapse = TRUE)
library(reshape2)
library(ggplot2)
library(dplyr)
library(plotly)
library(viridis)
library(data.table)
library(pheatmap)
library(tidyverse)
library(ggthemes)
library(clipr)
library(tidyr)
## colour palettes for the plots: 15 viridis colours, plus two smaller picks taken from them
mycolors <- viridis(n = 15)
felix_cols <- mycolors[c(5, 2)]
felix_4cols <- mycolors[c(15, 10, 8, 2)]
## simple two-colour palettes
plain_cols1 <- c("blue", "gray")
plain_cols2 <- c("red", "gray")
## interpolated gradients: 21 steps through the three given colours, and 10 steps from white to blue
pats_cols <- colorRampPalette(colors = c("#FDE725FF", "white", "#440154FF"))(21)
leos_cols <- colorRampPalette(colors = c("white", "blue"))(10)
## quick check that code runs: join two words with a space and show the result
x <- paste("Hello", "World", sep = " ")
x
## [1] "Hello World"
## load the pokemon dataset from a csv file
pokemon <- read_csv("pokemon_gen_1.csv")
## click on the pokemon object in the environment pane if you like, and see what it looks like
## run the head() function on the dataset to preview its first rows
head(pokemon)
## # A tibble: 6 x 7
## pokedex_number name sp_attack sp_defense speed type weight_kg
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
## 1 1 Bulbasaur 65 65 45 grass 6.9
## 2 2 Ivysaur 80 80 60 grass 13
## 3 3 Venusaur 122 120 80 grass 100
## 4 4 Charmander 60 50 65 fire 8.5
## 5 5 Charmeleon 80 65 80 fire 19
## 6 6 Charizard 159 115 100 fire 90.5
## use the summary() function to determine some characteristics of each column in the dataset
pokemon %>% summary()
## pokedex_number name sp_attack sp_defense
## Min. : 1.0 Length:151 Min. : 10.0 Min. : 20.00
## 1st Qu.: 38.5 Class :character 1st Qu.: 45.0 1st Qu.: 49.00
## Median : 76.0 Mode :character Median : 65.0 Median : 65.00
## Mean : 76.0 Mean : 69.4 Mean : 67.74
## 3rd Qu.:113.5 3rd Qu.: 90.0 3rd Qu.: 85.00
## Max. :151.0 Max. :194.0 Max. :130.00
##
## speed type weight_kg
## Min. : 15.00 Length:151 Min. : 0.10
## 1st Qu.: 45.00 Class :character 1st Qu.: 9.50
## Median : 70.00 Mode :character Median : 30.00
## Mean : 70.15 Mean : 45.83
## 3rd Qu.: 90.00 3rd Qu.: 59.00
## Max. :150.00 Max. :460.00
## NA's :18
## what is the name of the pokemon character at the top of the list? What type is it?
## how many characters are in the dataset? what is their mean speed?
## group_by() groups the pokemon by type; summarise() then creates a new variable
## "number_of_characters" that uses n() to count the characters of each type in the dataset
summarise(group_by(pokemon, type), number_of_characters = n())
## # A tibble: 15 x 2
## type number_of_characters
## * <chr> <int>
## 1 bug 12
## 2 dragon 3
## 3 electric 9
## 4 fairy 2
## 5 fighting 7
## 6 fire 12
## 7 ghost 3
## 8 grass 12
## 9 ground 8
## 10 ice 2
## 11 normal 22
## 12 poison 14
## 13 psychic 8
## 14 rock 9
## 15 water 28
## take a look at the data. Which type has the most pokemon in the dataset?
## mean sp_attack for each pokemon type (the summary column is not given a name here)
summarise(group_by(pokemon, type), mean(sp_attack))
## # A tibble: 15 x 2
## type `mean(sp_attack)`
## * <chr> <dbl>
## 1 bug 45.8
## 2 dragon 73.3
## 3 electric 91.7
## 4 fairy 77.5
## 5 fighting 45
## 6 fire 88.8
## 7 ghost 128.
## 8 grass 89.8
## 9 ground 35.6
## 10 ice 105
## 11 normal 55.6
## 12 poison 57.1
## 13 psychic 114.
## 14 rock 61.7
## 15 water 68.0
## select() takes only certain columns from the dataset and filter() takes only certain rows.
## Here, we select the name, type, and sp_attack columns, then keep the grass-type rows
## with sp_attack above 20. The result goes into a new table (pokemon_trimmed_grass).
pokemon_trimmed_grass <- pokemon %>%
  select(name, type, sp_attack) %>%
  filter(type == "grass", sp_attack > 20)
head(pokemon_trimmed_grass)
## # A tibble: 6 x 3
## name type sp_attack
## <chr> <chr> <dbl>
## 1 Bulbasaur grass 65
## 2 Ivysaur grass 80
## 3 Venusaur grass 122
## 4 Oddish grass 75
## 5 Gloom grass 85
## 6 Vileplume grass 110
## mutate() is a function that makes new columns. The code below adds three columns:
## attack_defense (the sp_attack/sp_defense ratio), log_AD (the natural log of that ratio,
## as computed by log()), and heavy_light (labels each character as heavy or light).
## This all gets stored in a new table called pokemon_descriptors.
pokemon_descriptors <- pokemon %>%
  filter(!is.na(weight_kg)) %>% ## the NAs for weight are removed first
  mutate(
    attack_defense = sp_attack / sp_defense, ## the sp_attack/sp_defense ratio
    log_AD = log(attack_defense), ## log of the column created on the line above
    heavy_light = ifelse(weight_kg > 40, "heavy", "light") ## heavy if the character weighs more than 40 kg
  )
head(pokemon_descriptors)
## # A tibble: 6 x 10
## pokedex_number name sp_attack sp_defense speed type weight_kg attack_defense
## <dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 Bulb… 65 65 45 grass 6.9 1
## 2 2 Ivys… 80 80 60 grass 13 1
## 3 3 Venu… 122 120 80 grass 100 1.02
## 4 4 Char… 60 50 65 fire 8.5 1.2
## 5 5 Char… 80 65 80 fire 19 1.23
## 6 6 Char… 159 115 100 fire 90.5 1.38
## # … with 2 more variables: log_AD <dbl>, heavy_light <chr>
## Next, take a look at some summaries: the count, mean weight and mean
## attack/defense ratio for every combination of type and weight class
summarise(
  group_by(pokemon_descriptors, type, heavy_light),
  n = n(),
  mean_wt = mean(weight_kg),
  mean_AD = mean(attack_defense)
)
## # A tibble: 28 x 5
## # Groups: type [15]
## type heavy_light n mean_wt mean_AD
## <chr> <chr> <int> <dbl> <dbl>
## 1 bug heavy 2 55.5 0.705
## 2 bug light 10 16.5 0.881
## 3 dragon heavy 1 210 1
## 4 dragon light 2 9.9 1
## 5 electric heavy 3 59.7 1.37
## 6 electric light 5 15.4 1.20
## 7 fairy light 2 23.8 0.989
## 8 fighting heavy 4 75.1 0.559
## 9 fighting light 3 26.5 0.878
## 10 fire heavy 5 89 1.26
## # … with 18 more rows
## take a look at the data. how many light versus heavy bug types are there? What is the mean weight for each of these types?
## drop characters with no recorded weight, then add log_SW, the log of the speed/weight ratio
pokemon2 <- pokemon %>%
  filter(!is.na(weight_kg)) %>%
  mutate(log_SW = log(speed / weight_kg))
## keep the name, type and log_SW columns for the fire and poison types only
fire_posion <- pokemon2 %>%
  select(name, type, log_SW) %>%
  filter(type %in% c("fire", "poison"))
## Create a longer version of the dataset: one row per character and measurement, with the
## measurement name in "variable" and its number in "value". This may seem strange, but it fits
## better into how R treats variables and values for plotting and other tasks.
## melt() is called as reshape2::melt() on purpose: data.table is attached after reshape2 and
## masks melt(), and data.table's version is meant for data.tables (given a tibble it only
## redirects to reshape2 through a deprecated fallback). id.vars is spelled out so the
## identifier columns are stated rather than guessed.
## You can also use pivot_longer() for this, but more arguments need to be specified for that function.
pokemon_long <- pokemon %>%
  select(-pokedex_number) %>% ## remove the pokedex number since it is not really a useful value
  reshape2::melt(id.vars = c("name", "type")) ## every other column is stacked into variable/value pairs
head(pokemon_long)
## name type variable value
## 1 Bulbasaur grass sp_attack 65
## 2 Ivysaur grass sp_attack 80
## 3 Venusaur grass sp_attack 122
## 4 Charmander fire sp_attack 60
## 5 Charmeleon fire sp_attack 80
## 6 Charizard fire sp_attack 159
## see the difference in how the data are arranged? (run head(pokemon) to compare)
## Now, we can go back to the short version.
## this (re)creates a shorter version of the data: each entry of "variable" becomes its own
## column again, filled from "value". The call can be adjusted to only show specific parts
## of the long form data.
pokemon_wider <- pivot_wider(pokemon_long, names_from = variable, values_from = value)
head(pokemon_wider)
## # A tibble: 6 x 6
## name type sp_attack sp_defense speed weight_kg
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Bulbasaur grass 65 65 45 6.9
## 2 Ivysaur grass 80 80 60 13
## 3 Venusaur grass 122 120 80 100
## 4 Charmander fire 60 50 65 8.5
## 5 Charmeleon fire 80 65 80 19
## 6 Charizard fire 159 115 100 90.5
## pull only the speed measurements out of the long table and call the value column "speed"
pokemon_speed <- pokemon_long %>%
  filter(variable == "speed") %>%
  rename(speed = value) %>%
  select(-variable)
head(pokemon_speed)
## name type speed
## 1 Bulbasaur grass 45
## 2 Ivysaur grass 60
## 3 Venusaur grass 80
## 4 Charmander fire 65
## 5 Charmeleon fire 80
## 6 Charizard fire 100
## scatterplot of speed vs sp_attack; alpha adjusts the transparency of the points
scatter_wt <- ggplot(pokemon, aes(x = speed, y = sp_attack)) +
  geom_point(color = "royalblue", alpha = 0.7)
scatter_wt
## same plot, but the pokemon type is passed onto the color part of the aesthetics (aes)
## and the colors are taken from mycolors
scatter_wt2 <- ggplot(pokemon, aes(x = speed, y = sp_attack, color = type)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
scatter_wt2
## additionally passes the weight onto the size part of the aesthetics (aes);
## the name is attached under "description" so it travels with each point
scatter_wt3 <- ggplot(
  pokemon,
  aes(x = speed, y = sp_attack, color = type, size = weight_kg, description = name)
) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
scatter_wt3
## creates an interactive plotly widget to explore the data (ggplotly() comes from the plotly package)
ggplotly(scatter_wt3)
## scatter_wt3 was already built earlier in the script with exactly the same specification,
## so the identical re-assignment that used to sit here has been dropped.
## A variation on that plot: weight on the x axis, sp_defense on the y axis,
## color by type and point size by speed; the name is again attached under "description"
scatter_spd <- pokemon %>%
  ggplot(aes(x = weight_kg, y = sp_defense, color = type, size = speed, description = name)) +
  geom_point(alpha = 0.7) +
  scale_color_manual(values = mycolors)
## interactive version of the plot
ggplotly(scatter_spd)
## pheatmap uses a matrix of values, not a data table, so we need to take the columns we want
## and turn them into a matrix
pokemon_mat1 <- as.matrix(select(pokemon, sp_attack, sp_defense, speed))
## then, we use this matrix to make a heatmap (no clustering of rows or columns)
pheatmap(
  pokemon_mat1,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## NOTE: you may need to use the Zoom button in the plot tab to see the whole heatmap
## The scale was very wide, so the values should be converted to a relative intensity (out of 1).
## This is particularly important for large gene expression or proteomics datasets.
## The code below divides each column by its own maximum, makes a new matrix, then makes a heatmap.
pokemon_mat2 <- pokemon %>%
  transmute(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  as.matrix()
pheatmap(
  pokemon_mat2,
  color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10,
  cellheight = 4,
  cluster_cols = FALSE,
  cluster_rows = FALSE,
  legend = TRUE,
  fontsize = 6
)
## That matrix didn't tell us a huge amount about the data though. Let's incorporate some of the characters and their types.
## To make it simple, let's just get the grass and fire type
pokemon_fire_grass <- pokemon %>% filter(type == "grass" | type == "fire")
## Build the matrix from the filtered table (not from the full pokemon table) so that only the
## grass and fire characters end up in the heatmap. Each column is divided by its maximum
## within this subset, so the values are relative intensities out of 1.
pokemon_mat3 <- pokemon_fire_grass %>%
  select(sp_attack, sp_defense, speed) %>%
  mutate(
    rel_attack = sp_attack / max(sp_attack),
    rel_defense = sp_defense / max(sp_defense),
    rel_speed = speed / max(speed)
  ) %>%
  select(rel_attack, rel_defense, rel_speed) %>%
  as.matrix()
rownames(pokemon_mat3) <- pokemon_fire_grass$name ## put the names as matrix rownames
## The annotations need to be a plain data frame whose row names match the matrix row names.
## It is created directly as a data.frame because setting row names on a tibble is deprecated.
ann <- data.frame(
  type = pokemon_fire_grass$type,
  row.names = pokemon_fire_grass$name,
  stringsAsFactors = FALSE
)
## this is an unclustered heatmap
pheatmap(pokemon_mat3, color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10, cellheight = 6, cluster_cols = FALSE, cluster_rows = FALSE,
  legend = TRUE, fontsize = 7, annotation_row = ann)
## Optionally, we can cluster the rows. This is very helpful to see patterns in the data. By default pheatmap uses hierarchical clustering.
## this is a clustered heatmap
pheatmap(pokemon_mat3, color = viridis(8, option = "magma", begin = 1, end = 0),
  cellwidth = 10, cellheight = 6, cluster_cols = FALSE, cluster_rows = TRUE,
  legend = TRUE, fontsize = 7, annotation_row = ann)
## take a look. do they cluster by type?
## For this type of data, the long form is the most appropriate.
## Boxplot of the speed values for each pokemon type.
speed_rows <- filter(pokemon_long, variable == "speed")
speed_compare <- ggplot(speed_rows, aes(x = type, y = value)) +
  geom_boxplot()
speed_compare
## We could also compare traits to one another across a couple of different types,
## and add some nicer colors (felix_4cols) while we are at it
grass_fire_water_compare <- pokemon_long %>%
  filter(type %in% c("grass", "water", "fire")) %>%
  ggplot(aes(x = type, y = value, fill = variable)) +
  geom_boxplot() +
  scale_fill_manual(values = felix_4cols)
grass_fire_water_compare
## We could also just compare a few characters (a barplot is fine since there is only one value per group)
select_character_compare <- pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = variable)) +
  geom_col(position = "dodge") + ## bars drawn at the values themselves, side by side
  scale_fill_manual(values = felix_4cols)
select_character_compare
## Since these values represent a range of different measurements, it's not appropriate to plot
## them on the same axis. For that we would want to use facets: one panel per variable,
## each with its own y scale
select_character_grid <- pokemon_long %>%
  filter(name %in% c("Squirtle", "Cubone", "Mew")) %>%
  ggplot(aes(x = name, y = value, fill = type)) +
  geom_col(position = "dodge") +
  facet_wrap(~variable, scales = "free_y") +
  scale_fill_manual(values = felix_4cols)
select_character_grid
## boxplots of every measurement by type, one panel per variable with its own y scale;
## the x axis labels are drawn in white with zero-length ticks, since the fill already shows the type
type_compare <- pokemon_long %>%
  select(-name) %>%
  ggplot(aes(x = type, y = value, fill = type)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free_y") +
  theme(
    axis.text.x = element_text(colour = 'white'),
    axis.ticks.length.x = unit(0, "cm")
  )
type_compare
## the same bar chart restyled with different themes (theme_bw is from ggplot2, the rest from ggthemes)
black_white <- select_character_compare +
  theme_bw()
black_white
clean <- select_character_compare +
  theme_clean()
clean
few <- select_character_compare +
  theme_few()
few
wallstreet <- select_character_compare +
  theme_wsj()
wallstreet
economist <- select_character_compare +
  theme_economist()
economist
## themes can be adjusted further: larger text overall, with black size-14 axis text
bw_adjust <- black_white +
  theme(
    axis.text = element_text(colour = "black", size = 14),
    text = element_text(size = 14)
  )
bw_adjust
##That’s it!! I hope you had fun. This should prepare you for doing part C. You’ll have 2 weeks to do that.