Tutorial followed and modified from: https://brunomioto.com/posts/intro_ggplot2/en/#the-data
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggpath)
## Warning: package 'ggpath' was built under R version 4.1.3
library(magick)
## Warning: package 'magick' was built under R version 4.1.3
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(pokemon)
## Warning: package 'pokemon' was built under R version 4.1.3
# Load the dataset shipped with the pokemon package into the environment.
# Both `=` and `<-` assign at top level; `<-` is the conventional R operator.
pokemon.df <- pokemon::pokemon
# Base R scatter plot: weight on the x-axis, height on the y-axis.
plot(x = pokemon.df$weight, y = pokemon.df$height)
Label the axes.
# Same scatter plot, now with unit labels on both axes and a title.
plot(
  x = pokemon.df[["weight"]],
  y = pokemon.df[["height"]],
  main = "pokemon heights and weights",
  xlab = "kg",
  ylab = "m"
)
# Open the documentation for plot(); help(plot) is an alternative.
?plot()
## starting httpd help server ... done
Look at the data.
# Compact overview of the data: check each column's type and look for
# missing values.
dplyr::glimpse(pokemon.df)
## Rows: 949
## Columns: 22
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ pokemon <chr> "bulbasaur", "ivysaur", "venusaur", "charmander", "cha~
## $ species_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ height <dbl> 0.7, 1.0, 2.0, 0.6, 1.1, 1.7, 0.5, 1.0, 1.6, 0.3, 0.7,~
## $ weight <dbl> 6.9, 13.0, 100.0, 8.5, 19.0, 90.5, 9.0, 22.5, 85.5, 2.~
## $ base_experience <dbl> 64, 142, 236, 62, 142, 240, 63, 142, 239, 39, 72, 178,~
## $ type_1 <chr> "grass", "grass", "grass", "fire", "fire", "fire", "wa~
## $ type_2 <chr> "poison", "poison", "poison", NA, NA, "flying", NA, NA~
## $ hp <dbl> 45, 60, 80, 39, 58, 78, 44, 59, 79, 45, 50, 60, 40, 45~
## $ attack <dbl> 49, 62, 82, 52, 64, 84, 48, 63, 83, 30, 20, 45, 35, 25~
## $ defense <dbl> 49, 63, 83, 43, 58, 78, 65, 80, 100, 35, 55, 50, 30, 5~
## $ special_attack <dbl> 65, 80, 100, 60, 80, 109, 50, 65, 85, 20, 25, 90, 20, ~
## $ special_defense <dbl> 65, 80, 100, 50, 65, 85, 64, 80, 105, 20, 25, 80, 20, ~
## $ speed <dbl> 45, 60, 80, 65, 80, 100, 43, 58, 78, 45, 30, 70, 50, 3~
## $ color_1 <chr> "#78C850", "#78C850", "#78C850", "#F08030", "#F08030",~
## $ color_2 <chr> "#A040A0", "#A040A0", "#A040A0", NA, NA, "#A890F0", NA~
## $ color_f <chr> "#81A763", "#81A763", "#81A763", NA, NA, "#DE835E", NA~
## $ egg_group_1 <chr> "monster", "monster", "monster", "monster", "monster",~
## $ egg_group_2 <chr> "plant", "plant", "plant", "dragon", "dragon", "dragon~
## $ url_icon <chr> "//archives.bulbagarden.net/media/upload/7/7b/001MS6.p~
## $ generation_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ url_image <chr> "https://raw.githubusercontent.com/HybridShivam/Pokemo~
# First rows of the data (6 rows, the default for head()).
head(x = pokemon.df, n = 6)
## # A tibble: 6 x 22
## id pokemon species_id height weight base_experience type_1 type_2 hp
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 bulbasaur 1 0.7 6.9 64 grass poison 45
## 2 2 ivysaur 2 1 13 142 grass poison 60
## 3 3 venusaur 3 2 100 236 grass poison 80
## 4 4 charmander 4 0.6 8.5 62 fire <NA> 39
## 5 5 charmeleon 5 1.1 19 142 fire <NA> 58
## 6 6 charizard 6 1.7 90.5 240 fire flying 78
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## # special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## # color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## # generation_id <dbl>, url_image <chr>
#tail of the data (last rows)
tail(pokemon.df) #the last id is 10,147, but ids are not consecutive: the dataset has only 949 rows
## # A tibble: 6 x 22
## id pokemon species_id height weight base_experience type_1 type_2 hp
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 10142 minior-vio~ 774 0.3 0.3 175 rock flying 60
## 2 10143 mimikyu-bu~ 778 0.2 0.7 167 ghost fairy 55
## 3 10144 mimikyu-to~ 778 0.4 2.8 167 ghost fairy 55
## 4 10145 mimikyu-to~ 778 0.4 2.8 167 ghost fairy 55
## 5 10146 kommo-o-to~ 784 2.4 208. 270 dragon fight~ 75
## 6 10147 magearna-o~ 801 1 80.5 120 steel fairy 80
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## # special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## # color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## # generation_id <dbl>, url_image <chr>
Is there a relationship between height and weight?
# Open the documentation for the correlation function.
?cor()
# Pearson correlation coefficient between weight and height.
cor(
  x = pokemon.df$weight,
  y = pokemon.df$height,
  method = "pearson"
)
## [1] 0.6561909
# Redraw the height-vs-weight scatter plot and overlay a linear regression line.
# Weight is passed as x and height as y, so the x-axis is labelled in kg and
# the y-axis in m (the labels were previously swapped).
plot(pokemon.df$weight, pokemon.df$height,
     xlab = "weight (kg)", ylab = "height (m)",
     main = "pokemon heights and weights")
# lm(height ~ weight) models height (y) as a function of weight (x), which
# matches the axes of the plot above; abline() draws the fitted line in red.
abline(lm(height ~ weight, data = pokemon.df),
       col = "red")
Is there a relationship between weight and speed?
# Correlation between weight and speed.
# A coefficient close to 0 means there is no linear correlation.
cor(x = pokemon.df$weight, y = pokemon.df$speed)
## [1] 0.03333363
# Assumption: pokemon speeds are treated as metres per second for the label.
plot(
  pokemon.df[["weight"]],
  pokemon.df[["speed"]],
  main = "relationship between pokemon speed and weight",
  xlab = "weight (kg)",
  ylab = "speed (m/s)"
)
# Overlay the fitted linear model of speed as a function of weight.
abline(
  reg = lm(formula = speed ~ weight, data = pokemon.df),
  col = "blue"
)
# ggplot2 belongs to the tidyverse packages and offers a different syntax
# for the same kind of plots. Pieces are chained with `+` ("add this").

# 1 graphics layer: an empty plot.
ggplot()

# 2 graphics layers: the data and the x/y aesthetic mappings.
ggplot(pokemon.df, aes(weight, height))

# 3 graphics layers: data, mappings, and a point geometry.
ggplot(pokemon.df, aes(weight, height)) +
  geom_point()
Now let’s pimp up the ggplot by adding color and shapes.
# Dress up the scatter plot: point colour follows the type_1 categories.
ggplot(pokemon.df, aes(weight, height)) +
  geom_point(aes(colour = type_1), shape = 25) +
  geom_smooth() + # smooth regression line fitted with the default method
  labs(title = "heights and weights of pokemon according to type_1") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
To summarize: there seems to be a maximum for height as weight increases. Most of the pokemon are clustered between heights of 0 to 5 m and weights of 0 to 500 kg.
We can calculate the range, average, and median for the heights and weights
# Descriptive statistics for height and weight
range(pokemon.df[["height"]]) # smallest and largest height
## [1] 0.1 14.5
mean(pokemon.df[["height"]]) # mean height
## [1] 1.22824
median(pokemon.df[["height"]]) # median height
## [1] 1
range(pokemon.df[["weight"]]) # smallest and largest weight
## [1] 0.1 999.9
mean(pokemon.df[["weight"]]) # mean weight
## [1] 66.21317
median(pokemon.df[["weight"]]) # median weight
## [1] 28.8
# Box plots of height grouped by each pokemon's FIRST type (type_1),
# with one outline colour per type.
ggplot(pokemon.df, aes(type_1, height)) +
  geom_boxplot(aes(colour = type_1)) +
  labs(title = "height distributions based on pokemon's first type") +
  # rotate the x-axis tick labels by 50 degrees
  theme(axis.text.x = element_text(vjust = 0.5, angle = 50))
# Box plots of weight grouped by each pokemon's FIRST type (type_1),
# with one outline colour per type.
ggplot(pokemon.df, aes(type_1, weight)) +
  geom_boxplot(aes(colour = type_1)) +
  labs(title = "weight distributions based on pokemon's first type") +
  # rotate the x-axis tick labels by 50 degrees
  theme(axis.text.x = element_text(vjust = 0.5, angle = 50))
# Box plots of HP grouped by each pokemon's FIRST type (type_1),
# with one outline colour per type.
ggplot(pokemon.df, aes(type_1, hp)) +
  geom_boxplot(aes(colour = type_1)) +
  labs(title = "health point distributions based on pokemon's first type") +
  # rotate the x-axis tick labels by 50 degrees
  theme(axis.text.x = element_text(vjust = 0.5, angle = 50))
# Box plots of speed grouped by each pokemon's FIRST type (type_1),
# with one outline colour per type.
ggplot(pokemon.df, aes(type_1, speed)) +
  geom_boxplot(aes(colour = type_1)) +
  labs(title = "speed distributions based on pokemon's first type") +
  # rotate the x-axis tick labels by 50 degrees
  theme(axis.text.x = element_text(vjust = 0.5, angle = 50))
# Column plot of defense: pokemon names on the x-axis, defense score as the
# column height, for every pokemon in the dataset.
ggplot(pokemon.df, aes(pokemon, defense)) +
  labs(title = "defense score for all pokemon") +
  geom_col()
What is the difference between geom_col and geom_bar? Use ?geom_col and ?geom_bar.
THE END