Tutorial followed and modified from: https://brunomioto.com/posts/intro_ggplot2/en/#the-data
Check out my rpubs site for other scripts: https://rpubs.com/deyvis305/
Main Packages required for this tutorial
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ggpath)
## Warning: package 'ggpath' was built under R version 4.1.3
library(magick)
## Warning: package 'magick' was built under R version 4.1.3
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(pokemon)
## Warning: package 'pokemon' was built under R version 4.1.3
Pokemon data from the “pokemon” package
# Load the dataset shipped with the "pokemon" package into the environment.
pokemon.df <- pokemon::pokemon # '<-' and '=' are both assignment operators
Base R plot of pokemon weight and height
# Base R scatter plot: weight on the x axis, height on the y axis.
plot(x = pokemon.df$weight, y = pokemon.df$height)

Label the axes.
# Same scatter plot, now with axis labels (units) and a main title.
plot(
  x = pokemon.df$weight,
  y = pokemon.df$height,
  main = "pokemon heights and weights",
  xlab = "kg",
  ylab = "m"
)

?plot() # opens the documentation for plot(); help(plot) is an alternative
## starting httpd help server ... done
Look at the data.
# Compact overview of the data: one line per column with its type and first values.
dplyr::glimpse(pokemon.df) # note each column's type and any missing values (NA)
## Rows: 949
## Columns: 22
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ pokemon <chr> "bulbasaur", "ivysaur", "venusaur", "charmander", "cha~
## $ species_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ height <dbl> 0.7, 1.0, 2.0, 0.6, 1.1, 1.7, 0.5, 1.0, 1.6, 0.3, 0.7,~
## $ weight <dbl> 6.9, 13.0, 100.0, 8.5, 19.0, 90.5, 9.0, 22.5, 85.5, 2.~
## $ base_experience <dbl> 64, 142, 236, 62, 142, 240, 63, 142, 239, 39, 72, 178,~
## $ type_1 <chr> "grass", "grass", "grass", "fire", "fire", "fire", "wa~
## $ type_2 <chr> "poison", "poison", "poison", NA, NA, "flying", NA, NA~
## $ hp <dbl> 45, 60, 80, 39, 58, 78, 44, 59, 79, 45, 50, 60, 40, 45~
## $ attack <dbl> 49, 62, 82, 52, 64, 84, 48, 63, 83, 30, 20, 45, 35, 25~
## $ defense <dbl> 49, 63, 83, 43, 58, 78, 65, 80, 100, 35, 55, 50, 30, 5~
## $ special_attack <dbl> 65, 80, 100, 60, 80, 109, 50, 65, 85, 20, 25, 90, 20, ~
## $ special_defense <dbl> 65, 80, 100, 50, 65, 85, 64, 80, 105, 20, 25, 80, 20, ~
## $ speed <dbl> 45, 60, 80, 65, 80, 100, 43, 58, 78, 45, 30, 70, 50, 3~
## $ color_1 <chr> "#78C850", "#78C850", "#78C850", "#F08030", "#F08030",~
## $ color_2 <chr> "#A040A0", "#A040A0", "#A040A0", NA, NA, "#A890F0", NA~
## $ color_f <chr> "#81A763", "#81A763", "#81A763", NA, NA, "#DE835E", NA~
## $ egg_group_1 <chr> "monster", "monster", "monster", "monster", "monster",~
## $ egg_group_2 <chr> "plant", "plant", "plant", "dragon", "dragon", "dragon~
## $ url_icon <chr> "//archives.bulbagarden.net/media/upload/7/7b/001MS6.p~
## $ generation_id <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ url_image <chr> "https://raw.githubusercontent.com/HybridShivam/Pokemo~
# First six rows of the data.
head(pokemon.df, n = 6)
## # A tibble: 6 x 22
## id pokemon species_id height weight base_experience type_1 type_2 hp
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 1 bulbasaur 1 0.7 6.9 64 grass poison 45
## 2 2 ivysaur 2 1 13 142 grass poison 60
## 3 3 venusaur 3 2 100 236 grass poison 80
## 4 4 charmander 4 0.6 8.5 62 fire <NA> 39
## 5 5 charmeleon 5 1.1 19 142 fire <NA> 58
## 6 6 charizard 6 1.7 90.5 240 fire flying 78
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## # special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## # color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## # generation_id <dbl>, url_image <chr>
# Last six rows of the data.
tail(pokemon.df, n = 6) # ids run up to 10,147, but the table has only 949 rows
## # A tibble: 6 x 22
## id pokemon species_id height weight base_experience type_1 type_2 hp
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl>
## 1 10142 minior-vio~ 774 0.3 0.3 175 rock flying 60
## 2 10143 mimikyu-bu~ 778 0.2 0.7 167 ghost fairy 55
## 3 10144 mimikyu-to~ 778 0.4 2.8 167 ghost fairy 55
## 4 10145 mimikyu-to~ 778 0.4 2.8 167 ghost fairy 55
## 5 10146 kommo-o-to~ 784 2.4 208. 270 dragon fight~ 75
## 6 10147 magearna-o~ 801 1 80.5 120 steel fairy 80
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## # special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## # color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## # generation_id <dbl>, url_image <chr>
Correlation and Regression in base R
Is there a relationship between height and weight?
?cor() # documentation for the correlation function
# Pearson correlation between weight and height.
with(pokemon.df, cor(weight, height, method = "pearson"))
## [1] 0.6561909
# Redraw the weight/height scatter plot and add a linear regression line.
# The axis labels follow the plotted variables: weight is the first (x)
# argument and height the second (y) argument.
plot(pokemon.df$weight, pokemon.df$height,
     xlab = "weight (kg)", ylab = "height (m)",
     main = "pokemon heights and weights")
# lm(height ~ weight) models height as a function of weight, matching the axes.
abline(lm(height ~ weight, data = pokemon.df),
       col = "red")

Is there a relationship between weight and speed?
# Correlation between weight and speed (default method).
with(pokemon.df, cor(weight, speed)) # close to zero: no correlation
## [1] 0.03333363
# Scatter plot of speed against weight; speeds are assumed to be in m/s.
with(
  pokemon.df,
  plot(
    x = weight, y = speed,
    main = "relationship between pokemon speed and weight",
    xlab = "weight (kg)", ylab = "speed (m/s)"
  )
)
# Overlay the linear fit of speed on weight.
abline(
  lm(speed ~ weight, data = pokemon.df),
  col = "blue"
)

Using the ggplot2 package
# ggplot2 belongs to the tidyverse family of packages and offers a
# different syntax for building the same kinds of plots.
# The `+` operator means "add this component to the plot".
ggplot() # step 1: an empty plot

ggplot(pokemon.df, mapping = aes(x = weight, y = height)) # step 2: data and axes

ggplot(pokemon.df, mapping = aes(x = weight, y = height)) +
  geom_point() # step 3: add the points

Now let’s pimp up the ggplot by adding color and shapes.
# Dress up the plot: color the points by their type_1 category, add a title,
# switch to the black-and-white theme, and overlay a smoothed trend line.
ggplot(pokemon.df, aes(x = weight, y = height)) +
  geom_point(aes(color = type_1), shape = 25) +
  geom_smooth() + # smoothing method is chosen by default
  labs(title = "heights and weights of pokemon according to type_1") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

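To summarize: there seems to be a maximum for heights as weight
increases. Most of the pokemon are clustered between 0 and 5 m in height
and 0 and 500 kg in weight. The dark blue line is the smoothed (loess)
regression line. Specify the ‘method=’ argument of geom_smooth() (for
example, method = "lm") to choose a different fitting method.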
We can calculate the range, average, and median for the heights and
weights
# Descriptive statistics for height and weight.
range(pokemon.df[["height"]]) # smallest and largest height
## [1] 0.1 14.5
mean(pokemon.df[["height"]]) # average height
## [1] 1.22824
median(pokemon.df[["height"]]) # median height
## [1] 1
range(pokemon.df[["weight"]]) # smallest and largest weight
## [1] 0.1 999.9
mean(pokemon.df[["weight"]]) # average weight
## [1] 66.21317
median(pokemon.df[["weight"]]) # median weight
## [1] 28.8
Box plots
# Box plots of height, one box per first type (type_1), colored by type.
ggplot(pokemon.df, aes(x = type_1, y = height, color = type_1)) +
  geom_boxplot() +
  labs(title = "height distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5)) # tilt x labels

# Box plots of weight, one box per first type (type_1), colored by type.
ggplot(pokemon.df, aes(x = type_1, y = weight, color = type_1)) +
  geom_boxplot() +
  labs(title = "weight distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5)) # tilt x labels

# Box plots of HP, one box per first type (type_1), colored by type.
ggplot(pokemon.df, aes(x = type_1, y = hp, color = type_1)) +
  geom_boxplot() +
  labs(title = "health point distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5)) # tilt x labels

# Box plots of speed, one box per first type (type_1), colored by type.
ggplot(pokemon.df, aes(x = type_1, y = speed, color = type_1)) +
  geom_boxplot() +
  labs(title = "speed distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5)) # tilt x labels

Column plots using ggplot
# Column plot: pokemon names on the x axis, defense score as the bar height.
ggplot(pokemon.df, aes(x = pokemon, y = defense)) +
  geom_col() +
  labs(title = "defense scores for all pokemon")

What is the difference between geom_col and geom_bar? Use ?geom_col
and ?geom_bar.
Density plots
Density plots are similar to histograms. For the following, assume
height is in meters.
# Density curves of height, one colored curve per type_1 category.
# The legend is moved to the left of the plot.
ggplot(
  data = pokemon.df, aes(x = height)
) +
  geom_density(aes(color = type_1)) +
  theme_classic() +
  labs(title = "density distributions of pokemon heights based on their type_1") +
  theme(legend.position = "left")

# Density curves of height, one colored curve per type_2 category.
# Note: type_2 contains NA for some pokemon (see the glimpse() output).
# The legend is moved to the left of the plot.
ggplot(
  data = pokemon.df, aes(x = height)
) +
  geom_density(aes(color = type_2)) +
  theme_classic() +
  labs(title = "density distributions of pokemon heights based on their type_2") +
  theme(legend.position = "left")

Correlograms with base R and Corrplot Package
# Keep only the numeric attribute columns, as a plain data frame.
data2 <- as.data.frame(
  pokemon.df[, c("height", "weight", "base_experience", "hp", "attack",
                 "defense", "special_attack", "special_defense", "speed")]
)
# Give the columns display-friendly names (spaces instead of underscores).
names(data2) <- c("height", "weight", "base experience", "hp", "attack",
                  "defense", "special attack", "special defense", "speed")
# Scatter plot matrix: one panel for every pair of variables.
pairs(data2)

Let’s visualize the strength of the relationships between the variables.
Use the cor() function to get the correlation values, then plot them as a
correlogram using the corrplot() function from the corrplot package.
#correlation test, matrix
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Spearman correlation between every pair of columns in data2.
pokemon.cor <- cor(data2, method = "spearman")
print(pokemon.cor)
## height weight base experience hp attack
## height 1.0000000 0.8442241 0.6653778 0.6390209 0.5975432
## weight 0.8442241 1.0000000 0.5830315 0.5977866 0.5595202
## base experience 0.6653778 0.5830315 1.0000000 0.6965262 0.6590522
## hp 0.6390209 0.5977866 0.6965262 1.0000000 0.5640783
## attack 0.5975432 0.5595202 0.6590522 0.5640783 1.0000000
## defense 0.4757982 0.5316826 0.6211443 0.4453686 0.5049619
## special attack 0.4709723 0.3424776 0.6867004 0.4841647 0.3601834
## special defense 0.4744299 0.4437225 0.7170422 0.4907858 0.3118763
## speed 0.2699755 0.1199377 0.5075607 0.2243894 0.3464289
## defense special attack special defense speed
## height 0.47579817 0.4709723 0.4744299 0.26997549
## weight 0.53168263 0.3424776 0.4437225 0.11993774
## base experience 0.62114431 0.6867004 0.7170422 0.50756068
## hp 0.44536856 0.4841647 0.4907858 0.22438943
## attack 0.50496193 0.3601834 0.3118763 0.34642892
## defense 1.00000000 0.3109700 0.5997917 0.04219941
## special attack 0.31096997 1.0000000 0.5543091 0.43621795
## special defense 0.59979175 0.5543091 1.0000000 0.26823991
## speed 0.04219941 0.4362180 0.2682399 1.00000000
# Lower-triangle correlogram drawn with shaded cells.
corrplot(pokemon.cor, type = "lower", method = "shade")

# Same correlogram, showing the correlation coefficients as numbers.
corrplot(pokemon.cor, type = "lower", method = "number")

Choose the most appropriate correlation test based on its assumptions.
The Pearson test assumes that the data are normally distributed. The
Spearman correlation test does not have a distribution assumption for
the data.
Multi Histograms
# Overlay one semi-transparent histogram per attribute in data2. The constant
# string given to `fill` inside aes() labels each layer in the legend.
# No `bins`/`binwidth` is set, so every layer uses the stat_bin() default of
# 30 bins (ggplot2 reports this in a message); the subtitle states that.
data2 |>
  ggplot() +
  geom_histogram(aes(height, fill = "height"), alpha = 0.3) +
  geom_histogram(aes(weight, fill = "weight"), alpha = 0.3) +
  geom_histogram(aes(`base experience`, fill = "base experience"), alpha = 0.3) +
  geom_histogram(aes(hp, fill = "hp"), alpha = 0.3) +
  geom_histogram(aes(attack, fill = "attack"), alpha = 0.3) +
  geom_histogram(aes(defense, fill = "defense"), alpha = 0.3) +
  geom_histogram(aes(`special attack`, fill = "special attack"), alpha = 0.3) +
  geom_histogram(aes(`special defense`, fill = "special defense"), alpha = 0.3) +
  geom_histogram(aes(speed, fill = "speed"), alpha = 0.3) +
  theme_void() +
  labs(
    title = "distributions of all pokemon attributes by their counts",
    subtitle = "each histogram uses 30 bins"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The overlapping makes it a little hard to see each distribution. We
can develop an interactive plot from the ggplot we just made by storing
the plot in an object and using the plotly function ggplotly(). Here we
store the ggplot multi-histogram as “histogram”.
# Store the overlaid multi-histogram in `histogram` so it can be reused later.
# No `bins`/`binwidth` is set, so every layer uses the stat_bin() default of
# 30 bins; the subtitle states that.
histogram <- ggplot(data = data2) +
  geom_histogram(aes(height, fill = "height"), alpha = 0.3) +
  geom_histogram(aes(weight, fill = "weight"), alpha = 0.3) +
  geom_histogram(aes(`base experience`, fill = "base experience"), alpha = 0.3) +
  geom_histogram(aes(hp, fill = "hp"), alpha = 0.3) +
  geom_histogram(aes(attack, fill = "attack"), alpha = 0.3) +
  geom_histogram(aes(defense, fill = "defense"), alpha = 0.3) +
  geom_histogram(aes(`special attack`, fill = "special attack"), alpha = 0.3) +
  geom_histogram(aes(`special defense`, fill = "special defense"), alpha = 0.3) +
  geom_histogram(aes(speed, fill = "speed"), alpha = 0.3) +
  theme_void() +
  labs(
    title = "distributions of all pokemon attributes by their counts",
    subtitle = "each histogram uses 30 bins"
  )
Making an interactive plot from ggplot2 using plotly
This is an interactive plot.
#make an interactive plot
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Convert the stored ggplot object into an interactive plotly widget.
ggplotly(p = histogram)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now comparing each distribution, not all are bell-shaped (normally
distributed). For example, weight is right skewed with extreme weight
values to the right. For height there isn’t much variation, however we
could transform meters to centimeters to better see the distribution of
heights relative to the scale of the other attributes.
Facet wrapping in ggplot2
Facet wrapping lets us group the plot by a certain variable.
# Histogram of speed, split into one panel per type_1 category.
# Outline and fill are both colored by type_1; the legend is hidden.
ggplot(pokemon.df, aes(x = speed, color = type_1, fill = type_1)) +
  geom_histogram() +
  facet_wrap(vars(type_1)) +
  labs(title = "speeds of pokemon based on type_1") +
  theme_classic() +
  theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

THE END