Tutorial followed and modified from: https://brunomioto.com/posts/intro_ggplot2/en/#the-data

Check out my rpubs site for other scripts: https://rpubs.com/deyvis305/

Main Packages required for this tutorial

library(dplyr)
## Warning: package 'dplyr' was built under R version 4.1.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ggpath)                     
## Warning: package 'ggpath' was built under R version 4.1.3
library(magick)
## Warning: package 'magick' was built under R version 4.1.3
## Linking to ImageMagick 6.9.12.3
## Enabled features: cairo, freetype, fftw, ghostscript, heic, lcms, pango, raw, rsvg, webp
## Disabled features: fontconfig, x11
library(pokemon)
## Warning: package 'pokemon' was built under R version 4.1.3

Pokemon data from the “pokemon” package

# Load the dataset bundled with the 'pokemon' package into the environment.
# Note: both '<-' and '=' work as assignment operators at the top level.
pokemon.df <- pokemon::pokemon

Base R plot of pokemon weight and height

# Base R scatter plot: weight on the x-axis, height on the y-axis.
plot(x = pokemon.df$weight, y = pokemon.df$height)

Label the axes.

# Same scatter plot, now with axis labels and a title.
plot(
  x = pokemon.df$weight,
  y = pokemon.df$height,
  main = "pokemon heights and weights",
  xlab = "kg",
  ylab = "m"
)
# Open the documentation for plot(); typing ?plot does the same thing.
help(plot)
## starting httpd help server ... done

Look at the data.

# Glimpse the data: one line per column with its type and first values.
# Notice the type of each column and any missing values (NA).
dplyr::glimpse(pokemon.df)
## Rows: 949
## Columns: 22
## $ id              <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ pokemon         <chr> "bulbasaur", "ivysaur", "venusaur", "charmander", "cha~
## $ species_id      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,~
## $ height          <dbl> 0.7, 1.0, 2.0, 0.6, 1.1, 1.7, 0.5, 1.0, 1.6, 0.3, 0.7,~
## $ weight          <dbl> 6.9, 13.0, 100.0, 8.5, 19.0, 90.5, 9.0, 22.5, 85.5, 2.~
## $ base_experience <dbl> 64, 142, 236, 62, 142, 240, 63, 142, 239, 39, 72, 178,~
## $ type_1          <chr> "grass", "grass", "grass", "fire", "fire", "fire", "wa~
## $ type_2          <chr> "poison", "poison", "poison", NA, NA, "flying", NA, NA~
## $ hp              <dbl> 45, 60, 80, 39, 58, 78, 44, 59, 79, 45, 50, 60, 40, 45~
## $ attack          <dbl> 49, 62, 82, 52, 64, 84, 48, 63, 83, 30, 20, 45, 35, 25~
## $ defense         <dbl> 49, 63, 83, 43, 58, 78, 65, 80, 100, 35, 55, 50, 30, 5~
## $ special_attack  <dbl> 65, 80, 100, 60, 80, 109, 50, 65, 85, 20, 25, 90, 20, ~
## $ special_defense <dbl> 65, 80, 100, 50, 65, 85, 64, 80, 105, 20, 25, 80, 20, ~
## $ speed           <dbl> 45, 60, 80, 65, 80, 100, 43, 58, 78, 45, 30, 70, 50, 3~
## $ color_1         <chr> "#78C850", "#78C850", "#78C850", "#F08030", "#F08030",~
## $ color_2         <chr> "#A040A0", "#A040A0", "#A040A0", NA, NA, "#A890F0", NA~
## $ color_f         <chr> "#81A763", "#81A763", "#81A763", NA, NA, "#DE835E", NA~
## $ egg_group_1     <chr> "monster", "monster", "monster", "monster", "monster",~
## $ egg_group_2     <chr> "plant", "plant", "plant", "dragon", "dragon", "dragon~
## $ url_icon        <chr> "//archives.bulbagarden.net/media/upload/7/7b/001MS6.p~
## $ generation_id   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ~
## $ url_image       <chr> "https://raw.githubusercontent.com/HybridShivam/Pokemo~
# Head of the data: the first six rows.
head(pokemon.df, n = 6L)
## # A tibble: 6 x 22
##      id pokemon    species_id height weight base_experience type_1 type_2    hp
##   <dbl> <chr>           <dbl>  <dbl>  <dbl>           <dbl> <chr>  <chr>  <dbl>
## 1     1 bulbasaur           1    0.7    6.9              64 grass  poison    45
## 2     2 ivysaur             2    1     13               142 grass  poison    60
## 3     3 venusaur            3    2    100               236 grass  poison    80
## 4     4 charmander          4    0.6    8.5              62 fire   <NA>      39
## 5     5 charmeleon          5    1.1   19               142 fire   <NA>      58
## 6     6 charizard           6    1.7   90.5             240 fire   flying    78
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## #   special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## #   color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## #   generation_id <dbl>, url_image <chr>
# Tail of the data: the last six rows.
# The last id is 10147, but the dataset itself has 949 rows (see glimpse).
tail(pokemon.df, n = 6L)
## # A tibble: 6 x 22
##      id pokemon     species_id height weight base_experience type_1 type_2    hp
##   <dbl> <chr>            <dbl>  <dbl>  <dbl>           <dbl> <chr>  <chr>  <dbl>
## 1 10142 minior-vio~        774    0.3    0.3             175 rock   flying    60
## 2 10143 mimikyu-bu~        778    0.2    0.7             167 ghost  fairy     55
## 3 10144 mimikyu-to~        778    0.4    2.8             167 ghost  fairy     55
## 4 10145 mimikyu-to~        778    0.4    2.8             167 ghost  fairy     55
## 5 10146 kommo-o-to~        784    2.4  208.              270 dragon fight~    75
## 6 10147 magearna-o~        801    1     80.5             120 steel  fairy     80
## # i 13 more variables: attack <dbl>, defense <dbl>, special_attack <dbl>,
## #   special_defense <dbl>, speed <dbl>, color_1 <chr>, color_2 <chr>,
## #   color_f <chr>, egg_group_1 <chr>, egg_group_2 <chr>, url_icon <chr>,
## #   generation_id <dbl>, url_image <chr>

Correlation and Regression in base R

Is there a relationship between height and weight?

# Open the documentation for the correlation function.
help(cor)
# Pearson correlation between weight and height.
cor(x = pokemon.df$weight, y = pokemon.df$height, method = "pearson")
## [1] 0.6561909
# Add a linear regression line to the previous plot.
# Weight is on the x-axis and height on the y-axis, so the labels must read
# "weight (kg)" for x and "height (m)" for y; this also matches the
# height ~ weight model drawn by abline().
plot(pokemon.df$weight, pokemon.df$height,
     xlab = "weight (kg)", ylab = "height (m)",
     main = "pokemon heights and weights")
abline(lm(height ~ weight, data = pokemon.df),
       col = "red")

Is there a relationship between weight and speed?

# Correlation between weight and speed: close to zero, so essentially no
# linear relationship.
with(pokemon.df, cor(weight, speed))
## [1] 0.03333363
# Assume that pokemon speeds are in m/s units.
plot(
  x = pokemon.df$weight,
  y = pokemon.df$speed,
  main = "relationship between pokemon speed and weight",
  xlab = "weight (kg)",
  ylab = "speed (m/s)"
)
# Overlay the fitted straight line from a linear model of speed on weight.
abline(lm(speed ~ weight, data = pokemon.df), col = "blue")

Using the ggplot2 package

# ggplot2 is part of the tidyverse packages and offers a different syntax
# for doing the same things as base R plotting.
# The + sign means 'add this' to the plot.
# Step 1: an empty plot.
ggplot()

# Step 2: supply the data and map weight to x and height to y.
ggplot(pokemon.df, aes(x = weight, y = height))

# Step 3: add a layer of points.
ggplot(pokemon.df, aes(x = weight, y = height)) +
  geom_point()

Now let’s pimp up the ggplot by adding color and shapes.

# Dress up the plot: point color follows the type_1 data categories.
ggplot(pokemon.df, aes(x = weight, y = height)) +
  geom_point(aes(color = type_1), shape = 25) +
  # adds a smooth regression line using ggplot2's default method
  geom_smooth() +
  ggtitle("heights and weights of pokemon according to type_1") +
  theme_bw()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

To summarize: there seems to be a maximum height as weight increases. Most of the pokemon are clustered between heights of 0 to 5 m and weights of 0 to 500 kg. The dark blue line is the smoothed regression line. Specify the ‘method=’ argument to choose a different fitting method (for example, method = "lm" for a straight line).

We can calculate the range, average, and median for the heights and weights

#descriptive statistics for height and weight
#(units assumed to be meters and kilograms, as in the plot labels above)
range(pokemon.df$height) #smallest and largest height
## [1]  0.1 14.5
mean(pokemon.df$height)  #average height
## [1] 1.22824
median(pokemon.df$height)#median height
## [1] 1
range(pokemon.df$weight) #smallest and largest weight
## [1]   0.1 999.9
mean(pokemon.df$weight)  #average weight
## [1] 66.21317
median(pokemon.df$weight)#median weight; well below the mean, so weight is right skewed
## [1] 28.8

Box plots

# Box plots grouped by each pokemon's FIRST type (type_1), one per attribute.
# The x-axis labels are tilted by 50 degrees, presumably so the type names
# do not overlap.

# heights
ggplot(pokemon.df, aes(x = type_1, y = height, color = type_1)) +
  geom_boxplot() +
  ggtitle("height distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5))

# weights
ggplot(pokemon.df, aes(x = type_1, y = weight, color = type_1)) +
  geom_boxplot() +
  ggtitle("weight distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5))

# HP
ggplot(pokemon.df, aes(x = type_1, y = hp, color = type_1)) +
  geom_boxplot() +
  ggtitle("health point distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5))

# speeds
ggplot(pokemon.df, aes(x = type_1, y = speed, color = type_1)) +
  geom_boxplot() +
  ggtitle("speed distributions based on pokemon's first type") +
  theme(axis.text.x = element_text(angle = 50, vjust = 0.5))

Column plots using ggplot

# Column plot: pokemon names on the x-axis, defense score as the bar height.
ggplot(pokemon.df, aes(x = pokemon, y = defense)) +
  geom_col() +
  ggtitle("defense scores for all pokemon")

What is the difference between geom_col and geom_bar? Use ?geom_col and ?geom_bar.

Density plots

Density plots are similar to histograms. For the following, assume height is in meters.

# Density curves of height, one curve per first type (type_1).
# Title typo fixed: "hieghts" -> "heights".
ggplot(pokemon.df, aes(x = height)) +
  geom_density(aes(color = type_1)) +
  theme_classic() +
  labs(title = "density distributions of pokemon heights based on their type_1") +
  theme(legend.position = "left")

# Same plot, with one curve per second type (type_2).
ggplot(pokemon.df, aes(x = height)) +
  geom_density(aes(color = type_2)) +
  theme_classic() +
  labs(title = "density distributions of pokemon heights based on their type_2") +
  theme(legend.position = "left")

Correlograms with base R and Corrplot Package

# Collect the numerical attribute columns in a plain data frame and give each
# one a readable name. check.names = FALSE keeps the names that contain
# spaces exactly as written.
data2 <- data.frame(
  height = pokemon.df$height,
  weight = pokemon.df$weight,
  `base experience` = pokemon.df$base_experience,
  hp = pokemon.df$hp,
  attack = pokemon.df$attack,
  defense = pokemon.df$defense,
  `special attack` = pokemon.df$special_attack,
  `special defense` = pokemon.df$special_defense,
  speed = pokemon.df$speed,
  check.names = FALSE
)

# Scatter plot matrix: every variable plotted against every other variable.
pairs(data2)

Let’s visualize the strength of the relationship between each pair of variables. Use the cor() function to get the correlation values, then draw a correlogram with the corrplot() function from the corrplot package.

#correlation test, matrix
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.1.3
## corrplot 0.92 loaded
# Spearman correlation between every pair of columns in data2.
pokemon.cor <- cor(x = data2, method = "spearman")
print(pokemon.cor)
##                    height    weight base experience        hp    attack
## height          1.0000000 0.8442241       0.6653778 0.6390209 0.5975432
## weight          0.8442241 1.0000000       0.5830315 0.5977866 0.5595202
## base experience 0.6653778 0.5830315       1.0000000 0.6965262 0.6590522
## hp              0.6390209 0.5977866       0.6965262 1.0000000 0.5640783
## attack          0.5975432 0.5595202       0.6590522 0.5640783 1.0000000
## defense         0.4757982 0.5316826       0.6211443 0.4453686 0.5049619
## special attack  0.4709723 0.3424776       0.6867004 0.4841647 0.3601834
## special defense 0.4744299 0.4437225       0.7170422 0.4907858 0.3118763
## speed           0.2699755 0.1199377       0.5075607 0.2243894 0.3464289
##                    defense special attack special defense      speed
## height          0.47579817      0.4709723       0.4744299 0.26997549
## weight          0.53168263      0.3424776       0.4437225 0.11993774
## base experience 0.62114431      0.6867004       0.7170422 0.50756068
## hp              0.44536856      0.4841647       0.4907858 0.22438943
## attack          0.50496193      0.3601834       0.3118763 0.34642892
## defense         1.00000000      0.3109700       0.5997917 0.04219941
## special attack  0.31096997      1.0000000       0.5543091 0.43621795
## special defense 0.59979175      0.5543091       1.0000000 0.26823991
## speed           0.04219941      0.4362180       0.2682399 1.00000000
# Correlogram of the lower triangle, drawn with the "shade" method.
corrplot(corr = pokemon.cor, type = "lower", method = "shade")

# Same correlogram, drawn with the "number" method.
corrplot(corr = pokemon.cor, type = "lower", method = "number")

Choose the most appropriate correlation test based on its assumptions. The Pearson test assumes that the data are normally distributed, whereas the Spearman correlation test makes no assumption about the distribution of the data.

Multi Histograms

# Overlay one semi-transparent histogram per attribute on a single plot.
# The constant string given to `fill` inside aes() becomes the legend label.
# No `bins` or `binwidth` is set, so each histogram uses ggplot2's default of
# 30 bins (see the stat_bin() messages); the subtitle now says so correctly.
data2 |>
  ggplot() +
  geom_histogram(aes(height, fill = "height"), alpha = 0.3) +
  geom_histogram(aes(weight, fill = "weight"), alpha = 0.3) +
  geom_histogram(aes(`base experience`, fill = "base experience"), alpha = 0.3) +
  geom_histogram(aes(hp, fill = "hp"), alpha = 0.3) +
  geom_histogram(aes(attack, fill = "attack"), alpha = 0.3) +
  geom_histogram(aes(defense, fill = "defense"), alpha = 0.3) +
  geom_histogram(aes(`special attack`, fill = "special attack"), alpha = 0.3) +
  geom_histogram(aes(`special defense`, fill = "special defense"), alpha = 0.3) +
  geom_histogram(aes(speed, fill = "speed"), alpha = 0.3) +
  theme_void() +
  labs(
    title = "distributions of all pokemon attributes by their counts",
    subtitle = "each histogram uses 30 bins"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

The overlapping makes it a little hard to see each distribution. We can develop an interactive plot from the ggplot we just made by storing the plot in an object and using the plotly function ggplotly(). Here we store the ggplot multi-histogram as “histogram”.

# Store the overlaid multi-histogram in `histogram` so it can be reused.
# The constant string given to `fill` inside aes() becomes the legend label.
# No `bins` or `binwidth` is set, so each histogram uses ggplot2's default of
# 30 bins; the subtitle now says so correctly.
histogram <- ggplot(data = data2) +
  geom_histogram(aes(height, fill = "height"), alpha = 0.3) +
  geom_histogram(aes(weight, fill = "weight"), alpha = 0.3) +
  geom_histogram(aes(`base experience`, fill = "base experience"), alpha = 0.3) +
  geom_histogram(aes(hp, fill = "hp"), alpha = 0.3) +
  geom_histogram(aes(attack, fill = "attack"), alpha = 0.3) +
  geom_histogram(aes(defense, fill = "defense"), alpha = 0.3) +
  geom_histogram(aes(`special attack`, fill = "special attack"), alpha = 0.3) +
  geom_histogram(aes(`special defense`, fill = "special defense"), alpha = 0.3) +
  geom_histogram(aes(speed, fill = "speed"), alpha = 0.3) +
  theme_void() +
  labs(
    title = "distributions of all pokemon attributes by their counts",
    subtitle = "each histogram uses 30 bins"
  )

Making an interactive plot from ggplot2 using plotly

This is an interactive plot.

#make an interactive plot
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Turn the stored ggplot object into an interactive plotly plot.
ggplotly(p = histogram)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Now comparing each distribution, not all are bell-shaped (normally distributed). For example, weight is right skewed with extreme weight values to the right. For height there isn’t much variation, however we could transform meters to centimeters to better see the distribution of heights relative to the scale of the other attributes.

Facet wrapping in ggplot2

Facet wrapping lets us group the plot by a certain variable.

# One speed histogram per type_1 category, each in its own panel.
# The legend is hidden, presumably because every panel is already labelled
# with its type. theme(legend.position) must come after theme_classic().
ggplot(pokemon.df, aes(x = speed, color = type_1, fill = type_1)) +
  geom_histogram() +
  facet_wrap(~type_1) +
  ggtitle("speeds of pokemon based on type_1") +
  theme_classic() +
  theme(legend.position = "none")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

THE END