library(tidyverse)
#> Warning: package 'ggplot2' was built under R version 4.5.3
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr 1.1.4 ✔ readr 2.1.5
#> ✔ forcats 1.0.0 ✔ stringr 1.5.1
#> ✔ ggplot2 4.0.2 ✔ tibble 3.3.0
#> ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
#> ✔ purrr 1.1.0
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag() masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(palmerpenguins)
#> Warning: package 'palmerpenguins' was built under R version 4.5.3
#>
#> Attaching package: 'palmerpenguins'
#>
#> The following objects are masked from 'package:datasets':
#>
#> penguins, penguins_raw
library(FactoMineR)
#> Warning: package 'FactoMineR' was built under R version 4.5.3
library(factoextra)
#> Warning: package 'factoextra' was built under R version 4.5.3
#> Welcome to factoextra!
#> Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(psych)
#> Warning: package 'psych' was built under R version 4.5.3
#>
#> Attaching package: 'psych'
#>
#> The following objects are masked from 'package:ggplot2':
#>
#> %+%, alpha
library(class)
library(rpart)
library(rpart.plot)
#> Warning: package 'rpart.plot' was built under R version 4.5.3
library(caret)
#> Loading required package: lattice
#>
#> Attaching package: 'caret'
#>
#> The following object is masked from 'package:purrr':
#>
#> lift
library(dplyr)
library(car)
#> Loading required package: carData
#>
#> Attaching package: 'car'
#>
#> The following object is masked from 'package:psych':
#>
#> logit
#>
#> The following object is masked from 'package:dplyr':
#>
#> recode
#>
#> The following object is masked from 'package:purrr':
#>
#> some
library(ggplot2)
library(GGally)
#> Warning: package 'GGally' was built under R version 4.5.3
library(corrplot)
#> Warning: package 'corrplot' was built under R version 4.5.3
#> corrplot 0.95 loaded

Explore the datasets:
- How many variables and observations?
- Which variables are continuous?
- Are there missing values?

Create:
- Pairwise plots
- Correlation matrices
- Heatmaps
- What patterns do you observe?
- Which dataset is more suited for which type of analysis?
library(palmerpenguins)
# Copy the dataset into the workspace, naming its package explicitly: the
# masking message printed when palmerpenguins was attached shows that the
# datasets package also provides a `penguins` object, so a bare `penguins`
# would depend on the search-path order.
penguins <- palmerpenguins::penguins
# Compact overview of column types and the first few values.
str(penguins)
#> tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
#> $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
#> $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
#> $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
#> $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
#> $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
#> $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
#> $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
#> $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Per-column summary: quartiles and mean for numeric columns, level counts for
# factors, and the number of NA's per column.
summary(penguins)
#> species island bill_length_mm bill_depth_mm
#> Adelie :152 Biscoe :168 Min. :32.10 Min. :13.10
#> Chinstrap: 68 Dream :124 1st Qu.:39.23 1st Qu.:15.60
#> Gentoo :124 Torgersen: 52 Median :44.45 Median :17.30
#> Mean :43.92 Mean :17.15
#> 3rd Qu.:48.50 3rd Qu.:18.70
#> Max. :59.60 Max. :21.50
#> NA's :2 NA's :2
#> flipper_length_mm body_mass_g sex year
#> Min. :172.0 Min. :2700 female:165 Min. :2007
#> 1st Qu.:190.0 1st Qu.:3550 male :168 1st Qu.:2007
#> Median :197.0 Median :4050 NA's : 11 Median :2008
#> Mean :200.9 Mean :4202 Mean :2008
#> 3rd Qu.:213.0 3rd Qu.:4750 3rd Qu.:2009
#> Max. :231.0 Max. :6300 Max. :2009
#> NA's :2 NA's :2
# Number of rows (observations) and columns (variables).
dim(penguins)
#> [1] 344 8

The penguin dataset has 344 observations and 8
variables, of which 3 are categorical: species, island, and sex, and 5
are numerical: bill_length_mm, bill_depth_mm, flipper_length_mm,
body_mass_g, and year.
To identify missing values, we can use the is.na()
function. The easiest way to deal with missing values is to remove
rows that contain missing values from the dataset using the
na.omit() function. Note that this is not always the best
option, especially when there are a lot of missing values. Other options
exist, including multiple imputation. However, these are not the focus
of this course.
The penguin dataset contains 11 rows with missing values (19 missing values in total). After removal, we are left with a dataset of 333 rows and 8 variables.
For some analyses, we only use continuous variables:
# Drop every row that contains at least one missing value, as described in the
# text above (344 rows -> 333 rows).
penguins_clean <- na.omit(penguins)

# Keep only the four continuous body measurements. `dplyr::select()` is
# namespaced because MASS, which is attached further down, masks `select()`.
penguins_cont <- penguins_clean %>%
  dplyr::select(
    bill_length_mm,
    bill_depth_mm,
    flipper_length_mm,
    body_mass_g
  )

Since the penguin dataset only has 4 continuous
variables that we’re interested in, we can make pairwise scatterplots of
all the combinations of variables.
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#> select
# Scatterplot matrix of the four continuous measurements (car package).
car::scatterplotMatrix(penguins_cont)

# The same four measurements drawn with GGally and coloured by species.
# The names below are columns 3 to 6 of `penguins`.
GGally::ggpairs(
  penguins,
  mapping = aes(color = species),
  columns = c(
    "bill_length_mm",
    "bill_depth_mm",
    "flipper_length_mm",
    "body_mass_g"
  )
)
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_density()`).
#> Warning: Removed 2 rows containing missing values
#> Removed 2 rows containing missing values
#> Removed 2 rows containing missing values
#> Warning: Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_density()`).
#> Warning: Removed 2 rows containing missing values
#> Removed 2 rows containing missing values
#> Warning: Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_density()`).
#> Warning: Removed 2 rows containing missing values
#> Warning: Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Removed 2 rows containing missing values or values outside the scale range
#> (`geom_point()`).
#> Warning: Removed 2 rows containing non-finite outside the scale range
#> (`stat_density()`).
We can see, based on the pairwise scatterplots colored per penguin species, that there is clear group separation between the three different species.
# Pairwise correlation matrix of the four continuous measurements
# (cor() computes Pearson correlations by default).
cor(penguins_cont)
#> bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
#> bill_length_mm 1.0000000 -0.2286256 0.6530956 0.5894511
#> bill_depth_mm -0.2286256 1.0000000 -0.5777917 -0.4720157
#> flipper_length_mm 0.6530956 -0.5777917 1.0000000 0.8729789
#> body_mass_g 0.5894511 -0.4720157 0.8729789 1.0000000
We can see that there are moderate to strong correlations between the variables
in the penguin dataset (for example, 0.87 between flipper length and body mass).
# Reshape the correlation matrix into long format: one row per pair of
# variables, with the row variable in `rowname`, the column variable in `name`
# and the correlation in `value`.
penguins_cormat <- cor(penguins_cont) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  pivot_longer(-rowname)

# Heatmap of the correlations: one tile per pair, labelled with the rounded
# coefficient, on a diverging red-white-green scale centred at 0.
penguins_cormat %>%
  ggplot(aes(x = rowname, y = name, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white") +
  scale_fill_gradient2(
    low = "red",
    high = "darkgreen",
    mid = "white",
    midpoint = 0,
    # Spelled out in full; `limit` only worked through partial matching.
    limits = c(-1, 1),
    name = "pearson\nCorrelation"
  )
library(FactoMineR)
# Load the decathlon example data that ships with FactoMineR into the
# workspace, then show its structure.
data("decathlon", package = "FactoMineR")
utils::str(decathlon)
#> 'data.frame': 41 obs. of 13 variables:
#> $ 100m : num 11 10.8 11 11 11.3 ...
#> $ Long.jump : num 7.58 7.4 7.3 7.23 7.09 7.6 7.3 7.31 6.81 7.56 ...
#> $ Shot.put : num 14.8 14.3 14.8 14.2 15.2 ...
#> $ High.jump : num 2.07 1.86 2.04 1.92 2.1 1.98 2.01 2.13 1.95 1.86 ...
#> $ 400m : num 49.8 49.4 48.4 48.9 50.4 ...
#> $ 110m.hurdle: num 14.7 14.1 14.1 15 15.3 ...
#> $ Discus : num 43.8 50.7 49 40.9 46.3 ...
#> $ Pole.vault : num 5.02 4.92 4.92 5.32 4.72 4.92 4.42 4.42 4.92 4.82 ...
#> $ Javeline : num 63.2 60.1 50.3 62.8 63.4 ...
#> $ 1500m : num 292 302 300 280 276 ...
#> $ Rank : int 1 2 3 4 5 6 7 8 9 10 ...
#> $ Points : int 8217 8122 8099 8067 8036 8030 8004 7995 7802 7733 ...
#> $ Competition: Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary: quartiles and mean for numeric columns, level counts for
# the Competition factor.
summary(decathlon)
#> 100m Long.jump Shot.put High.jump 400m
#> Min. :10.44 Min. :6.61 Min. :12.68 Min. :1.850 Min. :46.81
#> 1st Qu.:10.85 1st Qu.:7.03 1st Qu.:13.88 1st Qu.:1.920 1st Qu.:48.93
#> Median :10.98 Median :7.30 Median :14.57 Median :1.950 Median :49.40
#> Mean :11.00 Mean :7.26 Mean :14.48 Mean :1.977 Mean :49.62
#> 3rd Qu.:11.14 3rd Qu.:7.48 3rd Qu.:14.97 3rd Qu.:2.040 3rd Qu.:50.30
#> Max. :11.64 Max. :7.96 Max. :16.36 Max. :2.150 Max. :53.20
#> 110m.hurdle Discus Pole.vault Javeline
#> Min. :13.97 Min. :37.92 Min. :4.200 Min. :50.31
#> 1st Qu.:14.21 1st Qu.:41.90 1st Qu.:4.500 1st Qu.:55.27
#> Median :14.48 Median :44.41 Median :4.800 Median :58.36
#> Mean :14.61 Mean :44.33 Mean :4.762 Mean :58.32
#> 3rd Qu.:14.98 3rd Qu.:46.07 3rd Qu.:4.920 3rd Qu.:60.89
#> Max. :15.67 Max. :51.65 Max. :5.400 Max. :70.52
#> 1500m Rank Points Competition
#> Min. :262.1 Min. : 1.00 Min. :7313 Decastar:13
#> 1st Qu.:271.0 1st Qu.: 6.00 1st Qu.:7802 OlympicG:28
#> Median :278.1 Median :11.00 Median :8021
#> Mean :279.0 Mean :12.12 Mean :8005
#> 3rd Qu.:285.1 3rd Qu.:18.00 3rd Qu.:8122
#> Max. :317.0 Max. :28.00 Max. :8893
# Number of rows (observations) and columns (variables).
dim(decathlon)
#> [1] 41 13

The decathlon dataset has 41 observations of 13
variables, of which 12 are numerical and 1 is categorical
(Competition).
The decathlon dataset does not contain any missing values.
For some analyses, we only use the explanatory variables related to the 10 different decathlon events.
Contrary to the penguin dataset, we see that there is no clear group separation in the decathlon dataset. There seem to be more complex multivariate relationships between the variables in the dataset.
# Keep only the results of the ten events (the first ten columns of
# `decathlon`), leaving out Rank, Points and Competition. Columns are selected
# by name so the result does not depend on column order.
decathlon_cont <- decathlon[, c(
  "100m", "Long.jump", "Shot.put", "High.jump", "400m",
  "110m.hurdle", "Discus", "Pole.vault", "Javeline", "1500m"
)]

# Pairwise correlation matrix of the ten events
# (cor() computes Pearson correlations by default).
cor(decathlon_cont)
#> 100m Long.jump Shot.put High.jump 400m
#> 100m 1.00000000 -0.59867767 -0.35648227 -0.24625292 0.520298155
#> Long.jump -0.59867767 1.00000000 0.18330436 0.29464444 -0.602062618
#> Shot.put -0.35648227 0.18330436 1.00000000 0.48921153 -0.138432919
#> High.jump -0.24625292 0.29464444 0.48921153 1.00000000 -0.187956928
#> 400m 0.52029815 -0.60206262 -0.13843292 -0.18795693 1.000000000
#> 110m.hurdle 0.57988893 -0.50541009 -0.25161571 -0.28328909 0.547987756
#> Discus -0.22170757 0.19431009 0.61576810 0.36921834 -0.117879365
#> Pole.vault -0.08253683 0.20401411 0.06118185 -0.15618074 -0.079292469
#> Javeline -0.15774645 0.11975893 0.37495551 0.17188009 0.004232096
#> 1500m -0.06054645 -0.03368613 0.11580306 -0.04490252 0.408106432
#> 110m.hurdle Discus Pole.vault Javeline 1500m
#> 100m 0.579888931 -0.2217076 -0.082536834 -0.157746452 -0.06054645
#> Long.jump -0.505410086 0.1943101 0.204014112 0.119758933 -0.03368613
#> Shot.put -0.251615714 0.6157681 0.061181853 0.374955509 0.11580306
#> High.jump -0.283289090 0.3692183 -0.156180742 0.171880092 -0.04490252
#> 400m 0.547987756 -0.1178794 -0.079292469 0.004232096 0.40810643
#> 110m.hurdle 1.000000000 -0.3262010 -0.002703885 0.008743251 0.03754024
#> Discus -0.326200961 1.0000000 -0.150072400 0.157889799 0.25817510
#> Pole.vault -0.002703885 -0.1500724 1.000000000 -0.030000603 0.24744778
#> Javeline 0.008743251 0.1578898 -0.030000603 1.000000000 -0.18039313
#> 1500m 0.037540240 0.2581751 0.247447780 -0.180393128 1.00000000
There are some strong correlations between the different events in the decathlon competition.
# Reshape the correlation matrix into long format: one row per pair of
# events, with the row event in `rowname`, the column event in `name` and the
# correlation in `value`.
decathlon_cormat <- cor(decathlon_cont) %>%
  as.data.frame() %>%
  rownames_to_column() %>%
  pivot_longer(-rowname)

# Heatmap of the correlations: one tile per pair, labelled with the rounded
# coefficient, on a diverging red-white-green scale centred at 0.
decathlon_cormat %>%
  ggplot(aes(x = rowname, y = name, fill = value)) +
  geom_tile() +
  geom_text(aes(label = round(value, 2)), color = "white") +
  scale_fill_gradient2(
    low = "red",
    high = "darkgreen",
    mid = "white",
    midpoint = 0,
    # Spelled out in full; `limit` only worked through partial matching.
    limits = c(-1, 1),
    name = "pearson\nCorrelation"
  )
Penguins:
Decathlon: