import data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.2.0 ──
## ✔ broom        1.0.6     ✔ rsample      1.2.1
## ✔ dials        1.3.0     ✔ tune         1.2.1
## ✔ infer        1.0.7     ✔ workflows    1.1.4
## ✔ modeldata    1.4.0     ✔ workflowsets 1.1.0
## ✔ parsnip      1.2.1     ✔ yardstick    1.3.1
## ✔ recipes      1.1.0     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(correlationfunnel)
## ══ Using correlationfunnel? ════════════════════════════════════════════════════
## You might also be interested in applied data science training for business.
## </> Learn more at - www.business-science.io </>
library(tidytext)
library(usemodels)
library(textrecipes)

data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv')
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

clean data

skimr::skim(data)
Data summary
Name data
Number of rows 5021
Number of columns 28
_______________________
Column type frequency:
character 10
Date 1
numeric 17
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
observed 38 0.99 1 30374 0 4982 0
location_details 758 0.85 1 3876 0 4196 0
county 0 1.00 10 30 0 1037 0
state 0 1.00 4 14 0 49 0
season 0 1.00 4 7 0 5 0
title 976 0.81 23 235 0 4045 0
classification 0 1.00 7 7 0 3 0
geohash 976 0.81 10 10 0 4001 0
precip_type 3298 0.34 4 4 0 2 0
summary 1655 0.67 15 103 0 321 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 976 0.81 1869-11-10 2021-11-27 2003-11-16 3111

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
latitude 976 0.81 39.36 5.68 25.14 35.35 39.30 43.93 64.89 ▂▇▆▁▁
longitude 976 0.81 -97.42 16.73 -167.13 -117.06 -91.77 -83.07 -68.23 ▁▁▆▆▇
number 0 1.00 21520.23 19259.15 60.00 4595.00 15473.00 33979.00 71997.00 ▇▃▂▂▁
temperature_high 1683 0.66 67.12 17.78 -0.62 55.14 69.97 81.10 106.51 ▁▂▅▇▃
temperature_mid 1835 0.63 57.84 16.40 -8.46 46.77 59.36 70.38 94.03 ▁▁▆▇▃
temperature_low 1832 0.64 48.64 15.94 -22.78 37.50 49.40 60.66 84.34 ▁▁▅▇▃
dew_point 1648 0.67 46.23 16.44 -11.21 34.77 46.69 59.00 77.40 ▁▂▆▇▅
humidity 1648 0.67 0.71 0.16 0.08 0.62 0.73 0.82 1.00 ▁▁▃▇▅
cloud_cover 1937 0.61 0.44 0.33 0.00 0.12 0.40 0.73 1.00 ▇▅▃▃▅
moon_phase 1625 0.68 0.50 0.29 0.00 0.25 0.49 0.75 1.00 ▇▇▇▇▇
precip_intensity 2309 0.54 0.01 0.05 0.00 0.00 0.00 0.00 2.07 ▇▁▁▁▁
precip_probability 2311 0.54 0.30 0.42 0.00 0.00 0.00 0.73 1.00 ▇▁▁▁▃
pressure 2402 0.52 1017.08 6.14 980.34 1013.42 1016.96 1020.64 1042.41 ▁▁▇▆▁
uv_index 1629 0.68 5.16 3.14 0.00 3.00 5.00 8.00 13.00 ▆▇▅▆▁
visibility 1972 0.61 8.49 2.06 0.74 7.66 9.45 10.00 10.00 ▁▁▁▂▇
wind_bearing 1634 0.67 196.57 96.38 0.00 128.00 203.00 273.00 359.00 ▅▅▇▇▆
wind_speed 1632 0.67 3.87 3.28 0.00 1.34 2.93 5.56 23.94 ▇▃▁▁▁
data_clean <- data %>%
    select(-precip_type, -precip_intensity, -precip_probability) %>%
    na.omit() %>%
        
    select(-c(date)) %>% 
    
    # Remove a third rare level
    filter(classification != "Class C", !is.na(observed)) %>%

    select(-c(location_details, title, summary, observed, geohash)) %>%
    
    mutate(
    classification = case_when(
      classification == "Class A" ~ "sighting",
      classification == "Class B" ~ "possible"
    )
  )

explore data

data_clean %>% count(classification)
## # A tibble: 2 × 2
##   classification     n
##   <chr>          <int>
## 1 possible        1053
## 2 sighting        1019
data_clean %>%
    ggplot(aes(classification)) +
    geom_bar()

data_clean %>%
    ggplot(aes(classification, temperature_high)) +
    geom_boxplot()

correlation

# step 1
data_binarized <- data_clean %>%
    select(-number) %>%
    binarize()

data_binarized %>% glimpse()
## Rows: 2,072
## Columns: 97
## $ county__Jackson_County              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Jefferson_County            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__King_County                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Pierce_County               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Snohomish_County            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Washington_County           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `county__-OTHER`                    <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ state__Alabama                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Arkansas                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__California                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Colorado                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Florida                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Georgia                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Idaho                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Illinois                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Indiana                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Iowa                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Kansas                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Kentucky                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Michigan                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Missouri                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__New_Jersey                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ state__New_York                     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__North_Carolina               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ state__Ohio                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Oklahoma                     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Oregon                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Pennsylvania                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Tennessee                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Texas                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Virginia                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Washington                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__West_Virginia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Wisconsin                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `state__-OTHER`                     <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0…
## $ season__Fall                        <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season__Spring                      <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0…
## $ season__Summer                      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1…
## $ season__Unknown                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season__Winter                      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `latitude__-Inf_35.298325`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ latitude__35.298325_39.642495       <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ latitude__39.642495_43.46018        <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1…
## $ latitude__43.46018_Inf              <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0…
## $ `longitude__-Inf_-112.1051`         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ `longitude__-112.1051_-88.748825`   <dbl> 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ `longitude__-88.748825_-82.1174575` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ `longitude__-82.1174575_Inf`        <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1…
## $ classification__possible            <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1…
## $ classification__sighting            <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0…
## $ `temperature_high__-Inf_54.65`      <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ temperature_high__54.65_69.905      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ temperature_high__69.905_81.2625    <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0…
## $ temperature_high__81.2625_Inf       <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `temperature_mid__-Inf_46.7925`     <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ temperature_mid__46.7925_59.7775    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ temperature_mid__59.7775_70.86125   <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0…
## $ temperature_mid__70.86125_Inf       <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `temperature_low__-Inf_38.04`       <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0…
## $ temperature_low__38.04_49.94        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1…
## $ temperature_low__49.94_61.4425      <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ temperature_low__61.4425_Inf        <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ `dew_point__-Inf_35.5475`           <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0…
## $ dew_point__35.5475_47.51            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ dew_point__47.51_59.6225            <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dew_point__59.6225_Inf              <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ `humidity__-Inf_0.64`               <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0…
## $ humidity__0.64_0.74                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ humidity__0.74_0.82                 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0…
## $ humidity__0.82_Inf                  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ `cloud_cover__-Inf_0.13`            <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0…
## $ cloud_cover__0.13_0.41              <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0…
## $ cloud_cover__0.41_0.74              <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ cloud_cover__0.74_Inf               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `moon_phase__-Inf_0.25`             <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ moon_phase__0.25_0.51               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ moon_phase__0.51_0.75               <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1…
## $ moon_phase__0.75_Inf                <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `pressure__-Inf_1013.32`            <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ pressure__1013.32_1016.935          <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0…
## $ pressure__1016.935_1020.65          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pressure__1020.65_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
## $ `uv_index__-Inf_3`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
## $ uv_index__3_5                       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ uv_index__5_8                       <dbl> 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0…
## $ uv_index__8_Inf                     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0…
## $ `visibility__-Inf_7.63`             <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ visibility__7.63_9.4105             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1…
## $ visibility__9.4105_Inf              <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0…
## $ `wind_bearing__-Inf_127`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ wind_bearing__127_202               <dbl> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ wind_bearing__202_268               <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0…
## $ wind_bearing__268_Inf               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `wind_speed__-Inf_1.42`             <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0…
## $ wind_speed__1.42_2.97               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ wind_speed__2.97_5.4925             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ wind_speed__5.4925_Inf              <dbl> 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
data_binarized %>% glimpse()
## Rows: 2,072
## Columns: 97
## $ county__Jackson_County              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Jefferson_County            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__King_County                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Pierce_County               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Snohomish_County            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ county__Washington_County           <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `county__-OTHER`                    <dbl> 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ state__Alabama                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Arkansas                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__California                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Colorado                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Florida                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Georgia                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Idaho                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Illinois                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Indiana                      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Iowa                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Kansas                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Kentucky                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Michigan                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Missouri                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__New_Jersey                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ state__New_York                     <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__North_Carolina               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ state__Ohio                         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Oklahoma                     <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Oregon                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Pennsylvania                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Tennessee                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Texas                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Virginia                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Washington                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__West_Virginia                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ state__Wisconsin                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ `state__-OTHER`                     <dbl> 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0…
## $ season__Fall                        <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season__Spring                      <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0…
## $ season__Summer                      <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1…
## $ season__Unknown                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ season__Winter                      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ `latitude__-Inf_35.298325`          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ latitude__35.298325_39.642495       <dbl> 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ latitude__39.642495_43.46018        <dbl> 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1…
## $ latitude__43.46018_Inf              <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0…
## $ `longitude__-Inf_-112.1051`         <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0…
## $ `longitude__-112.1051_-88.748825`   <dbl> 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ `longitude__-88.748825_-82.1174575` <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0…
## $ `longitude__-82.1174575_Inf`        <dbl> 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1…
## $ classification__possible            <dbl> 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1…
## $ classification__sighting            <dbl> 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0…
## $ `temperature_high__-Inf_54.65`      <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ temperature_high__54.65_69.905      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ temperature_high__69.905_81.2625    <dbl> 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0…
## $ temperature_high__81.2625_Inf       <dbl> 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0…
## $ `temperature_mid__-Inf_46.7925`     <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1…
## $ temperature_mid__46.7925_59.7775    <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ temperature_mid__59.7775_70.86125   <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0…
## $ temperature_mid__70.86125_Inf       <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ `temperature_low__-Inf_38.04`       <dbl> 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0…
## $ temperature_low__38.04_49.94        <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1…
## $ temperature_low__49.94_61.4425      <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ temperature_low__61.4425_Inf        <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ `dew_point__-Inf_35.5475`           <dbl> 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0…
## $ dew_point__35.5475_47.51            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ dew_point__47.51_59.6225            <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0…
## $ dew_point__59.6225_Inf              <dbl> 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0…
## $ `humidity__-Inf_0.64`               <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0…
## $ humidity__0.64_0.74                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ humidity__0.74_0.82                 <dbl> 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0…
## $ humidity__0.82_Inf                  <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1…
## $ `cloud_cover__-Inf_0.13`            <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0…
## $ cloud_cover__0.13_0.41              <dbl> 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0…
## $ cloud_cover__0.41_0.74              <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ cloud_cover__0.74_Inf               <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ `moon_phase__-Inf_0.25`             <dbl> 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0…
## $ moon_phase__0.25_0.51               <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ moon_phase__0.51_0.75               <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1…
## $ moon_phase__0.75_Inf                <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `pressure__-Inf_1013.32`            <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0…
## $ pressure__1013.32_1016.935          <dbl> 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0…
## $ pressure__1016.935_1020.65          <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ pressure__1020.65_Inf               <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
## $ `uv_index__-Inf_3`                  <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1…
## $ uv_index__3_5                       <dbl> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ uv_index__5_8                       <dbl> 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0…
## $ uv_index__8_Inf                     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0…
## $ `visibility__-Inf_7.63`             <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0…
## $ visibility__7.63_9.4105             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1…
## $ visibility__9.4105_Inf              <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0…
## $ `wind_bearing__-Inf_127`            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ wind_bearing__127_202               <dbl> 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ wind_bearing__202_268               <dbl> 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0…
## $ wind_bearing__268_Inf               <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ `wind_speed__-Inf_1.42`             <dbl> 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0…
## $ wind_speed__1.42_2.97               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1…
## $ wind_speed__2.97_5.4925             <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ wind_speed__5.4925_Inf              <dbl> 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
 # step 2
data_correlation <- data_binarized %>%
    correlate(classification__sighting)

data_correlation
## # A tibble: 97 × 3
##    feature        bin                  correlation
##    <fct>          <chr>                      <dbl>
##  1 classification possible                 -1     
##  2 classification sighting                  1     
##  3 wind_speed     -Inf_1.42                -0.0917
##  4 longitude      -112.1051_-88.748825      0.0741
##  5 wind_speed     5.4925_Inf                0.0697
##  6 longitude      -Inf_-112.1051           -0.0686
##  7 state          California               -0.0677
##  8 wind_bearing   -Inf_127                  0.0640
##  9 state          Alabama                   0.0598
## 10 dew_point      35.5475_47.51            -0.0573
## # ℹ 87 more rows
# step 3
data_correlation %>%
    correlationfunnel::plot_correlation_funnel()
## Warning: ggrepel: 35 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

model building

split data

library(tidymodels)

# set.seed(1234)
# data_clean <- data_clean %>% sample_n(100)

data_split <- initial_split(data_clean)
data_train <- training(data_split)
data_test <- testing(data_split)

data_cv <- rsample::vfold_cv(data_train)
data_cv
## #  10-fold cross-validation 
## # A tibble: 10 × 2
##    splits             id    
##    <list>             <chr> 
##  1 <split [1398/156]> Fold01
##  2 <split [1398/156]> Fold02
##  3 <split [1398/156]> Fold03
##  4 <split [1398/156]> Fold04
##  5 <split [1399/155]> Fold05
##  6 <split [1399/155]> Fold06
##  7 <split [1399/155]> Fold07
##  8 <split [1399/155]> Fold08
##  9 <split [1399/155]> Fold09
## 10 <split [1399/155]> Fold10

preprocess data

xgboost_rec <- recipes::recipe(classification ~ ., data = data_train) %>%
    update_role(number, new_role = "ID") %>%
    step_dummy(all_nominal_predictors()) %>%
    step_YeoJohnson(longitude, number, humidity, visibility, wind_speed)%>%
    step_normalize(all_numeric_predictors()) %>%
    step_pca(all_numeric_predictors(), threshold = .75)
    
xgboost_rec %>% prep() %>% juice() %>% glimpse()
## Rows: 1,554
## Columns: 446
## $ number         <dbl> 120.75502, 130.40188, 168.73455, 119.62538, 166.84814, …
## $ classification <fct> sighting, possible, possible, possible, possible, sight…
## $ PC001          <dbl> -1.1782117, 1.0566478, 1.5112077, -0.6498679, -3.471479…
## $ PC002          <dbl> 0.8082876, 1.5821371, 3.5100633, 1.6672201, -0.4477700,…
## $ PC003          <dbl> -3.6281167059, 0.5795528362, 2.9809548300, -3.580495468…
## $ PC004          <dbl> 1.18872694, -2.19981078, -0.97198941, 0.16755085, 1.112…
## $ PC005          <dbl> 0.01157484, 1.61060114, -0.43412363, 0.32196992, 0.8551…
## $ PC006          <dbl> -1.03109289, -2.51496547, 1.87613206, -0.41714477, -0.2…
## $ PC007          <dbl> -1.01693175, 2.15319437, -2.07784129, -0.37038343, -1.4…
## $ PC008          <dbl> -0.49974316, -1.59737223, -1.87469536, -0.09309599, 0.0…
## $ PC009          <dbl> -0.6430447, 1.5724413, -2.6045766, -1.1686868, -1.00004…
## $ PC010          <dbl> -0.138181943, -2.095605882, 6.317381579, 0.237203283, 0…
## $ PC011          <dbl> -0.001920390, 1.336478296, -6.817327757, 0.197754796, -…
## $ PC012          <dbl> 0.059279809, 1.177824128, -0.534747159, 0.184446007, 0.…
## $ PC013          <dbl> -0.14566606, 0.31930765, 11.47716064, -0.01058192, -0.1…
## $ PC014          <dbl> 0.39041150, 1.54693786, 5.84997009, 0.45344582, 0.33257…
## $ PC015          <dbl> 0.054545166, 1.691180318, 7.852837503, -0.009946727, 0.…
## $ PC016          <dbl> 0.05533530, -0.04563165, -4.93355625, 0.06385475, 0.087…
## $ PC017          <dbl> 0.148604899, 2.209315558, 2.524084214, 0.104080032, 0.3…
## $ PC018          <dbl> -0.07505821, -0.61086027, -0.44929026, 0.11730644, -0.1…
## $ PC019          <dbl> -0.096856666, -1.215803333, 0.304577797, -0.141038919, …
## $ PC020          <dbl> -0.16952120, 0.86803524, 4.90473023, -0.25932710, -0.25…
## $ PC021          <dbl> -0.55042230, 2.03252544, -3.24424236, -0.46377890, -0.2…
## $ PC022          <dbl> 0.337347229, 1.317070225, -0.084181417, 0.042821756, 0.…
## $ PC023          <dbl> 0.19439007, -0.97953951, -0.40897602, 0.01131004, 0.052…
## $ PC024          <dbl> -0.17639723, -0.78983710, 2.79426522, -0.16291426, -0.2…
## $ PC025          <dbl> -0.152757233, -0.568505730, 2.146103906, -0.107360340, …
## $ PC026          <dbl> -0.26782860, -0.17478505, 0.94690125, -0.17523564, -0.3…
## $ PC027          <dbl> -0.153593645, 0.496621454, 0.813469990, -0.262807773, -…
## $ PC028          <dbl> -0.07203111, -0.04486998, 0.98827563, -0.08247323, -0.3…
## $ PC029          <dbl> 0.268790278, -0.021231484, -0.208806248, 0.187109569, 0…
## $ PC030          <dbl> -0.075093605, 0.203480457, -0.718575684, -0.307736621, …
## $ PC031          <dbl> -0.23044571, -0.49390697, -0.31440086, -0.29705591, -0.…
## $ PC032          <dbl> 0.02213031, -0.70668644, 0.44432751, 0.03696639, -0.062…
## $ PC033          <dbl> -0.02474626, 0.31193001, -0.23959437, 0.28898674, 0.293…
## $ PC034          <dbl> -0.100208653, 0.704149241, 0.663143908, -0.001475734, -…
## $ PC035          <dbl> -0.41074285, -0.54581523, -1.11260036, -0.35198738, -0.…
## $ PC036          <dbl> 0.135453034, -0.512279062, -0.600869983, 0.311168069, 0…
## $ PC037          <dbl> -0.37849044, -0.14009954, -0.02597591, -0.49673620, -0.…
## $ PC038          <dbl> 0.03123226, -0.07720410, 1.24256702, 0.10438820, -0.033…
## $ PC039          <dbl> -0.23210490, -0.05316535, -0.22110127, -0.20817566, -0.…
## $ PC040          <dbl> 0.06537634, 0.41928338, 1.41044550, 0.13518248, -0.0900…
## $ PC041          <dbl> 0.06016963, -0.32303040, 0.88699175, 0.08617660, 0.0343…
## $ PC042          <dbl> 0.02644978, 0.37071581, 0.39916604, -0.28369101, -0.112…
## $ PC043          <dbl> 0.04645658, 0.35191241, -0.74207216, 0.01655839, -0.196…
## $ PC044          <dbl> 0.549046740, 0.852448848, -0.971729823, 0.137505884, -0…
## $ PC045          <dbl> -0.4169176, -0.5643996, 1.5440835, 0.1378845, 0.1220122…
## $ PC046          <dbl> -0.06834800, 1.31384710, -2.41003606, -0.80649736, -0.3…
## $ PC047          <dbl> -0.52682312, -0.28350883, -0.63084035, -0.39053615, 0.1…
## $ PC048          <dbl> -0.134351335, -0.351269409, 0.146134169, -0.005121718, …
## $ PC049          <dbl> -0.65683842, -0.98542097, -0.14070297, -0.44072914, 0.6…
## $ PC050          <dbl> -0.2728304, -1.4260914, -0.3617300, -0.2627399, 1.11000…
## $ PC051          <dbl> -1.0354968, -1.2292462, -0.7254983, 1.4883345, -0.25034…
## $ PC052          <dbl> -0.111103681, 0.308774629, -0.346813686, -0.014828753, …
## $ PC053          <dbl> -0.26686599, -1.22002607, 0.72727723, 0.71566062, 0.403…
## $ PC054          <dbl> 1.0628427, -0.8532219, 0.2619691, 0.8379334, 0.2574674,…
## $ PC055          <dbl> 0.06471345, -0.51590266, 1.88834852, 0.36873213, 0.7402…
## $ PC056          <dbl> 0.67225272, 0.24716920, 2.02357037, 0.84621830, 0.71023…
## $ PC057          <dbl> 1.2052207, -0.1461956, -1.8396807, 0.2455131, 1.2021103…
## $ PC058          <dbl> 0.24819864, -1.92922719, -0.31507220, -0.48667926, 0.81…
## $ PC059          <dbl> 0.19516259, 0.48888258, 1.66508121, 1.27555637, 1.54516…
## $ PC060          <dbl> 0.08405527, -0.22398215, 0.40396747, -0.26056350, -0.59…
## $ PC061          <dbl> 0.117397852, -0.897939676, -0.774037466, -0.400782854, …
## $ PC062          <dbl> -1.16586833, -1.78845802, -0.33065844, 1.07998213, 0.53…
## $ PC063          <dbl> 6.04451714, -0.43977268, -0.11904071, -0.27872955, -0.7…
## $ PC064          <dbl> -1.22862791, -0.12709165, -0.20874562, 0.37053037, 0.53…
## $ PC065          <dbl> 0.99280788, 0.14182619, 0.30266791, -0.58542873, -1.328…
## $ PC066          <dbl> 0.42720435, 0.26946049, 0.35207384, 0.39117618, -0.9434…
## $ PC067          <dbl> 0.34827276, -0.61190699, 0.09392997, -0.80800600, -0.41…
## $ PC068          <dbl> 0.42553987, -0.09053114, -0.60730043, -0.91179904, 0.40…
## $ PC069          <dbl> 0.4047318462, -0.2586270754, -0.7716921008, -0.20337946…
## $ PC070          <dbl> -0.1369133846, -0.1984831679, 0.5775421120, -0.38260596…
## $ PC071          <dbl> -0.40166200, -0.34384276, 0.21664639, 1.18908084, 0.998…
## $ PC072          <dbl> 0.050282601, 0.027211763, -0.189591662, -0.501285391, 0…
## $ PC073          <dbl> 0.124839745, 0.111912530, -0.138403201, -0.391682511, -…
## $ PC074          <dbl> -0.14391287, -0.13216064, 0.27698682, -0.18000421, 0.33…
## $ PC075          <dbl> -0.151653461, 0.007305476, -0.124059397, 0.361553161, 0…
## $ PC076          <dbl> -0.24964179, 0.25773206, 0.12093798, 0.66010961, 0.5083…
## $ PC077          <dbl> 0.08098210, -0.15365750, -0.03955820, 0.05274046, -0.21…
## $ PC078          <dbl> 0.049065008, 0.695470867, 0.221261857, -1.075890881, 0.…
## $ PC079          <dbl> 0.033885189, 0.149551188, 0.110897534, -0.025573643, -0…
## $ PC080          <dbl> 0.112034417, -0.551456450, -0.307182372, 0.124825224, -…
## $ PC081          <dbl> -0.067034451, 0.289454251, 0.157355865, 0.109138224, 0.…
## $ PC082          <dbl> 0.118177999, 0.032752081, -0.025354542, -0.734545439, -…
## $ PC083          <dbl> -0.04006708, -0.22012711, 0.08961337, 0.57530104, 0.240…
## $ PC084          <dbl> 0.02648743, -0.32263845, 0.05220229, -0.34680517, 0.052…
## $ PC085          <dbl> 0.0388407749, -0.1302709324, 0.0177546031, 0.0153542147…
## $ PC086          <dbl> -0.012525757, 0.320409543, -0.075759382, -0.250017846, …
## $ PC087          <dbl> 0.1317017624, 0.4529150669, -0.0076706503, -0.642614006…
## $ PC088          <dbl> 0.015200231, -0.100585122, 0.097429313, 0.049479217, -0…
## $ PC089          <dbl> -0.028872740, -0.301709167, 0.132020788, 1.015555358, 0…
## $ PC090          <dbl> 0.009295313, -0.350925586, 0.022680899, 0.147365136, 0.…
## $ PC091          <dbl> 0.040118753, 0.200825630, -0.058020508, -0.248538224, -…
## $ PC092          <dbl> -0.001779097, -0.067928923, -0.129851368, 0.297352798, …
## $ PC093          <dbl> -0.007409048, -0.226581629, -0.087315769, 0.869338809, …
## $ PC094          <dbl> 0.007242279, -0.051553298, -0.258230224, 0.201152483, -…
## $ PC095          <dbl> -0.02625024, -0.36404860, 0.34302763, 0.54444689, 0.307…
## $ PC096          <dbl> -0.01738217, 0.38653756, 0.12486177, -0.05201923, 0.080…
## $ PC097          <dbl> -0.005209655, 0.067104801, -0.438352290, -0.375387189, …
## $ PC098          <dbl> -0.03657649, 0.17696969, -0.07145621, -0.62650808, -0.0…
## $ PC099          <dbl> 0.01808599, 0.24546316, 0.02176352, -0.64378796, -0.626…
## $ PC100          <dbl> 0.130157201, -0.050899665, -0.235267710, -0.479033770, …
## $ PC101          <dbl> -0.03587941, 0.16442513, -0.11355426, 0.65632862, 1.084…
## $ PC102          <dbl> -0.078350826, -0.055132159, -0.022587681, -0.034449372,…
## $ PC103          <dbl> 0.01162045, -0.05562929, 0.04068569, 0.64924900, -0.049…
## $ PC104          <dbl> -0.01583167, -0.56792854, -0.24534404, -0.09547160, 0.6…
## $ PC105          <dbl> -0.021644029, 0.247982442, 0.085005078, 0.884502665, 0.…
## $ PC106          <dbl> -0.003199351, 0.109736306, -0.117604240, -0.128746797, …
## $ PC107          <dbl> 0.133760477, 0.308324647, -0.152899611, -3.470741780, -…
## $ PC108          <dbl> -0.035012117, 0.056283261, 0.192374939, 1.518588442, 0.…
## $ PC109          <dbl> 0.0039121819, 0.0082291637, 0.1540239257, 0.5251662974,…
## $ PC110          <dbl> 0.050827937, 0.204311905, 0.089043022, -1.871429548, -0…
## $ PC111          <dbl> -0.064190533, -0.255198694, 0.023385873, 1.628614167, 1…
## $ PC112          <dbl> -0.049210245, -0.062326170, 0.146903519, 1.368847950, 0…
## $ PC113          <dbl> 0.063036968, -0.121741035, -0.198811646, -2.570081869, …
## $ PC114          <dbl> -0.011495415, 0.205206951, 0.060166129, 0.340580831, 0.…
## $ PC115          <dbl> 0.039470079, -1.219429792, -0.111477743, -2.360600167, …
## $ PC116          <dbl> -0.022952913, 0.553228565, 0.036406205, 1.534760032, 0.…
## $ PC117          <dbl> -0.0955013792, -0.2163897074, -0.0281346252, 4.43676916…
## $ PC118          <dbl> 0.02311319, 0.15181947, 0.60872900, 0.75677279, -0.0924…
## $ PC119          <dbl> -0.029494019, -1.214226995, -0.072989271, 3.118691587, …
## $ PC120          <dbl> 0.0420414280, -0.0446911558, -0.0433149295, -1.66397318…
## $ PC121          <dbl> -0.006656182, 0.639223858, -0.049948716, 0.949270934, -…
## $ PC122          <dbl> -0.001207122, -0.333641595, 0.039838444, 0.383587936, 0…
## $ PC123          <dbl> -0.065490823, 0.373516598, 0.296751364, 3.178601524, 1.…
## $ PC124          <dbl> -0.001512480, 0.005504913, -0.108405391, 2.581232449, 0…
## $ PC125          <dbl> -0.008848391, -0.035840393, -0.011342032, -1.321117947,…
## $ PC126          <dbl> -0.004846318, -0.103396820, -0.062813226, -1.719131305,…
## $ PC127          <dbl> 0.0005853877, 0.0623490105, 0.1218630653, 2.1461616468,…
## $ PC128          <dbl> 0.013509719, -0.410109546, -0.038259305, 1.473683828, -…
## $ PC129          <dbl> 0.006549437, -0.345862308, -0.272236775, 7.091225804, 0…
## $ PC130          <dbl> -0.0009589146, 0.1508325164, -0.2764207922, 4.615453071…
## $ PC131          <dbl> -0.0398724428, -0.1853821666, 0.3129548144, -3.97370237…
## $ PC132          <dbl> -0.020858626, 0.551380778, -0.130038049, 4.730653552, 0…
## $ PC133          <dbl> 0.002108776, 0.519235335, 0.439595180, 3.826704533, 0.4…
## $ PC134          <dbl> 0.0009653117, 0.2480305189, -0.3115996765, 2.3743981908…
## $ PC135          <dbl> 0.009409829, 0.054433361, -0.281622055, 4.964155107, -2…
## $ PC136          <dbl> -0.007652736, 0.237673553, -0.299525980, 2.088193832, -…
## $ PC137          <dbl> 0.005508404, 0.361214302, -0.128265803, 2.058158180, -1…
## $ PC138          <dbl> -0.009426309, 0.181035880, 0.248244000, -1.516047231, 0…
## $ PC139          <dbl> -0.017463124, -0.028054790, 0.518901030, -0.770369657, …
## $ PC140          <dbl> -0.002190932, -0.310087157, -0.339626056, 0.912604379, …
## $ PC141          <dbl> -0.01462934, 0.25410173, -0.07197596, -0.89466466, 0.39…
## $ PC142          <dbl> -0.027298291, 0.252470228, 0.147153684, 0.533977684, 0.…
## $ PC143          <dbl> -0.081576952, 0.183902459, 0.044971140, -2.951355387, 1…
## $ PC144          <dbl> 0.006091719, -0.590749429, -0.499292365, -0.257897284, …
## $ PC145          <dbl> 0.0225544259, -0.4730229277, -0.1484135386, 1.047488890…
## $ PC146          <dbl> 0.005606635, -0.171795438, -0.055920507, 0.157761773, 0…
## $ PC147          <dbl> -0.0026799993, 0.1943058733, 0.1810209036, -0.045250401…
## $ PC148          <dbl> 0.007395822, -0.510923454, -0.036782834, 0.231367707, 3…
## $ PC149          <dbl> 0.0013635622, 0.1024397925, -0.1582218131, 0.2313576313…
## $ PC150          <dbl> 0.009220915, -0.414749061, -0.157009147, 0.301948899, -…
## $ PC151          <dbl> -0.003151287, -0.094947530, 0.052249570, 0.228593476, 0…
## $ PC152          <dbl> 0.009719887, 0.130645464, -0.275134982, 0.500979995, 1.…
## $ PC153          <dbl> 0.001824192, -0.477127722, -0.489455209, -0.359261781, …
## $ PC154          <dbl> -5.260614e-03, 2.817254e-01, 7.734351e-03, 7.405531e-01…
## $ PC155          <dbl> 0.0001983975, 0.1800033870, -0.0528821546, -0.403765046…
## $ PC156          <dbl> 0.009994433, 0.296214626, -0.573868301, -0.266265429, -…
## $ PC157          <dbl> 0.003852707, -0.289961973, -0.077868979, 0.262789425, -…
## $ PC158          <dbl> 0.017786762, -0.126965440, -0.230078372, 0.001752409, 1…
## $ PC159          <dbl> 0.0005006848, -0.0200531931, 0.3899540605, -0.055225454…
## $ PC160          <dbl> -0.007710249, -1.096447470, 0.197089158, 0.044540516, -…
## $ PC161          <dbl> -0.011983893, -0.162160503, 0.208223809, -0.484638218, …
## $ PC162          <dbl> -0.007831744, 0.909810223, -0.072751336, 0.863896073, 2…
## $ PC163          <dbl> -0.0274344775, -0.7052843751, 0.2533509582, -0.47921053…
## $ PC164          <dbl> -0.013085808, 0.665660789, 0.379501263, -0.001444243, -…
## $ PC165          <dbl> 0.018942578, -0.764531176, -0.739943244, 0.441003389, 2…
## $ PC166          <dbl> -0.001081316, 0.003771023, 0.057557172, 0.034954457, -0…
## $ PC167          <dbl> -0.007329023, 0.034080832, 0.113702774, -0.178316400, -…
## $ PC168          <dbl> -0.002410476, 0.074295662, 0.182165854, -0.248054886, 0…
## $ PC169          <dbl> -0.03477926, 0.15620953, 0.40895070, -0.51041965, -2.25…
## $ PC170          <dbl> -0.010808093, 0.263164919, -0.326998732, 0.013181809, -…
## $ PC171          <dbl> 0.01990131, 0.04088698, 0.44598754, -0.02865767, 0.8782…
## $ PC172          <dbl> -0.017281772, 0.310144099, -0.173978170, 0.116274289, -…
## $ PC173          <dbl> 0.041144849, 0.473625327, -0.530825471, 0.976420278, 1.…
## $ PC174          <dbl> 0.011077095, 0.306594872, -0.372643247, 0.693078095, 0.…
## $ PC175          <dbl> -0.074062968, 0.016905126, -0.333793589, -1.007909667, …
## $ PC176          <dbl> 0.008957886, -0.065961751, -0.080226680, 0.222807071, -…
## $ PC177          <dbl> -4.801772e-13, -4.655201e-12, 1.219347e-11, -4.383818e-…
## $ PC178          <dbl> 1.035727e-12, 3.761708e-11, -9.444938e-11, 2.953033e-11…
## $ PC179          <dbl> 9.625988e-13, -8.318033e-13, -7.528708e-13, 1.129544e-1…
## $ PC180          <dbl> -2.301175e-12, -7.345445e-11, 7.448518e-11, -4.330480e-…
## $ PC181          <dbl> -6.733781e-13, 5.441513e-11, -1.566678e-11, -4.968741e-…
## $ PC182          <dbl> -8.944890e-13, -1.068780e-11, 1.930200e-11, -1.717601e-…
## $ PC183          <dbl> 7.692213e-13, 8.315710e-11, -7.691856e-11, 1.484040e-11…
## $ PC184          <dbl> 9.196187e-13, 2.158604e-11, -2.090122e-11, 1.967394e-11…
## $ PC185          <dbl> 7.601899e-13, 2.786817e-11, -2.375648e-11, 1.410366e-11…
## $ PC186          <dbl> 9.776386e-15, 2.183460e-11, -6.951951e-12, -6.313638e-1…
## $ PC187          <dbl> 1.293592e-12, 7.828355e-11, -1.076028e-10, 4.892272e-11…
## $ PC188          <dbl> 0.0027175944, 0.1870791070, -0.2389785919, 0.0351344558…
## $ PC189          <dbl> 0.0019125823, 0.0261021330, 0.1169397085, 0.0581437548,…
## $ PC190          <dbl> 0.0048336448, -0.1182450112, 0.0355958429, -0.019223837…
## $ PC191          <dbl> 0.001695906, -0.541163389, -0.008828921, -0.067997264, …
## $ PC192          <dbl> -0.0101185614, -0.1503661505, 0.2777378000, -0.11037297…
## $ PC193          <dbl> -0.002979742, 0.353402052, -0.043597706, -0.032093294, …
## $ PC194          <dbl> -0.0000887438, 0.0432363458, 0.6278115092, -0.311169616…
## $ PC195          <dbl> -0.0090285730, -0.2084534088, 0.2332646471, -0.04757576…
## $ PC196          <dbl> 3.902372e-03, -5.458551e-01, -1.062683e-01, 5.859725e-0…
## $ PC197          <dbl> -0.0089607603, -0.0312830541, -0.0194499899, 0.00903662…
## $ PC198          <dbl> -0.0127895824, 0.4459738370, 0.2098579442, 0.0101024583…
## $ PC199          <dbl> 0.00526072, 0.78236971, 0.47037185, 0.17021765, -0.0393…
## $ PC200          <dbl> -0.0027287736, -0.4196192935, -0.2343781449, -0.0756188…
## $ PC201          <dbl> -0.002127028, 0.377958054, -0.263277239, 0.126033711, 0…
## $ PC202          <dbl> -0.005535218, 0.165078482, -0.021186909, 0.210399241, -…
## $ PC203          <dbl> 0.006225173, -0.549708756, 0.063772085, 0.035068848, 0.…
## $ PC204          <dbl> 0.001642222, -1.365765431, 0.319724036, 0.201314384, -0…
## $ PC205          <dbl> -0.0013572095, -0.2558193438, -0.7242554065, 0.07202482…
## $ PC206          <dbl> -0.004796366, -0.326290936, 0.979916441, -0.253429787, …
## $ PC207          <dbl> -0.0028914411, 1.7494444691, 0.0418889639, -0.055597667…
## $ PC208          <dbl> 0.002866645, -2.193133737, -0.262203143, -0.205208638, …
## $ PC209          <dbl> 0.004833549, -0.367593046, -0.123938634, -0.009411983, …
## $ PC210          <dbl> 0.004048847, -0.792548753, -1.306476636, 0.240327114, 0…
## $ PC211          <dbl> -0.0026366485, 0.2780549540, 0.4800352970, 0.1547293760…
## $ PC212          <dbl> 0.003565533, 1.105326750, 0.476560895, 0.021565431, 0.3…
## $ PC213          <dbl> -0.000265590, 0.860996290, 1.003031315, 0.157166969, 0.…
## $ PC214          <dbl> 0.015608339, 0.495158285, 0.250657616, 0.222402364, -0.…
## $ PC215          <dbl> 0.0006436328, -1.2889387924, 0.2191370028, -0.054355925…
## $ PC216          <dbl> 0.002037527, -0.265668045, -0.367671911, 0.010457920, -…
## $ PC217          <dbl> -0.003183836, -1.203520333, -0.507041486, 0.184697077, …
## $ PC218          <dbl> 0.004219977, 0.385071337, 0.069200103, -0.233889101, 0.…
## $ PC219          <dbl> -0.006588963, -1.864520340, -0.558681233, -0.113901226,…
## $ PC220          <dbl> -0.001265919, 0.834893396, -0.288799668, 0.137079808, 0…
## $ PC221          <dbl> 0.003297695, 0.553070015, 0.126517162, 0.158385852, -0.…
## $ PC222          <dbl> 0.0001297895, 0.0810176988, 0.3187879600, -0.0662039819…
## $ PC223          <dbl> -0.001411781, 1.723254626, -0.394759672, 0.255547510, 0…
## $ PC224          <dbl> -0.0031022591, -1.6581835755, -1.2196189053, 0.04237754…
## $ PC225          <dbl> 0.005234873, 0.489186258, -0.948963915, 0.392040536, 0.…
## $ PC226          <dbl> 0.0002689848, -0.4395683805, -0.7497862738, -0.15589689…
## $ PC227          <dbl> -0.002204202, -0.193503938, -0.508344834, -0.180462920,…
## $ PC228          <dbl> 0.003791765, -0.121848422, -0.531398794, 0.008484718, 0…
## $ PC229          <dbl> 0.009629828, -1.090445394, -0.350627039, -0.049937679, …
## $ PC230          <dbl> -0.006596595, 0.746806494, 0.397872821, -0.092868251, 0…
## $ PC231          <dbl> -0.0039564714, 0.8923443910, 0.9008615456, -0.002412030…
## $ PC232          <dbl> -0.0099936785, -1.3188544135, 0.1358971380, -0.09639014…
## $ PC233          <dbl> 0.010834297, -0.156552250, -1.657588033, 0.184780559, 0…
## $ PC234          <dbl> 4.544332e-02, -2.596731e+00, 9.307128e-01, 4.153693e-01…
## $ PC235          <dbl> -5.097396e-13, -3.308216e+00, 1.189449e+00, -2.734266e-…
## $ PC236          <dbl> -2.134543e-13, 3.418837e-01, -1.092200e+00, -4.653813e-…
## $ PC237          <dbl> -2.495728e-13, -4.358125e+00, 7.956980e-01, 2.477197e-1…
## $ PC238          <dbl> -2.475567e-15, -1.199187e+00, 2.017858e-01, -1.217328e-…
## $ PC239          <dbl> 1.236213e-13, -4.363646e+00, -1.212075e+00, 7.771784e-1…
## $ PC240          <dbl> 2.701972e-13, 2.713578e+00, -2.457444e+00, 4.329946e-12…
## $ PC241          <dbl> 7.102746e-13, 1.812087e+00, -7.120476e-01, 4.280854e-13…
## $ PC242          <dbl> -1.580948e-13, 4.504101e-01, -1.654994e+00, 1.433930e-1…
## $ PC243          <dbl> 4.064722e-13, -1.262279e+00, -1.092984e+00, 3.875732e-1…
## $ PC244          <dbl> -3.526041e-13, 8.994788e-01, 9.779800e-01, -8.268150e-1…
## $ PC245          <dbl> 1.243730e-13, -1.756316e-01, 1.432743e+00, 8.312477e-14…
## $ PC246          <dbl> -9.043248e-14, -3.541526e+00, 9.984431e-01, 3.926498e-1…
## $ PC247          <dbl> 2.127947e-14, 5.399782e+00, -6.769646e-01, -1.867612e-1…
## $ PC248          <dbl> 4.420261e-13, 4.448277e+00, 1.140487e+00, 4.998897e-12,…
## $ PC249          <dbl> -1.743935e-13, 2.562737e+00, -3.715780e-01, -2.269978e-…
## $ PC250          <dbl> -6.774601e-15, 5.372262e-01, 3.221396e+00, 8.369012e-13…
## $ PC251          <dbl> 8.733488e-14, 9.644833e-01, -3.282847e-01, -1.105628e-1…
## $ PC252          <dbl> 2.094744e-13, 2.291012e+00, -1.242089e+00, -1.266296e-1…
## $ PC253          <dbl> -7.772568e-14, 3.745651e+00, -4.215903e+00, 8.664184e-1…
## $ PC254          <dbl> -4.003649e-13, 2.698464e+00, -1.806444e+00, -2.658274e-…
## $ PC255          <dbl> -6.408471e-13, -9.329335e-01, -2.683803e+00, -3.400751e…
## $ PC256          <dbl> 3.183630e-13, -3.827894e-01, 2.207179e+00, 1.013435e-12…
## $ PC257          <dbl> 5.487286e-13, 1.157215e+00, 2.316718e+00, 2.941807e-12,…
## $ PC258          <dbl> -1.824274e-13, 1.635076e+00, 8.438208e-01, 5.720420e-12…
## $ PC259          <dbl> -4.601193e-13, -1.159458e+00, 2.870641e-01, -3.447091e-…
## $ PC260          <dbl> 3.042788e-13, 1.781702e+00, -2.721614e+00, -1.052039e-1…
## $ PC261          <dbl> 2.351293e-13, -2.384032e+00, -5.442862e+00, -4.111142e-…
## $ PC262          <dbl> -1.718365e-14, 2.571735e+00, -1.287015e+00, 3.662094e-1…
## $ PC263          <dbl> 2.977985e-14, 2.021474e+00, 5.376115e-01, 5.884688e-14,…
## $ PC264          <dbl> 1.738789e-13, -2.925011e+00, -3.314277e+00, 2.320374e-1…
## $ PC265          <dbl> 1.088463e-13, 2.920760e+00, -1.210785e+00, -2.250419e-1…
## $ PC266          <dbl> 5.725221e-13, 1.150488e+00, 8.609672e-01, 7.396702e-12,…
## $ PC267          <dbl> -4.566323e-13, 3.084175e+00, 1.054608e+00, -2.720738e-1…
## $ PC268          <dbl> 4.951431e-13, 2.722297e+00, 2.379938e+00, 3.712514e-12,…
## $ PC269          <dbl> 9.017731e-14, -7.398670e+00, -1.054852e+00, -1.424526e-…
## $ PC270          <dbl> -3.541477e-13, 6.010025e-01, -3.939164e-01, -3.679252e-…
## $ PC271          <dbl> 2.667421e-13, 1.827106e+00, -3.324207e+00, 7.451580e-13…
## $ PC272          <dbl> -2.308794e-13, -5.546391e+00, 2.371860e+00, -6.967429e-…
## $ PC273          <dbl> -2.955564e-13, -5.504672e+00, -3.353835e-01, -5.378438e…
## $ PC274          <dbl> 2.068973e-13, 1.046736e+00, -1.129663e+00, -5.376053e-1…
## $ PC275          <dbl> -5.006505e-13, -6.083612e+00, -1.488286e+00, -3.086006e…
## $ PC276          <dbl> -4.997061e-13, -6.718384e-01, 1.520541e+00, -6.543628e-…
## $ PC277          <dbl> 3.394243e-13, 3.292575e+00, -8.931513e-01, 1.217444e-12…
## $ PC278          <dbl> -3.645819e-14, -1.344339e+00, -4.024266e+00, -1.167151e…
## $ PC279          <dbl> 2.632300e-13, -1.081679e+00, 2.294405e+00, 1.911863e-12…
## $ PC280          <dbl> 2.034108e-14, 5.175591e+00, -4.730313e+00, -7.450328e-1…
## $ PC281          <dbl> -5.192295e-13, 2.753953e+00, 2.022668e+00, -4.891621e-1…
## $ PC282          <dbl> 1.943768e-13, -1.313238e+00, 1.395188e+00, 2.651543e-12…
## $ PC283          <dbl> 4.619602e-13, 2.075345e+00, 2.817176e+00, 1.417751e-12,…
## $ PC284          <dbl> 3.853956e-13, 6.636591e+00, -2.308326e+00, -6.659703e-1…
## $ PC285          <dbl> 2.109655e-13, -2.083343e-01, -2.508499e-01, -8.300424e-…
## $ PC286          <dbl> 4.230721e-14, 3.449020e+00, 8.533005e-01, -2.553658e-12…
## $ PC287          <dbl> -3.276824e-13, -3.251590e+00, 1.151648e+00, -2.017275e-…
## $ PC288          <dbl> -2.136375e-13, -4.463394e+00, -2.525710e+00, 2.135694e-…
## $ PC289          <dbl> 1.118237e-13, 1.101519e+00, -2.260978e+00, 1.734963e-12…
## $ PC290          <dbl> -2.000904e-14, -4.702403e+00, 1.692531e+00, 3.099149e-1…
## $ PC291          <dbl> -3.695807e-14, -8.039702e-01, 3.037284e-01, 4.362467e-1…
## $ PC292          <dbl> -3.861666e-14, -2.324691e+00, 2.553326e-01, -3.511306e-…
## $ PC293          <dbl> 1.254711e-13, -1.446749e-01, 5.810007e-01, 2.898260e-12…
## $ PC294          <dbl> 4.650764e-14, 2.004190e+00, -1.375500e+00, 2.996977e-12…
## $ PC295          <dbl> 4.429323e-13, -2.886743e+00, 9.719293e-01, 7.873766e-12…
## $ PC296          <dbl> -1.141885e-14, -4.107959e-01, -1.547576e+00, -4.400725e…
## $ PC297          <dbl> 2.200227e-13, -1.225073e+00, -1.267860e+00, 5.296767e-1…
## $ PC298          <dbl> -4.619274e-13, -5.361683e-01, 7.045472e-01, -8.033041e-…
## $ PC299          <dbl> 7.868535e-14, -7.928860e-01, -8.352721e-01, 4.670518e-1…
## $ PC300          <dbl> 3.162477e-13, 2.614843e-01, -4.985688e+00, 8.809558e-12…
## $ PC301          <dbl> 8.797869e-13, -1.068521e+00, -1.513437e+00, 6.128644e-1…
## $ PC302          <dbl> -2.749162e-14, 1.511249e-01, -1.900435e+00, -4.726299e-…
## $ PC303          <dbl> 5.251196e-13, -9.878842e-01, 4.079732e-02, 6.202328e-12…
## $ PC304          <dbl> -2.493274e-13, 4.074596e-01, 2.219608e+00, 3.074444e-12…
## $ PC305          <dbl> -2.752048e-03, -1.098190e+00, 6.887316e-01, 7.882089e-0…
## $ PC306          <dbl> -0.003539233, -2.098233402, -3.882046746, 0.022487714, …
## $ PC307          <dbl> -0.001827828, 3.784935379, 1.822634345, -0.041169414, -…
## $ PC308          <dbl> -0.0035419401, 3.5322109027, 0.9748611423, -0.158461089…
## $ PC309          <dbl> -0.0032030420, 2.0152389790, -0.5156415696, 0.113071152…
## $ PC310          <dbl> 0.005492634, 2.659002893, 2.530468925, 0.047746302, 0.0…
## $ PC311          <dbl> -0.001160866, -1.531758135, -3.418987665, -0.029453680,…
## $ PC312          <dbl> -0.006591333, -1.026135686, 0.546856830, 0.078239329, -…
## $ PC313          <dbl> -0.0005555683, -0.9534210888, -1.9786866349, -0.1665768…
## $ PC314          <dbl> 0.003490666, -0.676163106, 3.975044210, -0.035803803, -…
## $ PC315          <dbl> -0.0002580851, -2.3682427644, 1.5666406819, 0.002703384…
## $ PC316          <dbl> -0.0031456797, -0.6956393334, 0.6089733766, 0.009485282…
## $ PC317          <dbl> 1.010728e-02, 2.918643e+00, -1.139730e+00, -1.739954e-0…
## $ PC318          <dbl> 0.0136925748, 3.6196123289, 2.2452346605, 0.1230945291,…
## $ PC319          <dbl> 7.805497e-03, 2.197652e-01, -5.229190e-01, -4.956363e-0…
## $ PC320          <dbl> -6.681769e-05, -4.987003e-02, 5.912350e-01, 3.592559e-0…
## $ PC321          <dbl> -0.008849971, -1.594034638, -0.560857300, 0.103771029, …
## $ PC322          <dbl> 0.001073815, 1.782688346, 1.763470890, 0.042893626, -0.…
## $ PC323          <dbl> 0.004961900, 0.693097857, 1.322107033, -0.105125162, -0…
## $ PC324          <dbl> -0.008332585, -0.924604815, 0.035539458, -0.011160264, …
## $ PC325          <dbl> -0.0006209019, -1.2022431598, -0.9517389050, -0.0384782…
## $ PC326          <dbl> 0.005401590, 0.620433856, -0.618935812, -0.169672325, 0…
## $ PC327          <dbl> -0.0002318278, -0.7147232816, -0.0617406243, -0.0310831…
## $ PC328          <dbl> 0.0047634409, 1.5960195646, 1.6200357268, 0.0500769244,…
## $ PC329          <dbl> -0.0006632301, -0.7489418660, -2.1613597618, 0.03876811…
## $ PC330          <dbl> -0.0017860342, -0.0466816003, -1.0104508734, -0.0053120…
## $ PC331          <dbl> 0.001773990, -0.930440124, -1.447451792, -0.076891389, …
## $ PC332          <dbl> -0.005799567, -0.200600894, -0.783028034, -0.020112772,…
## $ PC333          <dbl> 0.0064600280, 0.7502750203, 0.9155327420, -0.0525169058…
## $ PC334          <dbl> -0.002157544, -0.415803957, -0.926700672, -0.031890267,…
## $ PC335          <dbl> -0.0008388314, 0.9492232389, 1.1839559924, -0.025439799…
## $ PC336          <dbl> -0.005691695, 0.714068835, 1.794194342, 0.168454490, -0…
## $ PC337          <dbl> 0.0037784456, -0.0899590406, -1.1350093128, -0.03856113…
## $ PC338          <dbl> -0.0006788045, 0.7083304418, 1.0474260650, 0.0687635470…
## $ PC339          <dbl> 0.0085777481, -0.0306833375, 5.8598633983, -0.028446321…
## $ PC340          <dbl> -0.007022861, 0.781735022, 2.333777026, -0.043889935, -…
## $ PC341          <dbl> -0.006835553, -1.730152799, 8.793915946, 0.001261258, 0…
## $ PC342          <dbl> 0.001484162, 1.327467871, 4.769473924, 0.011748773, 0.0…
## $ PC343          <dbl> -0.0006034311, 0.6190231417, -0.3461948187, 0.029709760…
## $ PC344          <dbl> -0.005195972, -0.856928759, -1.129122062, 0.004486849, …
## $ PC345          <dbl> 0.003265117, -2.159266713, -0.011422420, 0.006746204, 0…
## $ PC346          <dbl> 0.0026329011, -0.6160895880, -0.9471170527, -0.00948431…
## $ PC347          <dbl> 0.007916581, -1.295798726, 0.515576259, 0.056884071, 0.…
## $ PC348          <dbl> 0.002410618, 2.381510961, -0.850818927, -0.082057559, 0…
## $ PC349          <dbl> -7.212014e-03, 2.851473e-01, 2.771604e-01, 3.196899e-02…
## $ PC350          <dbl> -0.003575917, 0.466907172, 0.049324437, 0.009682000, -0…
## $ PC351          <dbl> -0.002977418, -0.072128445, -0.216280609, 0.029431768, …
## $ PC352          <dbl> -2.296895e-03, 2.895660e-01, -4.411920e-02, -3.338306e-…
## $ PC353          <dbl> 0.0009787517, 0.1576487503, -0.3391854905, -0.017136062…
## $ PC354          <dbl> -0.0006376691, -0.0037994360, -0.1879111539, 0.01433350…
## $ PC355          <dbl> 0.0006020932, -0.1712257896, 0.1230339855, 0.0467088114…
## $ PC356          <dbl> -5.743524e-05, -8.285974e-02, 8.402902e-02, 1.030330e-0…
## $ PC357          <dbl> 0.0049555655, 0.1177691393, -0.0322324629, 0.0068472419…
## $ PC358          <dbl> 6.386107e-05, 8.528096e-02, 2.308962e-01, 1.337644e-02,…
## $ PC359          <dbl> -4.353760e-02, -4.744780e-03, -2.997123e-02, -3.345015e…
## $ PC360          <dbl> -3.016789e-13, 3.388878e-12, -9.351191e-12, -9.420365e-…
## $ PC361          <dbl> -4.695421e-13, 2.085462e-11, 3.441747e-13, -3.800931e-1…
## $ PC362          <dbl> 5.510182e-13, -1.195956e-11, -9.542551e-12, 6.950169e-1…
## $ PC363          <dbl> 1.098691e-13, -3.574859e-12, -6.916146e-12, 1.567590e-1…
## $ PC364          <dbl> -5.521494e-13, 1.307293e-11, 2.128134e-11, -2.881991e-1…
## $ PC365          <dbl> 8.227010e-13, 2.975602e-11, 1.557617e-11, 6.094720e-12,…
## $ PC366          <dbl> -4.684530e-13, -5.511271e-12, -2.120127e-11, -2.218969e…
## $ PC367          <dbl> -4.714613e-13, -1.856394e-11, -1.074940e-11, -4.463169e…
## $ PC368          <dbl> 1.658082e-12, 2.294873e-11, 2.682899e-12, 8.957725e-12,…
## $ PC369          <dbl> 3.370127e-13, -7.465780e-13, -1.957091e-11, 2.742106e-1…
## $ PC370          <dbl> 5.024913e-13, 6.085573e-12, -5.232142e-12, 2.251689e-12…
## $ PC371          <dbl> 3.755635e-13, 8.588905e-12, 1.775505e-13, 4.508681e-12,…
## $ PC372          <dbl> -4.870356e-14, 5.098481e-12, 3.699921e-11, -5.833349e-1…
## $ PC373          <dbl> 1.347765e-13, -5.259675e-12, -1.049113e-11, 1.624689e-1…
## $ PC374          <dbl> 2.015205e-13, -7.931529e-13, 1.873245e-12, 4.838208e-13…
## $ PC375          <dbl> 8.373806e-13, 8.599259e-12, 1.855729e-11, 5.512483e-12,…
## $ PC376          <dbl> -2.993702e-13, 6.209966e-12, -1.236690e-11, -3.207131e-…
## $ PC377          <dbl> 6.262152e-14, -4.448341e-11, -6.203892e-11, 9.991606e-1…
## $ PC378          <dbl> 5.055672e-15, -1.488483e-11, 1.127107e-11, -1.819656e-1…
## $ PC379          <dbl> 7.304455e-14, 1.202862e-11, -2.122429e-11, -7.561394e-1…
## $ PC380          <dbl> -6.223601e-14, -1.666430e-11, -1.558682e-11, -1.804311e…
## $ PC381          <dbl> 2.863111e-13, 2.333011e-12, 2.690000e-11, 4.097401e-13,…
## $ PC382          <dbl> 7.805000e-14, -1.363604e-11, -5.358467e-12, 3.842669e-1…
## $ PC383          <dbl> 1.290817e-13, 5.313745e-12, 2.617461e-12, 8.729034e-13,…
## $ PC384          <dbl> 1.209060e-13, -1.414361e-11, 1.583384e-12, 5.197519e-13…
## $ PC385          <dbl> -4.065136e-15, -3.048828e-12, -2.314881e-11, -6.446881e…
## $ PC386          <dbl> 3.027766e-14, 2.105315e-12, -1.292603e-11, -9.139289e-1…
## $ PC387          <dbl> -9.198047e-14, 1.595923e-11, -5.182389e-12, 2.357712e-1…
## $ PC388          <dbl> -5.387720e-13, -1.448850e-11, -8.990924e-12, -2.838096e…
## $ PC389          <dbl> 2.738009e-13, 1.351633e-13, 2.910701e-12, 3.343689e-12,…
## $ PC390          <dbl> -2.324179e-13, 6.857801e-12, -1.479998e-12, -9.230356e-…
## $ PC391          <dbl> 2.889298e-13, -5.050689e-13, -7.846528e-12, 2.637003e-1…
## $ PC392          <dbl> -3.876843e-13, -7.836867e-12, 5.035355e-12, -5.686987e-…
## $ PC393          <dbl> 6.910310e-14, -6.254076e-11, 2.054020e-11, -4.464290e-1…
## $ PC394          <dbl> -4.321226e-14, 9.067120e-12, -7.160536e-12, -1.080528e-…
## $ PC395          <dbl> -3.154492e-13, 2.429443e-12, -4.029608e-12, 6.245524e-1…
## $ PC396          <dbl> -2.968354e-13, 1.376641e-11, 2.731950e-12, -1.143731e-1…
## $ PC397          <dbl> 4.887632e-14, 5.757585e-13, 1.566592e-11, -1.411584e-12…
## $ PC398          <dbl> 4.032365e-13, 1.586060e-11, -1.199829e-12, 3.712513e-12…
## $ PC399          <dbl> -2.621817e-13, -2.287813e-14, 8.994870e-12, -1.417837e-…
## $ PC400          <dbl> -1.848229e-13, 7.795712e-12, 8.914404e-13, -1.579756e-1…
## $ PC401          <dbl> 3.319975e-13, 9.721885e-14, 9.525108e-12, 4.232176e-12,…
## $ PC402          <dbl> -6.643963e-14, 9.137490e-13, 1.437141e-11, 6.108884e-13…
## $ PC403          <dbl> -7.241014e-14, -7.122198e-12, 8.390923e-12, 1.917778e-1…
## $ PC404          <dbl> -2.076459e-13, -1.326651e-11, -1.890053e-11, -1.353809e…
## $ PC405          <dbl> 9.984112e-14, 1.076164e-11, 1.095611e-11, -5.456981e-13…
## $ PC406          <dbl> -2.040290e-13, -1.127584e-12, -2.669729e-11, -2.361269e…
## $ PC407          <dbl> -2.885431e-14, -3.865085e-12, 9.131689e-12, 6.653126e-1…
## $ PC408          <dbl> -1.030434e-13, -8.496198e-12, 9.502583e-12, -2.431212e-…
## $ PC409          <dbl> 4.936651e-13, -1.982776e-11, 3.646660e-12, 3.547823e-12…
## $ PC410          <dbl> -2.494358e-13, 3.740589e-11, -2.500964e-11, -1.658755e-…
## $ PC411          <dbl> 5.782808e-13, -4.954075e-12, 8.309358e-12, 5.498782e-12…
## $ PC412          <dbl> 6.529677e-13, 1.026938e-11, -2.132102e-11, 5.152598e-12…
## $ PC413          <dbl> -9.319094e-15, -1.021062e-11, -1.036303e-11, 2.637021e-…
## $ PC414          <dbl> 1.074379e-13, 2.009443e-12, -2.737501e-12, 4.257007e-13…
## $ PC415          <dbl> 2.456300e-13, -8.197998e-12, 1.046542e-11, 2.314253e-12…
## $ PC416          <dbl> 1.035026e-13, -1.219996e-11, -2.170164e-12, 1.818256e-1…
## $ PC417          <dbl> 2.030742e-14, -8.001932e-13, -2.502575e-12, 3.495371e-1…
## $ PC418          <dbl> -3.187960e-14, -6.870018e-12, -9.531009e-12, -2.678159e…
## $ PC419          <dbl> -2.100020e-13, 7.667999e-13, 9.093606e-12, -2.022597e-1…
## $ PC420          <dbl> -2.176478e-14, 1.014556e-12, 5.184245e-12, -3.684389e-1…
## $ PC421          <dbl> -1.629617e-13, 9.193772e-12, 6.886857e-12, -2.346597e-1…
## $ PC422          <dbl> -1.535061e-13, -2.754833e-14, 3.076922e-12, -2.085259e-…
## $ PC423          <dbl> -2.647448e-13, -2.166009e-12, 4.148851e-12, -2.178127e-…
## $ PC424          <dbl> -1.645318e-13, -1.043314e-11, 2.158546e-12, -2.138526e-…
## $ PC425          <dbl> 1.495654e-13, -3.044092e-12, -6.371723e-12, 6.768693e-1…
## $ PC426          <dbl> -2.635363e-14, -5.126138e-12, -4.661435e-12, -2.708750e…
## $ PC427          <dbl> -2.381190e-13, 9.649910e-12, 5.203765e-13, -1.781100e-1…
## $ PC428          <dbl> 9.786843e-14, 2.941582e-11, -3.961682e-12, 2.110994e-12…
## $ PC429          <dbl> 5.208849e-13, 9.709235e-14, 1.535821e-12, 4.378511e-12,…
## $ PC430          <dbl> 3.362871e-13, -1.647942e-12, -5.066989e-12, 1.665037e-1…
## $ PC431          <dbl> 2.191638e-13, 8.252730e-12, 1.125049e-11, 2.042806e-12,…
## $ PC432          <dbl> 1.077374e-13, 1.449400e-11, 1.457628e-11, 1.246361e-12,…
## $ PC433          <dbl> -1.429901e-13, 3.113811e-12, -1.410831e-11, -1.120820e-…
## $ PC434          <dbl> 5.683753e-13, 4.616167e-12, -4.892800e-12, 4.439452e-12…
## $ PC435          <dbl> 2.892847e-13, 2.562475e-13, -8.129247e-12, 1.958642e-12…
## $ PC436          <dbl> -2.337293e-13, -7.950577e-12, -1.679863e-11, -2.488724e…
## $ PC437          <dbl> 9.552409e-14, 2.263903e-12, 8.785037e-12, 1.575479e-12,…
## $ PC438          <dbl> 6.859279e-14, 3.525266e-12, 1.282669e-12, 2.693896e-12,…
## $ PC439          <dbl> -2.295937e-13, -1.015567e-11, 5.735318e-12, -1.804341e-…
## $ PC440          <dbl> -1.067646e-13, -6.107059e-12, -4.222259e-12, -1.797544e…
## $ PC441          <dbl> -1.717900e-13, 1.256920e-11, -7.204855e-12, -5.079482e-…
## $ PC442          <dbl> 2.287689e-13, 3.267005e-12, 9.140455e-12, 1.940192e-12,…
## $ PC443          <dbl> 1.113840e-13, 2.931141e-12, 3.455070e-12, -7.414893e-13…
## $ PC444          <dbl> 2.654062e-13, 1.229516e-11, 6.206001e-14, 1.674765e-12,…

specify model

library(usemodels)
usemodels::use_xgboost(classification ~ ., data = data_train)
## xgboost_recipe <- 
##   recipe(formula = classification ~ ., data = data_train) %>% 
##   step_zv(all_predictors()) 
## 
## xgboost_spec <- 
##   boost_tree(trees = tune(), min_n = tune(), tree_depth = tune(), learn_rate = tune(), 
##     loss_reduction = tune(), sample_size = tune()) %>% 
##   set_mode("classification") %>% 
##   set_engine("xgboost") 
## 
## xgboost_workflow <- 
##   workflow() %>% 
##   add_recipe(xgboost_recipe) %>% 
##   add_model(xgboost_spec) 
## 
## set.seed(66780)
## xgboost_tune <-
##   tune_grid(xgboost_workflow, resamples = stop("add your rsample object"), grid = stop("add number of candidate points"))
xgboost_spec <- 
    boost_tree(trees = tune(), tree_depth = tune()) %>%
    set_mode("classification") %>%
    set_engine("xgboost")

xgboost_workflow <-
    workflow() %>%
    add_recipe(xgboost_rec) %>%
    add_model(xgboost_spec)

tune hyper parameters

library(doParallel)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loading required package: iterators
## Loading required package: parallel
library(foreach)

tree_grid <- grid_regular(trees(), tree_depth(), levels = 10)

doParallel::registerDoParallel()

set.seed(2242)
xgboost_tune <-
    tune_grid(xgboost_workflow, resamples = data_cv, grid = 5,
              control = control_grid(save_pred = TRUE))

identify optimal values for hyperparameters

collect_metrics(xgboost_tune)
## # A tibble: 15 × 8
##    trees tree_depth .metric     .estimator  mean     n std_err .config          
##    <int>      <int> <chr>       <chr>      <dbl> <int>   <dbl> <chr>            
##  1  1812          3 accuracy    binary     0.508    10 0.0111  Preprocessor1_Mo…
##  2  1812          3 brier_class binary     0.410    10 0.00743 Preprocessor1_Mo…
##  3  1812          3 roc_auc     binary     0.511    10 0.0111  Preprocessor1_Mo…
##  4   795          4 accuracy    binary     0.496    10 0.00951 Preprocessor1_Mo…
##  5   795          4 brier_class binary     0.408    10 0.00935 Preprocessor1_Mo…
##  6   795          4 roc_auc     binary     0.508    10 0.0117  Preprocessor1_Mo…
##  7  1435          7 accuracy    binary     0.498    10 0.0130  Preprocessor1_Mo…
##  8  1435          7 brier_class binary     0.395    10 0.00912 Preprocessor1_Mo…
##  9  1435          7 roc_auc     binary     0.506    10 0.0104  Preprocessor1_Mo…
## 10    53         10 accuracy    binary     0.491    10 0.0126  Preprocessor1_Mo…
## 11    53         10 brier_class binary     0.362    10 0.00914 Preprocessor1_Mo…
## 12    53         10 roc_auc     binary     0.498    10 0.0143  Preprocessor1_Mo…
## 13   878         13 accuracy    binary     0.487    10 0.0109  Preprocessor1_Mo…
## 14   878         13 brier_class binary     0.404    10 0.00809 Preprocessor1_Mo…
## 15   878         13 roc_auc     binary     0.509    10 0.0119  Preprocessor1_Mo…
collect_predictions(xgboost_tune) %>%
    group_by(id) %>%
    roc_curve(classification, .pred_possible) %>%
    autoplot()

fit for the model for the last time

xgboost_last <- xgboost_workflow %>%
    finalize_workflow(select_best(xgboost_tune, metric = "accuracy")) %>%
    last_fit(data_split)
## → A | warning: ! There are new levels in `county`: "Westchester County", "Transylvania
##                  County", "Sampson County", "Robeson County", "Wilkes County", "Duplin
##                  County", "Sarpy County", "Tioga County", "Ross County", "Payne County",
##                  "Tulsa County", "Athens County", "Hughes County", "Logan County", "McClain
##                  County", "Harney County", "Potter County", "Mobile County", …, "McLean
##                  County", and "Bourbon County".
##                ℹ Consider using step_novel() (`?recipes::step_novel()`) \ before
##                  `step_dummy()` to handle unseen values.
## 
There were issues with some computations   A: x1

There were issues with some computations   A: x1
collect_metrics(xgboost_last)
## # A tibble: 3 × 4
##   .metric     .estimator .estimate .config             
##   <chr>       <chr>          <dbl> <chr>               
## 1 accuracy    binary         0.523 Preprocessor1_Model1
## 2 roc_auc     binary         0.529 Preprocessor1_Model1
## 3 brier_class binary         0.413 Preprocessor1_Model1
collect_predictions(xgboost_last) %>%
    yardstick::conf_mat(classification, .pred_class) %>%
    autoplot()

variable importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
xgboost_last %>%
    workflows::extract_fit_engine() %>%
    vip()

conclusion

Step normalize had a negtive effect on the model. Adding Step YeoJohnson helped the model show a bit of improvement to .420. Step pca helped in getting the model up to an accuracy of .523. I put the threshold at .75 because .50 and .99 made the model worse. Algorithm tuning had no postive affect on the model. Overall, step pca and step yeojohnson helped the model the most but the model is still not that great.