Goal: Predict the classification of a Bigfoot report

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)
# Load the TidyTuesday 2022-09-13 Bigfoot reports CSV directly from GitHub.
data <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv"
)
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): observed, location_details, county, state, season, title, classif...
## dbl  (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date  (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore data

# Summarise every column of the raw data (type, missingness, distribution)
# to decide what needs cleaning.
skimr::skim(data)
Data summary
Name data
Number of rows 5021
Number of columns 28
_______________________
Column type frequency:
character 10
Date 1
numeric 17
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
observed 38 0.99 1 30374 0 4982 0
location_details 758 0.85 1 3876 0 4196 0
county 0 1.00 10 30 0 1037 0
state 0 1.00 4 14 0 49 0
season 0 1.00 4 7 0 5 0
title 976 0.81 23 235 0 4045 0
classification 0 1.00 7 7 0 3 0
geohash 976 0.81 10 10 0 4001 0
precip_type 3298 0.34 4 4 0 2 0
summary 1655 0.67 15 103 0 321 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
date 976 0.81 1869-11-10 2021-11-27 2003-11-16 3111

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
latitude 976 0.81 39.36 5.68 25.14 35.35 39.30 43.93 64.89 ▂▇▆▁▁
longitude 976 0.81 -97.42 16.73 -167.13 -117.06 -91.77 -83.07 -68.23 ▁▁▆▆▇
number 0 1.00 21520.23 19259.15 60.00 4595.00 15473.00 33979.00 71997.00 ▇▃▂▂▁
temperature_high 1683 0.66 67.12 17.78 -0.62 55.14 69.97 81.10 106.51 ▁▂▅▇▃
temperature_mid 1835 0.63 57.84 16.40 -8.46 46.77 59.36 70.38 94.03 ▁▁▆▇▃
temperature_low 1832 0.64 48.64 15.94 -22.78 37.50 49.40 60.66 84.34 ▁▁▅▇▃
dew_point 1648 0.67 46.23 16.44 -11.21 34.77 46.69 59.00 77.40 ▁▂▆▇▅
humidity 1648 0.67 0.71 0.16 0.08 0.62 0.73 0.82 1.00 ▁▁▃▇▅
cloud_cover 1937 0.61 0.44 0.33 0.00 0.12 0.40 0.73 1.00 ▇▅▃▃▅
moon_phase 1625 0.68 0.50 0.29 0.00 0.25 0.49 0.75 1.00 ▇▇▇▇▇
precip_intensity 2309 0.54 0.01 0.05 0.00 0.00 0.00 0.00 2.07 ▇▁▁▁▁
precip_probability 2311 0.54 0.30 0.42 0.00 0.00 0.00 0.73 1.00 ▇▁▁▁▃
pressure 2402 0.52 1017.08 6.14 980.34 1013.42 1016.96 1020.64 1042.41 ▁▁▇▆▁
uv_index 1629 0.68 5.16 3.14 0.00 3.00 5.00 8.00 13.00 ▆▇▅▆▁
visibility 1972 0.61 8.49 2.06 0.74 7.66 9.45 10.00 10.00 ▁▁▁▂▇
wind_bearing 1634 0.67 196.57 96.38 0.00 128.00 203.00 273.00 359.00 ▅▅▇▇▆
wind_speed 1632 0.67 3.87 3.28 0.00 1.34 2.93 5.56 23.94 ▇▃▁▁▁

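Issues with data:
- Missing values: most columns are incomplete; precip_type is only 34% complete and precip_intensity / precip_probability only 54%.
- Free-text columns: observed, location_details and title have thousands of unique values each.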

data_clean <- data %>%
    # Missing values: remove the precipitation columns (many values missing),
    # then keep only the rows that are complete in every remaining column.
    select(!c(precip_type, precip_intensity, precip_probability)) %>%
    na.omit() %>%
    # Date is set aside for now (temporary).
    select(!date)

#data_clean <- data %>% 
   
    # Address factors imported as numeric
    # none
    
    # Drop zero-variance variables
    # none

Explore cleaned data

# Number of cleaned reports in each classification.
count(data_clean, classification)
## # A tibble: 3 × 2
##   classification     n
##   <chr>          <int>
## 1 Class A         1019
## 2 Class B         1053
## 3 Class C            8
# Bar chart of report counts per classification.
ggplot(data = data_clean, mapping = aes(x = classification)) +
    geom_bar()

Classification vs. temperature_high

# Distribution of the daily high temperature within each classification.
ggplot(
    data = data_clean,
    mapping = aes(x = classification, y = temperature_high)
) +
    geom_boxplot()

Correlation Plot

# Step 1: Binarize
# `number` is removed first; the remaining columns are then expanded into
# one indicator column per category / numeric bin by binarize().
# NOTE(review): the free-text columns (observed, location_details, title) are
# still present at this point and mostly collapse into an `-OTHER` bin --
# consider dropping them as well.
data_binarized <- binarize(select(data_clean, -number))

# Inspect the indicator columns produced by the binarize step.
glimpse(data_binarized)
## Rows: 2,080
## Columns: 128
## $ observed__.                                                                                                     <dbl> …
## $ `observed__-OTHER`                                                                                              <dbl> …
## $ `location_details__(edited)`                                                                                    <dbl> …
## $ `location_details__-OTHER`                                                                                      <dbl> …
## $ county__Jackson_County                                                                                          <dbl> …
## $ county__Jefferson_County                                                                                        <dbl> …
## $ county__King_County                                                                                             <dbl> …
## $ county__Pierce_County                                                                                           <dbl> …
## $ county__Snohomish_County                                                                                        <dbl> …
## $ county__Washington_County                                                                                       <dbl> …
## $ `county__-OTHER`                                                                                                <dbl> …
## $ state__Alabama                                                                                                  <dbl> …
## $ state__Arkansas                                                                                                 <dbl> …
## $ state__California                                                                                               <dbl> …
## $ state__Colorado                                                                                                 <dbl> …
## $ state__Florida                                                                                                  <dbl> …
## $ state__Georgia                                                                                                  <dbl> …
## $ state__Idaho                                                                                                    <dbl> …
## $ state__Illinois                                                                                                 <dbl> …
## $ state__Indiana                                                                                                  <dbl> …
## $ state__Iowa                                                                                                     <dbl> …
## $ state__Kansas                                                                                                   <dbl> …
## $ state__Kentucky                                                                                                 <dbl> …
## $ state__Michigan                                                                                                 <dbl> …
## $ state__Missouri                                                                                                 <dbl> …
## $ state__New_Jersey                                                                                               <dbl> …
## $ state__New_York                                                                                                 <dbl> …
## $ state__North_Carolina                                                                                           <dbl> …
## $ state__Ohio                                                                                                     <dbl> …
## $ state__Oklahoma                                                                                                 <dbl> …
## $ state__Oregon                                                                                                   <dbl> …
## $ state__Pennsylvania                                                                                             <dbl> …
## $ state__Tennessee                                                                                                <dbl> …
## $ state__Texas                                                                                                    <dbl> …
## $ state__Virginia                                                                                                 <dbl> …
## $ state__Washington                                                                                               <dbl> …
## $ state__West_Virginia                                                                                            <dbl> …
## $ state__Wisconsin                                                                                                <dbl> …
## $ `state__-OTHER`                                                                                                 <dbl> …
## $ season__Fall                                                                                                    <dbl> …
## $ season__Spring                                                                                                  <dbl> …
## $ season__Summer                                                                                                  <dbl> …
## $ season__Unknown                                                                                                 <dbl> …
## $ season__Winter                                                                                                  <dbl> …
## $ `title__Report_10006:_A_woman_has_late_night_sighting_when_a_motion_detecting_light_illuminates_her_back_porch` <dbl> …
## $ `title__-OTHER`                                                                                                 <dbl> …
## $ `latitude__-Inf_35.2904625`                                                                                     <dbl> …
## $ latitude__35.2904625_39.642495                                                                                  <dbl> …
## $ latitude__39.642495_43.486905                                                                                   <dbl> …
## $ latitude__43.486905_Inf                                                                                         <dbl> …
## $ `longitude__-Inf_-112.275025`                                                                                   <dbl> …
## $ `longitude__-112.275025_-88.76895`                                                                              <dbl> …
## $ `longitude__-88.76895_-82.1319875`                                                                              <dbl> …
## $ `longitude__-82.1319875_Inf`                                                                                    <dbl> …
## $ classification__Class_A                                                                                         <dbl> …
## $ classification__Class_B                                                                                         <dbl> …
## $ `classification__-OTHER`                                                                                        <dbl> …
## $ geohash__c22fq2jr5r                                                                                             <dbl> …
## $ `geohash__-OTHER`                                                                                               <dbl> …
## $ `temperature_high__-Inf_54.685`                                                                                 <dbl> …
## $ temperature_high__54.685_69.945                                                                                 <dbl> …
## $ temperature_high__69.945_81.2525                                                                                <dbl> …
## $ temperature_high__81.2525_Inf                                                                                   <dbl> …
## $ `temperature_mid__-Inf_46.79875`                                                                                <dbl> …
## $ temperature_mid__46.79875_59.82                                                                                 <dbl> …
## $ temperature_mid__59.82_70.85125                                                                                 <dbl> …
## $ temperature_mid__70.85125_Inf                                                                                   <dbl> …
## $ `temperature_low__-Inf_38.065`                                                                                  <dbl> …
## $ temperature_low__38.065_49.945                                                                                  <dbl> …
## $ temperature_low__49.945_61.415                                                                                  <dbl> …
## $ temperature_low__61.415_Inf                                                                                     <dbl> …
## $ `dew_point__-Inf_35.59`                                                                                         <dbl> …
## $ dew_point__35.59_47.51                                                                                          <dbl> …
## $ dew_point__47.51_59.6125                                                                                        <dbl> …
## $ dew_point__59.6125_Inf                                                                                          <dbl> …
## $ `humidity__-Inf_0.64`                                                                                           <dbl> …
## $ humidity__0.64_0.74                                                                                             <dbl> …
## $ humidity__0.74_0.82                                                                                             <dbl> …
## $ humidity__0.82_Inf                                                                                              <dbl> …
## $ `cloud_cover__-Inf_0.13`                                                                                        <dbl> …
## $ cloud_cover__0.13_0.41                                                                                          <dbl> …
## $ cloud_cover__0.41_0.74                                                                                          <dbl> …
## $ cloud_cover__0.74_Inf                                                                                           <dbl> …
## $ `moon_phase__-Inf_0.25`                                                                                         <dbl> …
## $ moon_phase__0.25_0.51                                                                                           <dbl> …
## $ moon_phase__0.51_0.75                                                                                           <dbl> …
## $ moon_phase__0.75_Inf                                                                                            <dbl> …
## $ `pressure__-Inf_1013.34`                                                                                        <dbl> …
## $ pressure__1013.34_1016.95                                                                                       <dbl> …
## $ pressure__1016.95_1020.64                                                                                       <dbl> …
## $ pressure__1020.64_Inf                                                                                           <dbl> …
## $ summary__Clear_throughout_the_day.                                                                              <dbl> …
## $ summary__Foggy_in_the_morning.                                                                                  <dbl> …
## $ summary__Foggy_overnight.                                                                                       <dbl> …
## $ summary__Humid_and_mostly_cloudy_throughout_the_day.                                                            <dbl> …
## $ summary__Light_rain_in_the_morning.                                                                             <dbl> …
## $ summary__Mostly_cloudy_in_the_morning.                                                                          <dbl> …
## $ summary__Mostly_cloudy_overnight.                                                                               <dbl> …
## $ summary__Mostly_cloudy_starting_in_the_afternoon.                                                               <dbl> …
## $ summary__Mostly_cloudy_starting_in_the_evening.                                                                 <dbl> …
## $ summary__Mostly_cloudy_throughout_the_day.                                                                      <dbl> …
## $ summary__Mostly_cloudy_until_afternoon.                                                                         <dbl> …
## $ summary__Mostly_cloudy_until_evening.                                                                           <dbl> …
## $ summary__Overcast_in_the_morning.                                                                               <dbl> …
## $ summary__Overcast_throughout_the_day.                                                                           <dbl> …
## $ summary__Partly_cloudy_in_the_morning.                                                                          <dbl> …
## $ `summary__Partly_cloudy_starting_in_the_afternoon,_continuing_until_evening.`                                   <dbl> …
## $ summary__Partly_cloudy_throughout_the_day.                                                                      <dbl> …
## $ summary__Partly_cloudy_until_afternoon.                                                                         <dbl> …
## $ summary__Partly_cloudy_until_evening.                                                                           <dbl> …
## $ summary__Rain_in_the_morning_and_afternoon.                                                                     <dbl> …
## $ summary__Rain_in_the_morning.                                                                                   <dbl> …
## $ `summary__-OTHER`                                                                                               <dbl> …
## $ `uv_index__-Inf_3`                                                                                              <dbl> …
## $ uv_index__3_5                                                                                                   <dbl> …
## $ uv_index__5_8                                                                                                   <dbl> …
## $ uv_index__8_Inf                                                                                                 <dbl> …
## $ `visibility__-Inf_7.64525`                                                                                      <dbl> …
## $ visibility__7.64525_9.42                                                                                        <dbl> …
## $ visibility__9.42_Inf                                                                                            <dbl> …
## $ `wind_bearing__-Inf_127`                                                                                        <dbl> …
## $ wind_bearing__127_202                                                                                           <dbl> …
## $ wind_bearing__202_268                                                                                           <dbl> …
## $ wind_bearing__268_Inf                                                                                           <dbl> …
## $ `wind_speed__-Inf_1.42`                                                                                         <dbl> …
## $ wind_speed__1.42_2.975                                                                                          <dbl> …
## $ wind_speed__2.975_5.53                                                                                          <dbl> …
## $ wind_speed__5.53_Inf                                                                                            <dbl> …
# Step 2: Correlation
# Correlate every binarized column against the Class A indicator.
data_correlation <- correlate(
    data_binarized,
    target = classification__Class_A
)

data_correlation
## # A tibble: 128 × 3
##    feature        bin                   correlation
##    <fct>          <chr>                       <dbl>
##  1 classification Class_A                    1     
##  2 classification Class_B                   -0.992 
##  3 wind_speed     -Inf_1.42                 -0.0893
##  4 longitude      -112.275025_-88.76895      0.0716
##  5 longitude      -Inf_-112.275025          -0.0683
##  6 state          California                -0.0667
##  7 wind_speed     5.53_Inf                   0.0661
##  8 wind_bearing   -Inf_127                   0.0638
##  9 classification -OTHER                    -0.0609
## 10 dew_point      35.59_47.51               -0.0582
## # ℹ 118 more rows
# Step 3: Plot
# Draw the correlation funnel for the Class A correlations.
correlationfunnel::plot_correlation_funnel(data_correlation)
## Warning: ggrepel: 114 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps