Goal: Predict the classification (Class A, Class B, or Class C) of a Bigfoot sighting report.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(correlationfunnel)
## ══ correlationfunnel Tip #2 ════════════════════════════════════════════════════
## Clean your NA's prior to using `binarize()`.
## Missing values and cleaning data are critical to getting great correlations. :)
# Load the Bigfoot sightings CSV from the TidyTuesday GitHub repository.
data <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-09-13/bigfoot.csv"
)
## Rows: 5021 Columns: 28
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): observed, location_details, county, state, season, title, classif...
## dbl (17): latitude, longitude, number, temperature_high, temperature_mid, t...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Profile every column by type: missing counts, completeness and summary stats.
skimr::skim(data)
Name | data |
Number of rows | 5021 |
Number of columns | 28 |
_______________________ | |
Column type frequency: | |
character | 10 |
Date | 1 |
numeric | 17 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
observed | 38 | 0.99 | 1 | 30374 | 0 | 4982 | 0 |
location_details | 758 | 0.85 | 1 | 3876 | 0 | 4196 | 0 |
county | 0 | 1.00 | 10 | 30 | 0 | 1037 | 0 |
state | 0 | 1.00 | 4 | 14 | 0 | 49 | 0 |
season | 0 | 1.00 | 4 | 7 | 0 | 5 | 0 |
title | 976 | 0.81 | 23 | 235 | 0 | 4045 | 0 |
classification | 0 | 1.00 | 7 | 7 | 0 | 3 | 0 |
geohash | 976 | 0.81 | 10 | 10 | 0 | 4001 | 0 |
precip_type | 3298 | 0.34 | 4 | 4 | 0 | 2 | 0 |
summary | 1655 | 0.67 | 15 | 103 | 0 | 321 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
date | 976 | 0.81 | 1869-11-10 | 2021-11-27 | 2003-11-16 | 3111 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
latitude | 976 | 0.81 | 39.36 | 5.68 | 25.14 | 35.35 | 39.30 | 43.93 | 64.89 | ▂▇▆▁▁ |
longitude | 976 | 0.81 | -97.42 | 16.73 | -167.13 | -117.06 | -91.77 | -83.07 | -68.23 | ▁▁▆▆▇ |
number | 0 | 1.00 | 21520.23 | 19259.15 | 60.00 | 4595.00 | 15473.00 | 33979.00 | 71997.00 | ▇▃▂▂▁ |
temperature_high | 1683 | 0.66 | 67.12 | 17.78 | -0.62 | 55.14 | 69.97 | 81.10 | 106.51 | ▁▂▅▇▃ |
temperature_mid | 1835 | 0.63 | 57.84 | 16.40 | -8.46 | 46.77 | 59.36 | 70.38 | 94.03 | ▁▁▆▇▃ |
temperature_low | 1832 | 0.64 | 48.64 | 15.94 | -22.78 | 37.50 | 49.40 | 60.66 | 84.34 | ▁▁▅▇▃ |
dew_point | 1648 | 0.67 | 46.23 | 16.44 | -11.21 | 34.77 | 46.69 | 59.00 | 77.40 | ▁▂▆▇▅ |
humidity | 1648 | 0.67 | 0.71 | 0.16 | 0.08 | 0.62 | 0.73 | 0.82 | 1.00 | ▁▁▃▇▅ |
cloud_cover | 1937 | 0.61 | 0.44 | 0.33 | 0.00 | 0.12 | 0.40 | 0.73 | 1.00 | ▇▅▃▃▅ |
moon_phase | 1625 | 0.68 | 0.50 | 0.29 | 0.00 | 0.25 | 0.49 | 0.75 | 1.00 | ▇▇▇▇▇ |
precip_intensity | 2309 | 0.54 | 0.01 | 0.05 | 0.00 | 0.00 | 0.00 | 0.00 | 2.07 | ▇▁▁▁▁ |
precip_probability | 2311 | 0.54 | 0.30 | 0.42 | 0.00 | 0.00 | 0.00 | 0.73 | 1.00 | ▇▁▁▁▃ |
pressure | 2402 | 0.52 | 1017.08 | 6.14 | 980.34 | 1013.42 | 1016.96 | 1020.64 | 1042.41 | ▁▁▇▆▁ |
uv_index | 1629 | 0.68 | 5.16 | 3.14 | 0.00 | 3.00 | 5.00 | 8.00 | 13.00 | ▆▇▅▆▁ |
visibility | 1972 | 0.61 | 8.49 | 2.06 | 0.74 | 7.66 | 9.45 | 10.00 | 10.00 | ▁▁▁▂▇ |
wind_bearing | 1634 | 0.67 | 196.57 | 96.38 | 0.00 | 128.00 | 203.00 | 273.00 | 359.00 | ▅▅▇▇▆ |
wind_speed | 1632 | 0.67 | 3.87 | 3.28 | 0.00 | 1.34 | 2.93 | 5.56 | 23.94 | ▇▃▁▁▁ |
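Issues with data: many columns have missing values (for example, `precip_type` is only 34% complete), so the precipitation columns are dropped and the remaining incomplete rows are removed below.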
# Build the cleaned table used by the plots and the correlation funnel.
data_clean <- data %>%
  # Treat missing values: drop the precipitation columns outright ...
  select(-c(precip_type, precip_intensity, precip_probability)) %>%
  # ... then keep only rows that are complete in every remaining column
  na.omit() %>%
  # Drop date (temporary)
  select(-date)
# Factors imported as numeric: none
# Zero-variance variables: none
# Rows per classification level after cleaning.
count(data_clean, classification)
## # A tibble: 3 × 2
## classification n
## <chr> <int>
## 1 Class A 1019
## 2 Class B 1053
## 3 Class C 8
# Bar chart of the number of reports in each classification.
ggplot(data_clean, aes(x = classification)) +
  geom_bar()
Classification vs. temperature_high
# Boxplot of temperature_high within each classification.
ggplot(data_clean, aes(x = classification, y = temperature_high)) +
  geom_boxplot()
Correlation Plot
# Step 1: Binarize
# Remove the `number` column, then expand every remaining column into
# per-level / per-bin indicator columns (listed by glimpse() below).
data_binarized <- binarize(select(data_clean, -number))
glimpse(data_binarized)
## Rows: 2,080
## Columns: 128
## $ observed__. <dbl> …
## $ `observed__-OTHER` <dbl> …
## $ `location_details__(edited)` <dbl> …
## $ `location_details__-OTHER` <dbl> …
## $ county__Jackson_County <dbl> …
## $ county__Jefferson_County <dbl> …
## $ county__King_County <dbl> …
## $ county__Pierce_County <dbl> …
## $ county__Snohomish_County <dbl> …
## $ county__Washington_County <dbl> …
## $ `county__-OTHER` <dbl> …
## $ state__Alabama <dbl> …
## $ state__Arkansas <dbl> …
## $ state__California <dbl> …
## $ state__Colorado <dbl> …
## $ state__Florida <dbl> …
## $ state__Georgia <dbl> …
## $ state__Idaho <dbl> …
## $ state__Illinois <dbl> …
## $ state__Indiana <dbl> …
## $ state__Iowa <dbl> …
## $ state__Kansas <dbl> …
## $ state__Kentucky <dbl> …
## $ state__Michigan <dbl> …
## $ state__Missouri <dbl> …
## $ state__New_Jersey <dbl> …
## $ state__New_York <dbl> …
## $ state__North_Carolina <dbl> …
## $ state__Ohio <dbl> …
## $ state__Oklahoma <dbl> …
## $ state__Oregon <dbl> …
## $ state__Pennsylvania <dbl> …
## $ state__Tennessee <dbl> …
## $ state__Texas <dbl> …
## $ state__Virginia <dbl> …
## $ state__Washington <dbl> …
## $ state__West_Virginia <dbl> …
## $ state__Wisconsin <dbl> …
## $ `state__-OTHER` <dbl> …
## $ season__Fall <dbl> …
## $ season__Spring <dbl> …
## $ season__Summer <dbl> …
## $ season__Unknown <dbl> …
## $ season__Winter <dbl> …
## $ `title__Report_10006:_A_woman_has_late_night_sighting_when_a_motion_detecting_light_illuminates_her_back_porch` <dbl> …
## $ `title__-OTHER` <dbl> …
## $ `latitude__-Inf_35.2904625` <dbl> …
## $ latitude__35.2904625_39.642495 <dbl> …
## $ latitude__39.642495_43.486905 <dbl> …
## $ latitude__43.486905_Inf <dbl> …
## $ `longitude__-Inf_-112.275025` <dbl> …
## $ `longitude__-112.275025_-88.76895` <dbl> …
## $ `longitude__-88.76895_-82.1319875` <dbl> …
## $ `longitude__-82.1319875_Inf` <dbl> …
## $ classification__Class_A <dbl> …
## $ classification__Class_B <dbl> …
## $ `classification__-OTHER` <dbl> …
## $ geohash__c22fq2jr5r <dbl> …
## $ `geohash__-OTHER` <dbl> …
## $ `temperature_high__-Inf_54.685` <dbl> …
## $ temperature_high__54.685_69.945 <dbl> …
## $ temperature_high__69.945_81.2525 <dbl> …
## $ temperature_high__81.2525_Inf <dbl> …
## $ `temperature_mid__-Inf_46.79875` <dbl> …
## $ temperature_mid__46.79875_59.82 <dbl> …
## $ temperature_mid__59.82_70.85125 <dbl> …
## $ temperature_mid__70.85125_Inf <dbl> …
## $ `temperature_low__-Inf_38.065` <dbl> …
## $ temperature_low__38.065_49.945 <dbl> …
## $ temperature_low__49.945_61.415 <dbl> …
## $ temperature_low__61.415_Inf <dbl> …
## $ `dew_point__-Inf_35.59` <dbl> …
## $ dew_point__35.59_47.51 <dbl> …
## $ dew_point__47.51_59.6125 <dbl> …
## $ dew_point__59.6125_Inf <dbl> …
## $ `humidity__-Inf_0.64` <dbl> …
## $ humidity__0.64_0.74 <dbl> …
## $ humidity__0.74_0.82 <dbl> …
## $ humidity__0.82_Inf <dbl> …
## $ `cloud_cover__-Inf_0.13` <dbl> …
## $ cloud_cover__0.13_0.41 <dbl> …
## $ cloud_cover__0.41_0.74 <dbl> …
## $ cloud_cover__0.74_Inf <dbl> …
## $ `moon_phase__-Inf_0.25` <dbl> …
## $ moon_phase__0.25_0.51 <dbl> …
## $ moon_phase__0.51_0.75 <dbl> …
## $ moon_phase__0.75_Inf <dbl> …
## $ `pressure__-Inf_1013.34` <dbl> …
## $ pressure__1013.34_1016.95 <dbl> …
## $ pressure__1016.95_1020.64 <dbl> …
## $ pressure__1020.64_Inf <dbl> …
## $ summary__Clear_throughout_the_day. <dbl> …
## $ summary__Foggy_in_the_morning. <dbl> …
## $ summary__Foggy_overnight. <dbl> …
## $ summary__Humid_and_mostly_cloudy_throughout_the_day. <dbl> …
## $ summary__Light_rain_in_the_morning. <dbl> …
## $ summary__Mostly_cloudy_in_the_morning. <dbl> …
## $ summary__Mostly_cloudy_overnight. <dbl> …
## $ summary__Mostly_cloudy_starting_in_the_afternoon. <dbl> …
## $ summary__Mostly_cloudy_starting_in_the_evening. <dbl> …
## $ summary__Mostly_cloudy_throughout_the_day. <dbl> …
## $ summary__Mostly_cloudy_until_afternoon. <dbl> …
## $ summary__Mostly_cloudy_until_evening. <dbl> …
## $ summary__Overcast_in_the_morning. <dbl> …
## $ summary__Overcast_throughout_the_day. <dbl> …
## $ summary__Partly_cloudy_in_the_morning. <dbl> …
## $ `summary__Partly_cloudy_starting_in_the_afternoon,_continuing_until_evening.` <dbl> …
## $ summary__Partly_cloudy_throughout_the_day. <dbl> …
## $ summary__Partly_cloudy_until_afternoon. <dbl> …
## $ summary__Partly_cloudy_until_evening. <dbl> …
## $ summary__Rain_in_the_morning_and_afternoon. <dbl> …
## $ summary__Rain_in_the_morning. <dbl> …
## $ `summary__-OTHER` <dbl> …
## $ `uv_index__-Inf_3` <dbl> …
## $ uv_index__3_5 <dbl> …
## $ uv_index__5_8 <dbl> …
## $ uv_index__8_Inf <dbl> …
## $ `visibility__-Inf_7.64525` <dbl> …
## $ visibility__7.64525_9.42 <dbl> …
## $ visibility__9.42_Inf <dbl> …
## $ `wind_bearing__-Inf_127` <dbl> …
## $ wind_bearing__127_202 <dbl> …
## $ wind_bearing__202_268 <dbl> …
## $ wind_bearing__268_Inf <dbl> …
## $ `wind_speed__-Inf_1.42` <dbl> …
## $ wind_speed__1.42_2.975 <dbl> …
## $ wind_speed__2.975_5.53 <dbl> …
## $ wind_speed__5.53_Inf <dbl> …
# Step 2: Correlation
# Correlate each binarized column with the Class A indicator column.
data_correlation <- correlate(data_binarized, target = classification__Class_A)
data_correlation
## # A tibble: 128 × 3
## feature bin correlation
## <fct> <chr> <dbl>
## 1 classification Class_A 1
## 2 classification Class_B -0.992
## 3 wind_speed -Inf_1.42 -0.0893
## 4 longitude -112.275025_-88.76895 0.0716
## 5 longitude -Inf_-112.275025 -0.0683
## 6 state California -0.0667
## 7 wind_speed 5.53_Inf 0.0661
## 8 wind_bearing -Inf_127 0.0638
## 9 classification -OTHER -0.0609
## 10 dew_point 35.59_47.51 -0.0582
## # ℹ 118 more rows
# Step 3: Plot
# Draw the correlation funnel from the correlations computed in Step 2.
correlationfunnel::plot_correlation_funnel(data_correlation)
## Warning: ggrepel: 114 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps