library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
# Folder holding the monthly transaction extracts (Month-XX.csv).
data_path <- "C:/Users/tsander/OneDrive - Rehab United Inc/R/Module 6 Data/data"

# Get all Month-XX.csv files from that folder (case-insensitive match).
files <- list.files(
  path = data_path,
  pattern = "(?i)^Month-\\d{1,2}\\.csv$", # matches Month-1.csv or Month-01.csv
  full.names = TRUE
)

# Fail early with a clear message instead of silently building an empty
# table when the folder is wrong or contains no matching files.
if (length(files) == 0) {
  stop("No Month-XX.csv files found in: ", data_path, call. = FALSE)
}

# Read each file and stack the results row-wise.
# `show_col_types = FALSE` silences only the column-specification message,
# so any other message readr emits stays visible; map() + list_rbind()
# replaces the superseded map_dfr().
all_data <- files %>%
  map(\(path) read_csv(path, show_col_types = FALSE)) %>%
  list_rbind()
# Confirm size: row count first, then column count
c(nrow(all_data), ncol(all_data)) # should be 698159 rows, 10 columns
## [1] 698159 10
# Compact per-column preview (name, type, first values)
glimpse(all_data)
## Rows: 698,159
## Columns: 10
## $ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
# Class vector of every column, returned as a named list
all_data %>% map(class)
## $Account_ID
## [1] "numeric"
##
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt"
##
## $Factor_A
## [1] "numeric"
##
## $Factor_B
## [1] "numeric"
##
## $Factor_C
## [1] "character"
##
## $Factor_D
## [1] "numeric"
##
## $Factor_E
## [1] "character"
##
## $Response
## [1] "numeric"
##
## $Transaction_Status
## [1] "character"
##
## $Month
## [1] "character"
# Number of unique values in each column, as a named integer vector
vapply(all_data, function(column) length(unique(column)), integer(1))
## Account_ID Transaction_Timestamp Factor_A
## 475413 686538 7
## Factor_B Factor_C Factor_D
## 6 4 15
## Factor_E Response Transaction_Status
## 63 42 2
## Month
## 11
# Fold level 26 of Factor_D into level 25; every other value is kept as is
all_data <- mutate(
  all_data,
  Factor_D = case_when(
    Factor_D == 26 ~ 25,
    TRUE ~ Factor_D
  )
)
# Count unique values of Factor_D
all_data %>%
  distinct(Factor_D) %>%
  nrow()
## [1] 14
# Count observations for each level of Factor_D
count(all_data, Factor_D)
## # A tibble: 14 × 2
## Factor_D n
## <dbl> <int>
## 1 10 4595
## 2 15 1089
## 3 20 527882
## 4 21 68072
## 5 25 41021
## 6 30 7030
## 7 31 512
## 8 35 25298
## 9 40 2720
## 10 50 3709
## 11 55 15200
## 12 70 54
## 13 85 4
## 14 90 973
After recoding, Factor_D now contains 14 unique values.
# Tally how many entries equal the literal string "NULL" in each Factor_ column
all_data %>%
  summarise(
    across(
      starts_with("Factor_"),
      \(column) sum(column == "NULL", na.rm = TRUE)
    )
  )
## # A tibble: 1 × 5
## Factor_A Factor_B Factor_C Factor_D Factor_E
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 208622
# keep only rows where ALL Factor_ columns are not "NULL"
# (filter() + if_all() replaces the superseded filter_at()/all_vars() pair;
# a row is kept only when the predicate is TRUE for every selected column)
all_data <- all_data %>%
  filter(if_all(starts_with("Factor_"), \(column) column != "NULL"))
# Row count after filtering
nrow(all_data)
## [1] 489537
After filtering out rows containing the string “NULL” in any Factor_ column, the dataset has 489537 rows.
# Convert every column except the timestamp to a factor.
# Month is built separately as an ordered factor whose levels follow the
# calendar (month.abb is base R's "Jan" ... "Dec"), so it sorts Jan < Feb < ...
# rather than alphabetically.
all_data <- all_data %>%
  mutate(
    across(-c(Transaction_Timestamp, Month), as.factor),
    Month = factor(Month, levels = month.abb, ordered = TRUE)
  )
# make sure all features are changed to factors
# (inspect all_data; `df` is the F-distribution density function from stats)
glimpse(all_data)
# make sure the Month variable is an ordered factor: list its levels in order
all_data %>%
  pull(Month) %>%
  levels()
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
# Number of distinct values in every factor column.
# across(where(is.factor), ...) replaces the superseded summarize_if().
all_data %>%
  summarise(across(where(is.factor), n_distinct))
## # A tibble: 1 × 9
## Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
## <int> <int> <int> <int> <int> <int> <int>
## 1 324174 2 3 2 12 62 30
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>
# Number of distinct values in every factor column, split by transaction
# status. The grouping column is excluded from across() automatically, and
# summarise() drops the single grouping level, so the result is ungrouped.
all_data %>%
  group_by(Transaction_Status) %>%
  summarise(across(where(is.factor), n_distinct))
## # A tibble: 2 × 9
## Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
## <fct> <int> <int> <int> <int> <int> <int>
## 1 Approved 316172 2 3 2 11 59
## 2 Declined 14066 2 3 2 11 57
## # ℹ 2 more variables: Response <int>, Month <int>
When grouping by Transaction_Status, most variables show
an equal number of unique values across the Approved and
Declined transactions. The only exceptions among the columns printed above are
Account_ID (316,172 vs. 14,066 unique values) and Factor_E (59 vs. 57).
This indicates near-equal representation across variables overall, with some imbalance in these two cases.
# Map three-letter month abbreviations ("Jan" ... "Dec") to quarter labels
# ("Q1" ... "Q4").
#
# month_vec: character vector or factor of month abbreviations.
# Returns a character vector of the same length; any value that is not one of
# the twelve abbreviations (including NA) becomes NA_character_.
convert_to_qtr <- function(month_vec) {
  quarter_labels <- c("Q1", "Q2", "Q3", "Q4")
  # Position of each month in the calendar (1-12); NA when unrecognised
  month_number <- match(as.character(month_vec), month.abb)
  # Months 1-3 -> 1, 4-6 -> 2, 7-9 -> 3, 10-12 -> 4; an NA index yields NA
  quarter_labels[(month_number + 2L) %/% 3L]
}
# TEST: quick check of convert_to_qtr() on a small hand-written sample
example_months <- c(
  "Jan", "Mar", "May", "May",
  "Aug", "Nov", "Nov", "Dec"
)
example_months %>% convert_to_qtr()
## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"
# Derive the quarter label of each transaction from its Month
all_data <- mutate(all_data, Qtr = convert_to_qtr(Month))
# Number of transactions in each quarter
count(all_data, Qtr)
## # A tibble: 4 × 2
## Qtr n
## <chr> <int>
## 1 Q1 85588
## 2 Q2 100227
## 3 Q3 161071
## 4 Q4 142651
library(repurrrsive)
# Name of every character in sw_people, as a character vector
map_chr(sw_people, "name")
## [1] "Luke Skywalker" "C-3PO" "R2-D2"
## [4] "Darth Vader" "Leia Organa" "Owen Lars"
## [7] "Beru Whitesun lars" "R5-D4" "Biggs Darklighter"
## [10] "Obi-Wan Kenobi" "Anakin Skywalker" "Wilhuff Tarkin"
## [13] "Chewbacca" "Han Solo" "Greedo"
## [16] "Jabba Desilijic Tiure" "Wedge Antilles" "Jek Tono Porkins"
## [19] "Yoda" "Palpatine" "Boba Fett"
## [22] "IG-88" "Bossk" "Lando Calrissian"
## [25] "Lobot" "Ackbar" "Mon Mothma"
## [28] "Arvel Crynyd" "Wicket Systri Warrick" "Nien Nunb"
## [31] "Qui-Gon Jinn" "Nute Gunray" "Finis Valorum"
## [34] "Jar Jar Binks" "Roos Tarpals" "Rugor Nass"
## [37] "Ric Olié" "Watto" "Sebulba"
## [40] "Quarsh Panaka" "Shmi Skywalker" "Darth Maul"
## [43] "Bib Fortuna" "Ayla Secura" "Dud Bolt"
## [46] "Gasgano" "Ben Quadinaros" "Mace Windu"
## [49] "Ki-Adi-Mundi" "Kit Fisto" "Eeth Koth"
## [52] "Adi Gallia" "Saesee Tiin" "Yarael Poof"
## [55] "Plo Koon" "Mas Amedda" "Gregar Typho"
## [58] "Cordé" "Cliegg Lars" "Poggle the Lesser"
## [61] "Luminara Unduli" "Barriss Offee" "Dormé"
## [64] "Dooku" "Bail Prestor Organa" "Jango Fett"
## [67] "Zam Wesell" "Dexter Jettster" "Lama Su"
## [70] "Taun We" "Jocasta Nu" "Ratts Tyerell"
## [73] "R4-P17" "Wat Tambor" "San Hill"
## [76] "Shaak Ti" "Grievous" "Tarfful"
## [79] "Raymus Antilles" "Sly Moore" "Tion Medon"
## [82] "Finn" "Rey" "Poe Dameron"
## [85] "BB8" "Captain Phasma" "Padmé Amidala"
# Number of films each character appears in, as an integer vector
lengths(map(sw_people, "films"))
## [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3
# Ten characters with the most film appearances (ties keep list order)
character_names <- map_chr(sw_people, ~ .x[["name"]])
film_counts <- lengths(map(sw_people, "films"))
top10_films <- tibble(name = character_names, n_films = film_counts) %>%
  arrange(-n_films) %>%
  head(10)
top10_films
## # A tibble: 10 × 2
## name n_films
## <chr> <int>
## 1 R2-D2 7
## 2 C-3PO 6
## 3 Obi-Wan Kenobi 6
## 4 Luke Skywalker 5
## 5 Leia Organa 5
## 6 Chewbacca 5
## 7 Yoda 5
## 8 Palpatine 5
## 9 Darth Vader 4
## 10 Han Solo 4
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.2
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Dot plot of the number of films per character, ordered by film count.
# The long-format table (one row per character) is built directly instead of
# creating a one-row wide table with the superseded map_df() and pivoting it,
# which also removes the dependence on character names being unique.
tibble(
  Character = map_chr(sw_people, "name"),
  Films = map_int(sw_people, \(person) length(person$films))
) %>%
  ggplot(aes(Films, reorder(Character, Films))) +
  geom_point()