Loading packages
library(tidyverse)
library(purrr)
library(readr)
library(dplyr)
library(here)
monthly_data_files <- here("data/")
csv_files <- list.files(monthly_data_files, pattern = "*.csv", full.names = TRUE)
df <- map_df(csv_files, read_csv)
Rows: 54535 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 44380 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 53259 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 51033 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 55079 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 59666 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 64268 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 69492 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 71855 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 80277 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 94315 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr (4): Factor_C, Factor_E, Transaction_Status, Month
dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(df)
Rows: 698,159
Columns: 10
$ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 8…
$ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:0…
$ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10,…
$ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, …
$ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI"…
$ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, …
$ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H",…
$ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020…
$ Transaction_Status <chr> "Approved", "Approved", "Approved", "App…
$ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan"…
classes <- map(df, class)
classes
$Account_ID
[1] "numeric"
$Transaction_Timestamp
[1] "POSIXct" "POSIXt"
$Factor_A
[1] "numeric"
$Factor_B
[1] "numeric"
$Factor_C
[1] "character"
$Factor_D
[1] "numeric"
$Factor_E
[1] "character"
$Response
[1] "numeric"
$Transaction_Status
[1] "character"
$Month
[1] "character"
unique <- map_int(df, ~ n_distinct(.))
unique
Account_ID Transaction_Timestamp Factor_A
475413 686538 7
Factor_B Factor_C Factor_D
6 4 15
Factor_E Response Transaction_Status
63 42 2
Month
11
df %>%
mutate(Factor_D, value = ifelse(Factor_D == 26, 25, Factor_D)) %>%
count(value)
df_filter <- df %>%
filter_at(vars(starts_with("Factor_")), ~ . != "NULL")
df_filter
## How many rows does your data now have (hint: it should be less than 500,000)?
nrow(df_filter)
[1] 489537
df <- df_filter %>% mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
mutate(Month = factor(Month,
levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
ordered = TRUE))
glimpse(df)
Rows: 489,537
Columns: 10
$ Account_ID <fct> 5, 16, 40, 62, 69, 69, 70, 95, 101, 101,…
$ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:0…
$ Factor_A <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ Factor_B <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
$ Factor_C <fct> VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, …
$ Factor_D <fct> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
$ Factor_E <fct> A, H, H, B, H, H, B, G, G2, G2, A, A, C,…
$ Response <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020…
$ Transaction_Status <fct> Approved, Approved, Approved, Approved, …
$ Month <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, …
levels(df$Month)
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
df %>%
summarize_if(is.factor, n_distinct)
df %>%
group_by(Transaction_Status) %>%
summarise_if(is.factor, n_distinct)
convert_to_qtr <- function(month) {
case_when(
month %in% c("Jan", "Feb", "Mar") ~ "Q1",
month %in% c("Apr", "May", "Jun") ~ "Q2",
month %in% c("Jul", "Aug", "Sep") ~ "Q3",
month %in% c("Oct", "Nov", "Dec") ~ "Q4"
)
}
example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)
[1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"
df <- df %>%
mutate(Qtr = convert_to_qtr(Month))
df %>% count(Qtr)
library(repurrrsive)
library(purrr)
sw_people %>%
map_chr(~.x$name)
[1] "Luke Skywalker" "C-3PO"
[3] "R2-D2" "Darth Vader"
[5] "Leia Organa" "Owen Lars"
[7] "Beru Whitesun lars" "R5-D4"
[9] "Biggs Darklighter" "Obi-Wan Kenobi"
[11] "Anakin Skywalker" "Wilhuff Tarkin"
[13] "Chewbacca" "Han Solo"
[15] "Greedo" "Jabba Desilijic Tiure"
[17] "Wedge Antilles" "Jek Tono Porkins"
[19] "Yoda" "Palpatine"
[21] "Boba Fett" "IG-88"
[23] "Bossk" "Lando Calrissian"
[25] "Lobot" "Ackbar"
[27] "Mon Mothma" "Arvel Crynyd"
[29] "Wicket Systri Warrick" "Nien Nunb"
[31] "Qui-Gon Jinn" "Nute Gunray"
[33] "Finis Valorum" "Jar Jar Binks"
[35] "Roos Tarpals" "Rugor Nass"
[37] "Ric Olié" "Watto"
[39] "Sebulba" "Quarsh Panaka"
[41] "Shmi Skywalker" "Darth Maul"
[43] "Bib Fortuna" "Ayla Secura"
[45] "Dud Bolt" "Gasgano"
[47] "Ben Quadinaros" "Mace Windu"
[49] "Ki-Adi-Mundi" "Kit Fisto"
[51] "Eeth Koth" "Adi Gallia"
[53] "Saesee Tiin" "Yarael Poof"
[55] "Plo Koon" "Mas Amedda"
[57] "Gregar Typho" "Cordé"
[59] "Cliegg Lars" "Poggle the Lesser"
[61] "Luminara Unduli" "Barriss Offee"
[63] "Dormé" "Dooku"
[65] "Bail Prestor Organa" "Jango Fett"
[67] "Zam Wesell" "Dexter Jettster"
[69] "Lama Su" "Taun We"
[71] "Jocasta Nu" "Ratts Tyerell"
[73] "R4-P17" "Wat Tambor"
[75] "San Hill" "Shaak Ti"
[77] "Grievous" "Tarfful"
[79] "Raymus Antilles" "Sly Moore"
[81] "Tion Medon" "Finn"
[83] "Rey" "Poe Dameron"
[85] "BB8" "Captain Phasma"
[87] "Padmé Amidala"
Exercise 10
sw_people%>%
map_int(~ length(.x$films))
[1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2
[35] 1 1 1 2 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1
[69] 1 1 1 1 2 1 1 2 1 1 2 2 1 1 1 1 1 1 3
Exercise 11
library(ggplot2)
library(tidyr)
library(purrr)
library(dplyr)
sw_people %>%
map_chr(~.x$name) %>%
set_names(sw_people, nm = .) %>%
map_df(~ length(.x$films)) %>%
pivot_longer(
cols=everything(),
names_to = "Character",
values_to = "Films",
) %>%
ggplot(aes(Films, reorder(Character, Films))) +
geom_point()