library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(repurrrsive)
library(here)
## here() starts at C:/Users/anewe/OneDrive/Desktop/Data Wrangling/module6_lab
here::set_here("/Users/anewe/OneDrive/Desktop/Data Wrangling/module6_lab/data")
## File .here already exists in C:\Users\anewe\OneDrive\Desktop\Data Wrangling\module6_lab\data
files <- list.files(here("data"), pattern = "Month", full.names = TRUE)
df_list <- list()
for (i in files) {
df <- read_csv(i)
df_list[[length(df_list) + 1]] <- df
}
## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_merged <- bind_rows(df_list)
glimpse(df_merged)
## Rows: 698,159
## Columns: 10
## $ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
column_classes <- map(df_merged, class)
print(column_classes)
## $Account_ID
## [1] "numeric"
##
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt"
##
## $Factor_A
## [1] "numeric"
##
## $Factor_B
## [1] "numeric"
##
## $Factor_C
## [1] "character"
##
## $Factor_D
## [1] "numeric"
##
## $Factor_E
## [1] "character"
##
## $Response
## [1] "numeric"
##
## $Transaction_Status
## [1] "character"
##
## $Month
## [1] "character"
unique_counts <- map_int(df_merged, ~ length(unique(.x)))
print(unique_counts)
## Account_ID Transaction_Timestamp Factor_A
## 475413 686538 7
## Factor_B Factor_C Factor_D
## 6 4 15
## Factor_E Response Transaction_Status
## 63 42 2
## Month
## 11
df_merged <- df_merged %>%
mutate(Factor_D = if_else(Factor_D == 26, 25, Factor_D))
unique_counts <- map_int(df_merged, ~ length(unique(.x)))
print(unique_counts)
## Account_ID Transaction_Timestamp Factor_A
## 475413 686538 7
## Factor_B Factor_C Factor_D
## 6 4 14
## Factor_E Response Transaction_Status
## 63 42 2
## Month
## 11
df_merged %>%
count(Factor_D)
## # A tibble: 14 × 2
## Factor_D n
## <dbl> <int>
## 1 10 4595
## 2 15 1089
## 3 20 527882
## 4 21 68072
## 5 25 41021
## 6 30 7030
## 7 31 512
## 8 35 25298
## 9 40 2720
## 10 50 3709
## 11 55 15200
## 12 70 54
## 13 85 4
## 14 90 973
df_merged_clean <- df_merged %>%
filter_at(vars(starts_with("Factor_")), all_vars(. != "NULL"))
nrow(df_merged_clean)
## [1] 489537
df_merged_clean <- df_merged_clean %>%
mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
mutate(Month = factor(Month,
levels = c("Jan","Feb","Mar","Apr","May","Jun",
"Jul","Aug","Sep","Oct","Nov","Dec"),
ordered = TRUE))
glimpse(df_merged_clean)
## Rows: 489,537
## Columns: 10
## $ Account_ID <fct> 5, 16, 40, 62, 69, 69, 70, 95, 101, 101, 102, 10…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Factor_B <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ Factor_C <fct> VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, …
## $ Factor_D <fct> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <fct> A, H, H, B, H, H, B, G, G2, G2, A, A, C, A, A, A…
## $ Response <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…
levels(df_merged_clean$Month)
## [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
df_merged_clean %>%
summarize_if(is.factor, n_distinct)
## # A tibble: 1 × 9
## Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
## <int> <int> <int> <int> <int> <int> <int>
## 1 324174 2 3 2 12 62 30
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>
Now, if you group by “Transaction_Status”, are there approximately equal distributions of unique values across all the variables? In other words, for transactions that are approved versus declined, do we have near equal representation of observations across each variable?
df_merged_clean %>%
group_by(Transaction_Status) %>%
summarize_if(is.factor, n_distinct)
## # A tibble: 2 × 9
## Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
## <fct> <int> <int> <int> <int> <int> <int>
## 1 Approved 316172 2 3 2 11 59
## 2 Declined 14066 2 3 2 11 57
## # ℹ 2 more variables: Response <int>, Month <int>
convert_to_qtr <- function(month_vec) {
case_when(
month_vec %in% c("Jan", "Feb", "Mar") ~ "Q1",
month_vec %in% c("Apr", "May", "Jun") ~ "Q2",
month_vec %in% c("Jul", "Aug", "Sep") ~ "Q3",
month_vec %in% c("Oct", "Nov", "Dec") ~ "Q4",
)}
example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)
## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"
Now, use this function you created above in a mutate() statement to create a new variable called “Qtr” in the data frame. How many observations do you have in each quarter?
df_merged_clean <- df_merged_clean %>%
mutate(Qtr = convert_to_qtr(Month))
df_merged_clean %>%
count(Qtr)
## # A tibble: 4 × 2
## Qtr n
## <chr> <int>
## 1 Q1 85588
## 2 Q2 100227
## 3 Q3 161071
## 4 Q4 142651
str
## function (object, ...)
## UseMethod("str")
## <bytecode: 0x0000019e2d8b1b98>
## <environment: namespace:utils>
character_names <- map_chr(sw_people, 1)
character_names
## [1] "Luke Skywalker" "C-3PO" "R2-D2"
## [4] "Darth Vader" "Leia Organa" "Owen Lars"
## [7] "Beru Whitesun lars" "R5-D4" "Biggs Darklighter"
## [10] "Obi-Wan Kenobi" "Anakin Skywalker" "Wilhuff Tarkin"
## [13] "Chewbacca" "Han Solo" "Greedo"
## [16] "Jabba Desilijic Tiure" "Wedge Antilles" "Jek Tono Porkins"
## [19] "Yoda" "Palpatine" "Boba Fett"
## [22] "IG-88" "Bossk" "Lando Calrissian"
## [25] "Lobot" "Ackbar" "Mon Mothma"
## [28] "Arvel Crynyd" "Wicket Systri Warrick" "Nien Nunb"
## [31] "Qui-Gon Jinn" "Nute Gunray" "Finis Valorum"
## [34] "Jar Jar Binks" "Roos Tarpals" "Rugor Nass"
## [37] "Ric Olié" "Watto" "Sebulba"
## [40] "Quarsh Panaka" "Shmi Skywalker" "Darth Maul"
## [43] "Bib Fortuna" "Ayla Secura" "Dud Bolt"
## [46] "Gasgano" "Ben Quadinaros" "Mace Windu"
## [49] "Ki-Adi-Mundi" "Kit Fisto" "Eeth Koth"
## [52] "Adi Gallia" "Saesee Tiin" "Yarael Poof"
## [55] "Plo Koon" "Mas Amedda" "Gregar Typho"
## [58] "Cordé" "Cliegg Lars" "Poggle the Lesser"
## [61] "Luminara Unduli" "Barriss Offee" "Dormé"
## [64] "Dooku" "Bail Prestor Organa" "Jango Fett"
## [67] "Zam Wesell" "Dexter Jettster" "Lama Su"
## [70] "Taun We" "Jocasta Nu" "Ratts Tyerell"
## [73] "R4-P17" "Wat Tambor" "San Hill"
## [76] "Shaak Ti" "Grievous" "Tarfful"
## [79] "Raymus Antilles" "Sly Moore" "Tion Medon"
## [82] "Finn" "Rey" "Poe Dameron"
## [85] "BB8" "Captain Phasma" "Padmé Amidala"
sw_people %>%
map_int(~ length(.x$films))
## [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3
sw_people %>%
map_chr("name") %>%
set_names(sw_people, nm = .) %>%
map_df(~ tibble(Films = length(.x$films)), .id = "Character") %>%
ggplot(aes(reorder(Character, Films), Films)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(
x = NULL,
y = "Number of Films",
title = "Star Wars Characters by Number of Films"
) +
theme_minimal(base_size = 12) +
theme(axis.text.y = element_text(size = 8))