library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(repurrrsive)
library(here)
## here() starts at C:/Users/anewe/OneDrive/Desktop/Data Wrangling/module6_lab
here::set_here("/Users/anewe/OneDrive/Desktop/Data Wrangling/module6_lab/data")
## File .here already exists in C:\Users\anewe\OneDrive\Desktop\Data Wrangling\module6_lab\data
files <- list.files(here("data"), pattern = "Month", full.names = TRUE)
df_list <- list()
for (i in files) {
  df <- read_csv(i)
  df_list[[length(df_list) + 1]] <- df
}
## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df_merged <- bind_rows(df_list)
glimpse(df_merged)
## Rows: 698,159
## Columns: 10
## $ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
  1. Use one of the map() function variants to check the current class of each column i.e. (class(df$Account_ID)).
column_classes <- map(df_merged, class)
print(column_classes)
## $Account_ID
## [1] "numeric"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "numeric"
## 
## $Factor_B
## [1] "numeric"
## 
## $Factor_C
## [1] "character"
## 
## $Factor_D
## [1] "numeric"
## 
## $Factor_E
## [1] "character"
## 
## $Response
## [1] "numeric"
## 
## $Transaction_Status
## [1] "character"
## 
## $Month
## [1] "character"
  1. Use one of the map() function variants to assess how many unique values exists in each column?
unique_counts <- map_int(df_merged, ~ length(unique(.x)))
print(unique_counts)
##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    15 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11
  1. The “Factor_D” variable contains 15 unique values (i.e. 10, 15, 20, 21, . . . , 85, 90). There is at least one single observation where Factor_D = 26 (possibly more). Assume these observations were improperly recorded and, in fact, the value should be 25. Using ifelse() (or dplyr’s if_else()) inside mutate(), recode any values where Factor_D == 26 to be. After completing this, how many unique values exist in this column? How many observations are there for each level of Factor_D?
df_merged <- df_merged %>%
  mutate(Factor_D = if_else(Factor_D == 26, 25, Factor_D))
unique_counts <- map_int(df_merged, ~ length(unique(.x)))
print(unique_counts)
##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    14 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11
df_merged %>%
  count(Factor_D)
## # A tibble: 14 × 2
##    Factor_D      n
##       <dbl>  <int>
##  1       10   4595
##  2       15   1089
##  3       20 527882
##  4       21  68072
##  5       25  41021
##  6       30   7030
##  7       31    512
##  8       35  25298
##  9       40   2720
## 10       50   3709
## 11       55  15200
## 12       70     54
## 13       85      4
## 14       90    973
  1. Unfortunately, some of the “Factor_” variables have observations that contain the value “NULL” (they are recorded as a character string, not the actual NULL value. Use filter_at() to filter out any of these observations. We have not spent much time using filter_at() so you may need to do some research on it! How many rows does your data now have (hint: it should be less than 500,000)?
df_merged_clean <- df_merged %>%
  filter_at(vars(starts_with("Factor_")), all_vars(. != "NULL"))

nrow(df_merged_clean)
## [1] 489537
  1. Using mutate_at() , convert all variables except for “Transaction_Timestamp” to factors. However, make sure the “Month” variable is an ordered factor. This may require you to do two separate mutate() statements.
df_merged_clean <- df_merged_clean %>%
  mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
  mutate(Month = factor(Month,
                        levels = c("Jan","Feb","Mar","Apr","May","Jun",
                                   "Jul","Aug","Sep","Oct","Nov","Dec"),
                        ordered = TRUE))

glimpse(df_merged_clean)
## Rows: 489,537
## Columns: 10
## $ Account_ID            <fct> 5, 16, 40, 62, 69, 69, 70, 95, 101, 101, 102, 10…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ Factor_C              <fct> VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, …
## $ Factor_D              <fct> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <fct> A, H, H, B, H, H, B, G, G2, G2, A, A, C, A, A, A…
## $ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month                 <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…
levels(df_merged_clean$Month)
##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"
  1. Use the summarize_if() function to assess how many unique values there are for all the other variables in our data set? Hint: use n_distinct().
df_merged_clean %>%
  summarize_if(is.factor, n_distinct)
## # A tibble: 1 × 9
##   Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
##        <int>    <int>    <int>    <int>    <int>    <int>    <int>
## 1     324174        2        3        2       12       62       30
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>

Now, if you group by “Transaction_Status”, are there approximately equal distributions of unique values across all the variables? In other words, for transactions that are approved versus declined, do we have near equal representation of observations across each variable?

df_merged_clean %>%
  group_by(Transaction_Status) %>%
  summarize_if(is.factor, n_distinct)
## # A tibble: 2 × 9
##   Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
##   <fct>                   <int>    <int>    <int>    <int>    <int>    <int>
## 1 Approved               316172        2        3        2       11       59
## 2 Declined                14066        2        3        2       11       57
## # ℹ 2 more variables: Response <int>, Month <int>
  1. Create a function convert_to_qtr() that converts monthly values to quarters. This function should take a vector of character month values (“Jan”, “Feb”, . . . , “Dec”) and convert to “Q1”, “Q2”, “Q3”, or “Q4”. Do it such that: • If the month input is Jan-Mar, then the function returns “Q1” • If the month input is Apr-Jun, then the function returns “Q2” • If the month input is Jul-Sep, then the function returns “Q3” • If the month input is Oct-Dec, then the function returns “Q4”
convert_to_qtr <- function(month_vec) {
  case_when(
    month_vec %in% c("Jan", "Feb", "Mar") ~ "Q1",
    month_vec %in% c("Apr", "May", "Jun") ~ "Q2",
    month_vec %in% c("Jul", "Aug", "Sep") ~ "Q3",
    month_vec %in% c("Oct", "Nov", "Dec") ~ "Q4",
  )}
example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)
## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"

Now, use this function you created above in a mutate() statement to create a new variable called “Qtr” in the data frame. How many observations do you have in each quarter?

df_merged_clean <- df_merged_clean %>%
  mutate(Qtr = convert_to_qtr(Month))

df_merged_clean %>%
  count(Qtr)
## # A tibble: 4 × 2
##   Qtr        n
##   <chr>  <int>
## 1 Q1     85588
## 2 Q2    100227
## 3 Q3    161071
## 4 Q4    142651
  1. Take some time to understand the sw_people data set provided by the the repurrrsive package (?sw_people). Using a map_xxx() variant, extract the name of each Star Wars character in the sw_people data set. Hint: the first element of each list item is the characters name.
str
## function (object, ...) 
## UseMethod("str")
## <bytecode: 0x0000019e2d8b1b98>
## <environment: namespace:utils>
character_names <- map_chr(sw_people, 1)

character_names
##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"
  1. Using the sw_people data set, find the number of films each Star Wars characters appears in. Be sure to use the most appropriate map_xxx() variant.
sw_people %>% 
  map_int(~ length(.x$films))
##  [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3
sw_people %>%
  map_chr("name") %>%                 
  set_names(sw_people, nm = .) %>%    
  map_df(~ tibble(Films = length(.x$films)), .id = "Character") %>%
  ggplot(aes(reorder(Character, Films), Films)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    x = NULL,
    y = "Number of Films",
    title = "Star Wars Characters by Number of Films"
  ) +
  theme_minimal(base_size = 12) +
  theme(axis.text.y = element_text(size = 8))