Module Lab 6 - Suraj Singh

# R code goes here


library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(purrr)
library(readr)
library(repurrrsive)
library(dplyr)
library(ggplot2)
library(tidyr) 


## Exercise 1

  setwd("/Users/surajsingh/MS BANA/DATA WRANGLING 7025-002 /Data/Lab 6/data")
  file_list <- list.files(pattern = "Month-\\d{2}\\.csv")
  df <- map_df(file_list, read_csv)

## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

  glimpse(df)

## Rows: 698,159
## Columns: 10
## $ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…

## Exercise 2

  map_column <- map(df, class)
  print(map_column)

## $Account_ID
## [1] "numeric"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "numeric"
## 
## $Factor_B
## [1] "numeric"
## 
## $Factor_C
## [1] "character"
## 
## $Factor_D
## [1] "numeric"
## 
## $Factor_E
## [1] "character"
## 
## $Response
## [1] "numeric"
## 
## $Transaction_Status
## [1] "character"
## 
## $Month
## [1] "character"

## Exercise 3
  
  unique_values_count <- map_int(df, ~n_distinct(.x))
  print(unique_values_count)

##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    15 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11

## Exercise 4
  
  df <- df %>%mutate(Factor_D = if_else(Factor_D == 26, 25, Factor_D))
  unique_values_count <- n_distinct(df$Factor_D)
  observations_per_level <- df %>%count(Factor_D) %>%arrange(Factor_D)
  print(paste("Number of unique values in Factor_D:", unique_values_count))

## [1] "Number of unique values in Factor_D: 14"

  print("Observations per level of Factor_D:")

## [1] "Observations per level of Factor_D:"

  print(observations_per_level)

## # A tibble: 14 × 2
##    Factor_D      n
##       <dbl>  <int>
##  1       10   4595
##  2       15   1089
##  3       20 527882
##  4       21  68072
##  5       25  41021
##  6       30   7030
##  7       31    512
##  8       35  25298
##  9       40   2720
## 10       50   3709
## 11       55  15200
## 12       70     54
## 13       85      4
## 14       90    973

## Exercise 5

  df_filtered <- df %>%filter_at(vars(starts_with("Factor_")), all_vars(.!="NULL"))
  rows_count <- nrow(df_filtered)
  print(paste("Number of rows after filtering:", rows_count))

## [1] "Number of rows after filtering: 489537"

## Exercise 6

  df <- df_filtered %>% 
    mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
    mutate(Month = factor(Month, levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), ordered = TRUE))
  glimpse(df)

## Rows: 489,537
## Columns: 10
## $ Account_ID            <fct> 5, 16, 40, 62, 69, 69, 70, 95, 101, 101, 102, 10…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
## $ Factor_C              <fct> VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, …
## $ Factor_D              <fct> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <fct> A, H, H, B, H, H, B, G, G2, G2, A, A, C, A, A, A…
## $ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month                 <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…

  levels(df$Month)

##  [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"

## Exercise 7
  
  df %>% 
    summarize_if(is.factor, n_distinct)

## # A tibble: 1 × 9
##   Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
##        <int>    <int>    <int>    <int>    <int>    <int>    <int>
## 1     324174        2        3        2       12       62       30
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>

  df %>% 
    group_by(Transaction_Status) %>% 
    summarize_if(is.factor, n_distinct)

## # A tibble: 2 × 9
##   Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
##   <fct>                   <int>    <int>    <int>    <int>    <int>    <int>
## 1 Approved               316172        2        3        2       11       59
## 2 Declined                14066        2        3        2       11       57
## # ℹ 2 more variables: Response <int>, Month <int>

## Exercise 8
 
  convert_to_qtr <- function(months) {
    case_when(
      months %in% c("Jan", "Feb", "Mar") ~ "Q1",
      months %in% c("Apr", "May", "Jun") ~ "Q2",
      months %in% c("Jul", "Aug", "Sep") ~ "Q3",
      months %in% c("Oct", "Nov", "Dec") ~ "Q4",
      TRUE ~ NA_character_  # This handles any unexpected values
    )
  }
  
  example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
  convert_to_qtr(example_months)

## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"

  df <- df %>%
    mutate(Qtr = convert_to_qtr(substr(Month, 1, 3)))
  df %>%
    count(Qtr) %>%
    arrange(Qtr)

## # A tibble: 4 × 2
##   Qtr        n
##   <chr>  <int>
## 1 Q1     85588
## 2 Q2    100227
## 3 Q3    161071
## 4 Q4    142651

## Exercise 9
  
  sw_people %>% map_chr(~.x[[1]])

##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"

  character_names <- sw_people %>% map_chr(~.x[[1]])
  print(character_names)

##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"

## Exercise 10
  
  film_counts <- sw_people %>%
    map_int(~ length(.x$films))
  #names(film_counts) <- map_chr(sw_people, ~ .x$name)
  print(film_counts)

##  [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3

## Exercise 11
  
  sw_people %>% map_chr(~ .x$name) %>% 
    set_names(sw_people, nm = .) %>% 
    map_df(~ length(.x$films)) %>% 
    pivot_longer(
      cols = everything(),
      names_to = "Character",
      values_to = "Films") %>% 
    ggplot(aes(Films, reorder(Character, Films))) + 
    geom_point()

Module Lab 6 - Suraj Singh

2024-10-06