week6_lab_tejas

#Loading the required libraries
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(here)

## here() starts at C:/Users/tejas/OneDrive/Desktop/Fall Classes/BANA 7025 - Data Wrangling/Lab Module 6

library(repurrrsive)
library(purrr)

# Exercise 1 - Import Files -----------------------------------------------

monthly_data_files <- here("C:/Users/tejas/OneDrive/Desktop/Fall Classes/BANA 7025 - Data Wrangling/Lab Module 6/data/")

csv_files <- fs::dir_ls(monthly_data_files, regexp = "\\.csv$")
df <- csv_files %>% 
        map_dfr(read_csv)

## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Exercise 2 - Class of each column ---------------------------------------

df %>%
  map(class)

## $Account_ID
## [1] "numeric"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "numeric"
## 
## $Factor_B
## [1] "numeric"
## 
## $Factor_C
## [1] "character"
## 
## $Factor_D
## [1] "numeric"
## 
## $Factor_E
## [1] "character"
## 
## $Response
## [1] "numeric"
## 
## $Transaction_Status
## [1] "character"
## 
## $Month
## [1] "character"

# Exercise 3 - Unique values ----------------------------------------------

df %>%
  map_int(n_distinct)

##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    15 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11

#  Exercise 4 -------------------------------------------------------------

df_q4 <- df %>%
  mutate(Factor_D_new = if_else(Factor_D == 26, 25, Factor_D)) %>%
  select(Factor_D_new) %>%
  count(Factor_D_new) 

df_q4 %>% tally()

## # A tibble: 1 × 1
##       n
##   <int>
## 1    14

# Exercise 5 --------------------------------------------------------------

df %>%
  filter_at(vars(3:7), all_vars(. != "NULL"))

## # A tibble: 489,537 × 10
##    Account…¹ Transaction_Times…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸
##        <dbl> <dttm>                <dbl>   <dbl> <chr>     <dbl> <chr>     <dbl>
##  1         5 2009-01-08 00:16:41       2       6 VI           20 A          1020
##  2        16 2009-01-20 22:40:08       2       6 VI           20 H          1020
##  3        40 2009-01-05 16:10:58       2       6 VI           20 H          1020
##  4        62 2009-01-21 19:13:13       2       6 VI           20 B          1020
##  5        69 2009-01-08 00:15:19       2       6 VI           20 H          1020
##  6        69 2009-01-19 09:33:22       2       6 VI           20 H          1020
##  7        70 2009-01-05 12:07:47       2       6 VI           20 B          1020
##  8        95 2009-01-17 14:57:45       2       6 VI           20 G          1020
##  9       101 2009-01-14 19:25:56       2       6 VI           20 G2         1020
## 10       101 2009-01-20 14:33:27       2       6 VI           20 G2         1020
## # … with 489,527 more rows, 2 more variables: Transaction_Status <chr>,
## #   Month <chr>, and abbreviated variable names ¹Account_ID,
## #   ²Transaction_Timestamp, ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D,
## #   ⁷Factor_E, ⁸Response

# 489,527 rows of data

# Exercise 6 --------------------------------------------------------------

df_q6 <- df %>%
  mutate_at(-2, as.factor) %>%
  mutate(Month = factor((Month), levels = month.name))

glimpse(df_q6)

## Rows: 698,159
## Columns: 10
## $ Account_ID            <fct> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <fct> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <fct> VI, VI, VI, VI, VI, MC, VI, VI, VI, MC, AX, DI, …
## $ Factor_D              <fct> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <fct> A, H, NULL, H, B, NULL, H, H, B, NULL, NULL, NUL…
## $ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month                 <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

levels(df_q6$Month)

##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"  "December"

# Exercise 7 -------------------------------------------------------------

df_q6 %>% summarise_if(is.factor, n_distinct)

## # A tibble: 1 × 9
##   Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response Trans…¹ Month
##        <int>    <int>    <int>    <int>    <int>    <int>    <int>   <int> <int>
## 1     475413        7        6        4       15       63       42       2     2
## # … with abbreviated variable name ¹Transaction_Status

#Yes, there are approximately equal distributions of unique values across almost all the variables except "Response" column.
df_q6 %>%
  group_by(Transaction_Status) %>%
  summarise_if(is.factor, n_distinct)

## # A tibble: 2 × 9
##   Transaction_St…¹ Accou…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸ Month
##   <fct>              <int>   <int>   <int>   <int>   <int>   <int>   <int> <int>
## 1 Approved          462655       6       5       4      13      60       2     2
## 2 Declined           22013       7       6       4      15      58      40     2
## # … with abbreviated variable names ¹Transaction_Status, ²Account_ID,
## #   ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D, ⁷Factor_E, ⁸Response

# Exercise 8 -------------------------------------------------------------

convert_to_qtr <- function(Month) dplyr::case_when(
  Month == "Jan" | Month == "Feb" | Month == "Mar" ~ "Q1",
  Month == "Apr" | Month == "May" | Month == "Jun" ~ "Q2", 
  Month == "Jul" | Month == "Aug" | Month == "Sep" ~ "Q3", 
  Month == "Oct" | Month == "Nov" | Month == "Dec" ~ "Q4", 
)

example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec") 
convert_to_qtr(example_months)

## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"

df %>%
  mutate(Qtr = convert_to_qtr(Month)) %>%
  group_by(Qtr) %>%
  count(Month) %>%
  summarize(total = sum(n))

## # A tibble: 4 × 2
##   Qtr    total
##   <chr>  <int>
## 1 Q1    152174
## 2 Q2    165778
## 3 Q3    205615
## 4 Q4    174592

# Exercise 9 -------------------------------------------------------------

sw_people %>%
  map_chr(~.x$name)

##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"

# Exercise 10 -------------------------------------------------------------

sw_people %>%
  map_int(~length(.x$films))

##  [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3

# Exercise 11 ------------------------------------------------------------- 

sw_people %>%
  map_chr(~.x$name) %>%
  set_names(sw_people, nm = .) %>%
  map_df(~ length(.x$films)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Character", 
    values_to = "Films",
  ) %>%
  ggplot(aes(Films, reorder(Character, Films))) +
  geom_point()

week6_lab_tejas_ravi

Tejas R

2022-10-02