Module 6 Lab.Quynh.Tran.

Loading packages

library(tidyverse)
library(purrr)
library(readr)
library(dplyr)

Exercise 1


library(here)

monthly_data_files <- here("data/")

csv_files <- list.files(monthly_data_files, pattern = "*.csv", full.names = TRUE)

df <- map_df(csv_files, read_csv)

Rows: 54535 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 44380 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 53259 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 51033 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 55079 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 59666 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 64268 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 69492 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 71855 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 80277 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.Rows: 94315 Columns: 10── Column specification ────────────────────────────────────────────────
Delimiter: ","
chr  (4): Factor_C, Factor_E, Transaction_Status, Month
dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
dttm (1): Transaction_Timestamp
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

glimpse(df)

Rows: 698,159
Columns: 10
$ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 8…
$ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:0…
$ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10,…
$ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, …
$ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI"…
$ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, …
$ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H",…
$ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020…
$ Transaction_Status    <chr> "Approved", "Approved", "Approved", "App…
$ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan"…

Exercise 2

classes <- map(df, class)

classes

$Account_ID
[1] "numeric"

$Transaction_Timestamp
[1] "POSIXct" "POSIXt" 

$Factor_A
[1] "numeric"

$Factor_B
[1] "numeric"

$Factor_C
[1] "character"

$Factor_D
[1] "numeric"

$Factor_E
[1] "character"

$Response
[1] "numeric"

$Transaction_Status
[1] "character"

$Month
[1] "character"

Exercise 3

unique <- map_int(df, ~ n_distinct(.))

unique

           Account_ID Transaction_Timestamp              Factor_A 
               475413                686538                     7 
             Factor_B              Factor_C              Factor_D 
                    6                     4                    15 
             Factor_E              Response    Transaction_Status 
                   63                    42                     2 
                Month 
                   11

Exercise 4

df %>%
  mutate(Factor_D, value = ifelse(Factor_D == 26, 25, Factor_D)) %>%
  count(value)

Exercise 5

df_filter <- df %>%
  filter_at(vars(starts_with("Factor_")), ~ . != "NULL")
df_filter


## How many rows does your data now have (hint: it should be less than 500,000)?
nrow(df_filter)

[1] 489537

Exercise 6

df <- df_filter %>% mutate_at(vars(-Transaction_Timestamp), as.factor) %>% 
  mutate(Month = factor(Month, 
                        levels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun", 
                                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"), 
                        ordered = TRUE))

glimpse(df)

Rows: 489,537
Columns: 10
$ Account_ID            <fct> 5, 16, 40, 62, 69, 69, 70, 95, 101, 101,…
$ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:0…
$ Factor_A              <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
$ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6…
$ Factor_C              <fct> VI, VI, VI, VI, VI, VI, VI, VI, VI, VI, …
$ Factor_D              <fct> 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
$ Factor_E              <fct> A, H, H, B, H, H, B, G, G2, G2, A, A, C,…
$ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020…
$ Transaction_Status    <fct> Approved, Approved, Approved, Approved, …
$ Month                 <ord> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, …

levels(df$Month)

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

Exercise 7

df %>%
  summarize_if(is.factor, n_distinct)

df %>%
  group_by(Transaction_Status) %>%
  summarise_if(is.factor, n_distinct)

Exercise 8

convert_to_qtr <- function(month) {
  case_when(
    month %in% c("Jan", "Feb", "Mar") ~ "Q1",
    month %in% c("Apr", "May", "Jun") ~ "Q2",
    month %in% c("Jul", "Aug", "Sep") ~ "Q3",
    month %in% c("Oct", "Nov", "Dec") ~ "Q4"
  )
}


example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)

[1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"

df <- df %>%
  mutate(Qtr = convert_to_qtr(Month))


df %>% count(Qtr)

Exercise 9

library(repurrrsive)
library(purrr)

sw_people %>%
  map_chr(~.x$name)

 [1] "Luke Skywalker"        "C-3PO"                
 [3] "R2-D2"                 "Darth Vader"          
 [5] "Leia Organa"           "Owen Lars"            
 [7] "Beru Whitesun lars"    "R5-D4"                
 [9] "Biggs Darklighter"     "Obi-Wan Kenobi"       
[11] "Anakin Skywalker"      "Wilhuff Tarkin"       
[13] "Chewbacca"             "Han Solo"             
[15] "Greedo"                "Jabba Desilijic Tiure"
[17] "Wedge Antilles"        "Jek Tono Porkins"     
[19] "Yoda"                  "Palpatine"            
[21] "Boba Fett"             "IG-88"                
[23] "Bossk"                 "Lando Calrissian"     
[25] "Lobot"                 "Ackbar"               
[27] "Mon Mothma"            "Arvel Crynyd"         
[29] "Wicket Systri Warrick" "Nien Nunb"            
[31] "Qui-Gon Jinn"          "Nute Gunray"          
[33] "Finis Valorum"         "Jar Jar Binks"        
[35] "Roos Tarpals"          "Rugor Nass"           
[37] "Ric Olié"              "Watto"                
[39] "Sebulba"               "Quarsh Panaka"        
[41] "Shmi Skywalker"        "Darth Maul"           
[43] "Bib Fortuna"           "Ayla Secura"          
[45] "Dud Bolt"              "Gasgano"              
[47] "Ben Quadinaros"        "Mace Windu"           
[49] "Ki-Adi-Mundi"          "Kit Fisto"            
[51] "Eeth Koth"             "Adi Gallia"           
[53] "Saesee Tiin"           "Yarael Poof"          
[55] "Plo Koon"              "Mas Amedda"           
[57] "Gregar Typho"          "Cordé"                
[59] "Cliegg Lars"           "Poggle the Lesser"    
[61] "Luminara Unduli"       "Barriss Offee"        
[63] "Dormé"                 "Dooku"                
[65] "Bail Prestor Organa"   "Jango Fett"           
[67] "Zam Wesell"            "Dexter Jettster"      
[69] "Lama Su"               "Taun We"              
[71] "Jocasta Nu"            "Ratts Tyerell"        
[73] "R4-P17"                "Wat Tambor"           
[75] "San Hill"              "Shaak Ti"             
[77] "Grievous"              "Tarfful"              
[79] "Raymus Antilles"       "Sly Moore"            
[81] "Tion Medon"            "Finn"                 
[83] "Rey"                   "Poe Dameron"          
[85] "BB8"                   "Captain Phasma"       
[87] "Padmé Amidala"

Exercise 10

sw_people%>%
  map_int(~ length(.x$films))

 [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2
[35] 1 1 1 2 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1
[69] 1 1 1 1 2 1 1 2 1 1 2 2 1 1 1 1 1 1 3

Exercise 11

library(ggplot2)
library(tidyr)
library(purrr)
library(dplyr)

sw_people %>%
  map_chr(~.x$name) %>%
  set_names(sw_people, nm = .) %>%
  map_df(~ length(.x$films)) %>%
  pivot_longer(
    cols=everything(),
    names_to = "Character", 
    values_to = "Films",
  ) %>%
  ggplot(aes(Films, reorder(Character, Films))) +
  geom_point()

LS0tDQp0aXRsZTogIk1vZHVsZSA2IExhYi5RdXluaC5UcmFuLiINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KZWRpdG9yX29wdGlvbnM6IA0KLS0tDQoNCkxvYWRpbmcgcGFja2FnZXMNCg0KYGBge3J9DQpsaWJyYXJ5KHRpZHl2ZXJzZSkNCmxpYnJhcnkocHVycnIpDQpsaWJyYXJ5KHJlYWRyKQ0KbGlicmFyeShkcGx5cikNCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSAxDQoNCmBgYHtyIGVjaG89VFJVRX0NCg0KbGlicmFyeShoZXJlKQ0KDQptb250aGx5X2RhdGFfZmlsZXMgPC0gaGVyZSgiZGF0YS8iKQ0KDQpjc3ZfZmlsZXMgPC0gbGlzdC5maWxlcyhtb250aGx5X2RhdGFfZmlsZXMsIHBhdHRlcm4gPSAiKi5jc3YiLCBmdWxsLm5hbWVzID0gVFJVRSkNCg0KZGYgPC0gbWFwX2RmKGNzdl9maWxlcywgcmVhZF9jc3YpDQoNCmBgYA0KDQpgYGB7cn0NCmdsaW1wc2UoZGYpDQpgYGANCg0KDQojIyMgRXhlcmNpc2UgMg0KDQpgYGB7cn0NCmNsYXNzZXMgPC0gbWFwKGRmLCBjbGFzcykNCg0KY2xhc3Nlcw0KYGBgDQoNCg0KIyMjIEV4ZXJjaXNlIDMNCg0KYGBge3J9DQp1bmlxdWUgPC0gbWFwX2ludChkZiwgfiBuX2Rpc3RpbmN0KC4pKQ0KDQp1bmlxdWUNCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSA0DQoNCmBgYHtyfQ0KZGYgJT4lDQogIG11dGF0ZShGYWN0b3JfRCwgdmFsdWUgPSBpZmVsc2UoRmFjdG9yX0QgPT0gMjYsIDI1LCBGYWN0b3JfRCkpICU+JQ0KICBjb3VudCh2YWx1ZSkNCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSA1DQoNCmBgYHtyfQ0KZGZfZmlsdGVyIDwtIGRmICU+JQ0KICBmaWx0ZXJfYXQodmFycyhzdGFydHNfd2l0aCgiRmFjdG9yXyIpKSwgfiAuICE9ICJOVUxMIikNCmRmX2ZpbHRlcg0KDQojIyBIb3cgbWFueSByb3dzIGRvZXMgeW91ciBkYXRhIG5vdyBoYXZlIChoaW50OiBpdCBzaG91bGQgYmUgbGVzcyB0aGFuIDUwMCwwMDApPw0KbnJvdyhkZl9maWx0ZXIpDQpgYGANCg0KIyMjIEV4ZXJjaXNlIDYNCg0KDQpgYGB7cn0NCmRmIDwtIGRmX2ZpbHRlciAlPiUgbXV0YXRlX2F0KHZhcnMoLVRyYW5zYWN0aW9uX1RpbWVzdGFtcCksIGFzLmZhY3RvcikgJT4lIA0KICBtdXRhdGUoTW9udGggPSBmYWN0b3IoTW9udGgsIA0KICAgICAgICAgICAgICAgICAgICAgICAgbGV2ZWxzID0gYygiSmFuIiwgIkZlYiIsICJNYXIiLCAiQXByIiwgIk1heSIsICJKdW4iLCANCiAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgICAgIkp1bCIsICJBdWciLCAiU2VwIiwgIk9jdCIsICJOb3YiLCAiRGVjIiksIA0KICAgICAgICAgICAgICAgICAgICAgICAgb3JkZXJlZCA9IFRSVUUpKQ0KDQpnbGltcHNlKGRmKQ0KYGBgDQpgYGB7cn0NCmxldmVscyhkZiRNb250aCkNCmBgYA0KDQoNCiMjIyBFeGVyY2lzZSA3DQoNCmBgYHtyfQ0KZGYgJT4lDQogIHN1bW1hcml6ZV9pZihpcy5mYWN0b3IsIG5fZGlzdGluY3QpDQpgYGANCg0KDQpgYGB7cn0NCmRmICU+JQ0KICBncm91cF9ieShUcmFuc2FjdGlvbl9TdGF0dXMpICU+JQ0KICBzdW1tYXJpc2VfaWYoaXMuZmFjdG9yLCBuX2Rpc3RpbmN0KQ0KYGBgDQoNCg0KIyMjIEV4ZXJjaXNlIDgNCg0KYGBge3J9DQpjb252ZXJ0X3RvX3F0ciA8LSBmdW5jdGlvbihtb250aCkgew0KICBjYXNlX3doZW4oDQogICAgbW9udGggJWluJSBjKCJKYW4iLCAiRmViIiwgIk1hciIpIH4gIlExIiwNCiAgICBtb250aCAlaW4lIGMoIkFwciIsICJNYXkiLCAiSnVuIikgfiAiUTIiLA0KICAgIG1vbnRoICVpbiUgYygiSnVsIiwgIkF1ZyIsICJTZXAiKSB+ICJRMyIsDQogICAgbW9udGggJWluJSBjKCJPY3QiLCAiTm92IiwgIkRlYyIpIH4gIlE0Ig0KICApDQp9DQoNCg0KZXhhbXBsZV9tb250aHMgPC0gYygiSmFuIiwgIk1hciIsICJNYXkiLCAiTWF5IiwgIkF1ZyIsICJOb3YiLCAiTm92IiwgIkRlYyIpDQpjb252ZXJ0X3RvX3F0cihleGFtcGxlX21vbnRocykNCg0KYGBgDQoNCg0KYGBge3J9DQpkZiA8LSBkZiAlPiUNCiAgbXV0YXRlKFF0ciA9IGNvbnZlcnRfdG9fcXRyKE1vbnRoKSkNCg0KDQpkZiAlPiUgY291bnQoUXRyKQ0KYGBgDQoNCiMjIyBFeGVyY2lzZSA5DQoNCg0KYGBge3J9DQpsaWJyYXJ5KHJlcHVycnJzaXZlKQ0KbGlicmFyeShwdXJycikNCg0Kc3dfcGVvcGxlICU+JQ0KICBtYXBfY2hyKH4ueCRuYW1lKQ0KYGBgDQoNCg0KRXhlcmNpc2UgMTANCg0KYGBge3J9DQpzd19wZW9wbGUlPiUNCiAgbWFwX2ludCh+IGxlbmd0aCgueCRmaWxtcykpDQoNCmBgYA0KDQpFeGVyY2lzZSAxMQ0KDQpgYGB7ciBmaWcuaGVpZ2h0PTEwLCBmaWcud2lkdGg9NX0NCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkodGlkeXIpDQpsaWJyYXJ5KHB1cnJyKQ0KbGlicmFyeShkcGx5cikNCg0Kc3dfcGVvcGxlICU+JQ0KICBtYXBfY2hyKH4ueCRuYW1lKSAlPiUNCiAgc2V0X25hbWVzKHN3X3Blb3BsZSwgbm0gPSAuKSAlPiUNCiAgbWFwX2RmKH4gbGVuZ3RoKC54JGZpbG1zKSkgJT4lDQogIHBpdm90X2xvbmdlcigNCiAgICBjb2xzPWV2ZXJ5dGhpbmcoKSwNCiAgICBuYW1lc190byA9ICJDaGFyYWN0ZXIiLCANCiAgICB2YWx1ZXNfdG8gPSAiRmlsbXMiLA0KICApICU+JQ0KICBnZ3Bsb3QoYWVzKEZpbG1zLCByZW9yZGVyKENoYXJhY3RlciwgRmlsbXMpKSkgKw0KICBnZW9tX3BvaW50KCkNCmBgYA0KDQoNCg0K