exercise 1
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
install.packages("repurrrsive", repos="https://cran.r-project.org/")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'repurrrsive' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmp46Buhl\downloaded_packages
library(repurrrsive)
install.packages("here", repos="https://cran.r-project.org/")
## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)
## package 'here' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\HP\AppData\Local\Temp\Rtmp46Buhl\downloaded_packages
library(purrr)
library(lubridate)
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(here)
## here() starts at C:/Users/HP/Desktop/BANA7025-WRANGLING/week6/assignment
library(dplyr)
monthly_data_files <- here("data/data")
list.files(monthly_data_files)
## [1] "Month-01.csv" "Month-02.csv" "Month-03.csv" "Month-04.csv" "Month-05.csv"
## [6] "Month-06.csv" "Month-07.csv" "Month-08.csv" "Month-09.csv" "Month-10.csv"
## [11] "Month-11.csv"
for(data_file in list.files(monthly_data_files)) {
df <- readr::read_csv(here(monthly_data_files, data_file))
new_name <- stringr :: str_sub(data_file, end = -5)
assign(new_name, df)
}
## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- rbind(`Month-01`,`Month-02`,`Month-03`,`Month-04`, `Month-05`, `Month-06`,`Month-07`, `Month-08`, `Month-09`, `Month-10`, `Month-11`)
glimpse(df)
## Rows: 698,159
## Columns: 10
## $ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
exercise 2
lapply(df, class)
## $Account_ID
## [1] "numeric"
##
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt"
##
## $Factor_A
## [1] "numeric"
##
## $Factor_B
## [1] "numeric"
##
## $Factor_C
## [1] "character"
##
## $Factor_D
## [1] "numeric"
##
## $Factor_E
## [1] "character"
##
## $Response
## [1] "numeric"
##
## $Transaction_Status
## [1] "character"
##
## $Month
## [1] "character"
exercise 3
lapply(df, n_distinct)
## $Account_ID
## [1] 475413
##
## $Transaction_Timestamp
## [1] 686538
##
## $Factor_A
## [1] 7
##
## $Factor_B
## [1] 6
##
## $Factor_C
## [1] 4
##
## $Factor_D
## [1] 15
##
## $Factor_E
## [1] 63
##
## $Response
## [1] 42
##
## $Transaction_Status
## [1] 2
##
## $Month
## [1] 11
exercise 4
df %>%
mutate(Factor_D, value = ifelse(Factor_D == 26, 25, Factor_D)) %>%
count(value)
## # A tibble: 14 × 2
## value n
## <dbl> <int>
## 1 10 4595
## 2 15 1089
## 3 20 527882
## 4 21 68072
## 5 25 41021
## 6 30 7030
## 7 31 512
## 8 35 25298
## 9 40 2720
## 10 50 3709
## 11 55 15200
## 12 70 54
## 13 85 4
## 14 90 973
exercise 5
df %>%
filter_at(vars(3:7), any_vars(grepl("NULL", . , ignore.case = TRUE)))
## # A tibble: 208,622 × 10
## Account…¹ Transaction_Times…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸
## <dbl> <dttm> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
## 1 28 2009-01-19 13:24:55 2 6 VI 21 NULL 1020
## 2 64 2009-01-01 18:53:02 7 6 MC 20 NULL 1020
## 3 79 2009-01-07 19:41:18 7 6 MC 20 NULL 1020
## 4 88 2009-01-26 15:21:28 8 18 AX 20 NULL 1020
## 5 90 2009-01-21 22:38:08 10 6 DI 20 NULL 1020
## 6 90 2009-01-29 14:26:19 10 6 DI 20 NULL 1020
## 7 111 2009-01-16 11:46:05 7 6 MC 20 NULL 1020
## 8 111 2009-01-28 16:04:45 7 6 MC 20 NULL 1320
## 9 111 2009-01-28 17:51:52 7 6 MC 20 NULL 1320
## 10 111 2009-01-28 17:53:18 7 6 MC 20 NULL 1320
## # … with 208,612 more rows, 2 more variables: Transaction_Status <chr>,
## # Month <chr>, and abbreviated variable names ¹Account_ID,
## # ²Transaction_Timestamp, ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D,
## # ⁷Factor_E, ⁸Response
exercise 6
df <- df %>%
mutate_at(-2, as.factor) %>%
mutate(Month = factor(Month), levels = 1)
glimpse(df)
## Rows: 698,159
## Columns: 11
## $ Account_ID <fct> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <fct> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <fct> VI, VI, VI, VI, VI, MC, VI, VI, VI, MC, AX, DI, …
## $ Factor_D <fct> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <fct> A, H, NULL, H, B, NULL, H, H, B, NULL, NULL, NUL…
## $ Response <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…
## $ levels <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
levels(df$Month)
## [1] "Apr" "Aug" "Feb" "Jan" "Jul" "Jun" "Mar" "May" "Nov" "Oct" "Sep"
exercise 7
df %>%
summarise_if(is.factor, n_distinct )
## # A tibble: 1 × 9
## Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response Trans…¹ Month
## <int> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 475413 7 6 4 15 63 42 2 11
## # … with abbreviated variable name ¹Transaction_Status
df %>%
group_by(Transaction_Status)%>%
summarise_if(is.factor, n_distinct )
## # A tibble: 2 × 9
## Transaction_St…¹ Accou…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸ Month
## <fct> <int> <int> <int> <int> <int> <int> <int> <int>
## 1 Approved 462655 6 5 4 13 60 2 11
## 2 Declined 22013 7 6 4 15 58 40 11
## # … with abbreviated variable names ¹Transaction_Status, ²Account_ID,
## # ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D, ⁷Factor_E, ⁸Response
exercise 8
convert_to_qtr <- function(Month) {
case_when(
'Jan' == Month~ "Q1",
"Feb" == Month ~ "Q1" ,
"Mar" == Month ~ "Q1",
"Apr" == Month ~ "Q2",
"May" ==Month ~ "Q2",
"Jun" ==Month ~ "Q2",
"Jul" ==Month ~ "Q3",
"Aug" ==Month ~ "Q3",
"Sep" ==Month ~ "Q3",
"Oct" ==Month ~ "Q4",
"Nov" ==Month ~ "Q4",
"Dec" ==Month ~ "Q4"
)
}
df <- df %>%
mutate(QTR = convert_to_qtr(Month))
exercise 9
sw_people%>%
map_chr(~.x$name)
## [1] "Luke Skywalker" "C-3PO" "R2-D2"
## [4] "Darth Vader" "Leia Organa" "Owen Lars"
## [7] "Beru Whitesun lars" "R5-D4" "Biggs Darklighter"
## [10] "Obi-Wan Kenobi" "Anakin Skywalker" "Wilhuff Tarkin"
## [13] "Chewbacca" "Han Solo" "Greedo"
## [16] "Jabba Desilijic Tiure" "Wedge Antilles" "Jek Tono Porkins"
## [19] "Yoda" "Palpatine" "Boba Fett"
## [22] "IG-88" "Bossk" "Lando Calrissian"
## [25] "Lobot" "Ackbar" "Mon Mothma"
## [28] "Arvel Crynyd" "Wicket Systri Warrick" "Nien Nunb"
## [31] "Qui-Gon Jinn" "Nute Gunray" "Finis Valorum"
## [34] "Jar Jar Binks" "Roos Tarpals" "Rugor Nass"
## [37] "Ric Olié" "Watto" "Sebulba"
## [40] "Quarsh Panaka" "Shmi Skywalker" "Darth Maul"
## [43] "Bib Fortuna" "Ayla Secura" "Dud Bolt"
## [46] "Gasgano" "Ben Quadinaros" "Mace Windu"
## [49] "Ki-Adi-Mundi" "Kit Fisto" "Eeth Koth"
## [52] "Adi Gallia" "Saesee Tiin" "Yarael Poof"
## [55] "Plo Koon" "Mas Amedda" "Gregar Typho"
## [58] "Cordé" "Cliegg Lars" "Poggle the Lesser"
## [61] "Luminara Unduli" "Barriss Offee" "Dormé"
## [64] "Dooku" "Bail Prestor Organa" "Jango Fett"
## [67] "Zam Wesell" "Dexter Jettster" "Lama Su"
## [70] "Taun We" "Jocasta Nu" "Ratts Tyerell"
## [73] "R4-P17" "Wat Tambor" "San Hill"
## [76] "Shaak Ti" "Grievous" "Tarfful"
## [79] "Raymus Antilles" "Sly Moore" "Tion Medon"
## [82] "Finn" "Rey" "Poe Dameron"
## [85] "BB8" "Captain Phasma" "Padmé Amidala"
exercise 10
sw_people %>% map_int(~ length(.x$films))
## [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3
exercise 11
sw_people %>%
map_chr(~.x$name) %>% # extract the names
set_names(sw_people, nm = .) %>% # set the list names as character names
map_df(~ length(.x$films)) %>% # extract number of films and make a data frame
pivot_longer( # tidy data frame
cols = everything(),
names_to = "Character",
values_to = "Films"
) %>%
ggplot(aes(Films, reorder(Character, Films))) + # plot results
geom_point()
