module 6 lab

exercise 1

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

install.packages("repurrrsive", repos="https://cran.r-project.org/")

## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'repurrrsive' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmp46Buhl\downloaded_packages

library(repurrrsive)
install.packages("here", repos="https://cran.r-project.org/")

## Installing package into 'C:/Users/HP/AppData/Local/R/win-library/4.2'
## (as 'lib' is unspecified)

## package 'here' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\HP\AppData\Local\Temp\Rtmp46Buhl\downloaded_packages

library(purrr)
library(lubridate)

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(here)

## here() starts at C:/Users/HP/Desktop/BANA7025-WRANGLING/week6/assignment

library(dplyr)

monthly_data_files <- here("data/data")
list.files(monthly_data_files)

##  [1] "Month-01.csv" "Month-02.csv" "Month-03.csv" "Month-04.csv" "Month-05.csv"
##  [6] "Month-06.csv" "Month-07.csv" "Month-08.csv" "Month-09.csv" "Month-10.csv"
## [11] "Month-11.csv"

for(data_file in list.files(monthly_data_files)) {
  
  df <- readr::read_csv(here(monthly_data_files, data_file))
  
  new_name <- stringr :: str_sub(data_file, end = -5)
  
  assign(new_name, df)
  
}

## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

df <- rbind(`Month-01`,`Month-02`,`Month-03`,`Month-04`, `Month-05`, `Month-06`,`Month-07`, `Month-08`, `Month-09`, `Month-10`, `Month-11`)

glimpse(df)

## Rows: 698,159
## Columns: 10
## $ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…

exercise 2

lapply(df, class)

## $Account_ID
## [1] "numeric"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "numeric"
## 
## $Factor_B
## [1] "numeric"
## 
## $Factor_C
## [1] "character"
## 
## $Factor_D
## [1] "numeric"
## 
## $Factor_E
## [1] "character"
## 
## $Response
## [1] "numeric"
## 
## $Transaction_Status
## [1] "character"
## 
## $Month
## [1] "character"

exercise 3

lapply(df, n_distinct)

## $Account_ID
## [1] 475413
## 
## $Transaction_Timestamp
## [1] 686538
## 
## $Factor_A
## [1] 7
## 
## $Factor_B
## [1] 6
## 
## $Factor_C
## [1] 4
## 
## $Factor_D
## [1] 15
## 
## $Factor_E
## [1] 63
## 
## $Response
## [1] 42
## 
## $Transaction_Status
## [1] 2
## 
## $Month
## [1] 11

exercise 4

df %>%
  mutate(Factor_D, value = ifelse(Factor_D == 26, 25, Factor_D)) %>%
  count(value)

## # A tibble: 14 × 2
##    value      n
##    <dbl>  <int>
##  1    10   4595
##  2    15   1089
##  3    20 527882
##  4    21  68072
##  5    25  41021
##  6    30   7030
##  7    31    512
##  8    35  25298
##  9    40   2720
## 10    50   3709
## 11    55  15200
## 12    70     54
## 13    85      4
## 14    90    973

exercise 5

df %>%
  filter_at(vars(3:7), any_vars(grepl("NULL", . , ignore.case = TRUE)))

## # A tibble: 208,622 × 10
##    Account…¹ Transaction_Times…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸
##        <dbl> <dttm>                <dbl>   <dbl> <chr>     <dbl> <chr>     <dbl>
##  1        28 2009-01-19 13:24:55       2       6 VI           21 NULL       1020
##  2        64 2009-01-01 18:53:02       7       6 MC           20 NULL       1020
##  3        79 2009-01-07 19:41:18       7       6 MC           20 NULL       1020
##  4        88 2009-01-26 15:21:28       8      18 AX           20 NULL       1020
##  5        90 2009-01-21 22:38:08      10       6 DI           20 NULL       1020
##  6        90 2009-01-29 14:26:19      10       6 DI           20 NULL       1020
##  7       111 2009-01-16 11:46:05       7       6 MC           20 NULL       1020
##  8       111 2009-01-28 16:04:45       7       6 MC           20 NULL       1320
##  9       111 2009-01-28 17:51:52       7       6 MC           20 NULL       1320
## 10       111 2009-01-28 17:53:18       7       6 MC           20 NULL       1320
## # … with 208,612 more rows, 2 more variables: Transaction_Status <chr>,
## #   Month <chr>, and abbreviated variable names ¹Account_ID,
## #   ²Transaction_Timestamp, ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D,
## #   ⁷Factor_E, ⁸Response

exercise 6

df <- df %>%
  mutate_at(-2, as.factor) %>%
  mutate(Month = factor(Month), levels = 1)

glimpse(df)

## Rows: 698,159
## Columns: 11
## $ Account_ID            <fct> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <fct> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <fct> VI, VI, VI, VI, VI, MC, VI, VI, VI, MC, AX, DI, …
## $ Factor_D              <fct> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <fct> A, H, NULL, H, B, NULL, H, H, B, NULL, NULL, NUL…
## $ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month                 <fct> Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan, Jan…
## $ levels                <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …

levels(df$Month)

##  [1] "Apr" "Aug" "Feb" "Jan" "Jul" "Jun" "Mar" "May" "Nov" "Oct" "Sep"

exercise 7

df %>%
  summarise_if(is.factor, n_distinct )

## # A tibble: 1 × 9
##   Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response Trans…¹ Month
##        <int>    <int>    <int>    <int>    <int>    <int>    <int>   <int> <int>
## 1     475413        7        6        4       15       63       42       2    11
## # … with abbreviated variable name ¹Transaction_Status

df %>%
  group_by(Transaction_Status)%>%
  summarise_if(is.factor, n_distinct )

## # A tibble: 2 × 9
##   Transaction_St…¹ Accou…² Facto…³ Facto…⁴ Facto…⁵ Facto…⁶ Facto…⁷ Respo…⁸ Month
##   <fct>              <int>   <int>   <int>   <int>   <int>   <int>   <int> <int>
## 1 Approved          462655       6       5       4      13      60       2    11
## 2 Declined           22013       7       6       4      15      58      40    11
## # … with abbreviated variable names ¹Transaction_Status, ²Account_ID,
## #   ³Factor_A, ⁴Factor_B, ⁵Factor_C, ⁶Factor_D, ⁷Factor_E, ⁸Response

exercise 8

convert_to_qtr <- function(Month) {
  case_when(
    'Jan' == Month~ "Q1",
    "Feb" == Month ~ "Q1" ,
     "Mar" == Month  ~ "Q1",
    "Apr" == Month ~ "Q2",
    "May" ==Month  ~ "Q2",
    "Jun" ==Month  ~ "Q2",
    "Jul" ==Month  ~ "Q3",
    "Aug" ==Month  ~ "Q3",
    "Sep" ==Month  ~ "Q3",
    "Oct" ==Month  ~ "Q4",
    "Nov" ==Month  ~ "Q4",
    "Dec" ==Month  ~ "Q4"
    
  )
}

df <- df %>%
  mutate(QTR = convert_to_qtr(Month))

exercise 9

sw_people%>%
  map_chr(~.x$name)

##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"

exercise 10

sw_people %>% map_int(~ length(.x$films))

##  [1] 5 6 7 4 5 3 3 1 1 6 3 2 5 4 1 3 3 1 5 5 3 1 1 2 1 2 1 1 1 1 1 3 1 2 1 1 1 2
## [39] 1 1 2 1 1 3 1 1 1 3 3 3 2 2 2 1 3 2 1 1 1 2 2 1 1 2 2 1 1 1 1 1 1 1 2 1 1 2
## [77] 1 1 2 2 1 1 1 1 1 1 3

exercise 11

sw_people %>%
  map_chr(~.x$name) %>% # extract the names
  set_names(sw_people, nm = .) %>% # set the list names as character names
  map_df(~ length(.x$films)) %>% # extract number of films and make a data frame
  pivot_longer( # tidy data frame
    cols = everything(),
    names_to = "Character",
    values_to = "Films"
  ) %>%
  ggplot(aes(Films, reorder(Character, Films))) + # plot results
  geom_point()

module 6 lab

Shilpa

2022-10-08

exercise 1

exercise 2

exercise 3

exercise 4

exercise 5

exercise 6

exercise 7

exercise 8

exercise 9

exercise 10

exercise 11