Lab 6

library(readxl)
library(repurrrsive)
library(tidyverse)

# Identify the path
path <- "C:/Users/norah/OneDrive/Desktop/BANA 4080/data_file/lab_6_data"

# Use the path to create a list
files <- list.files(path = path, pattern = "Month-\\d{2}\\.csv", full.names = TRUE)

# Combine the data into one data frame
df <- files %>%
  map_dfr(~ read_csv(.))

## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl  (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Check the dimensions
dim(df)

## [1] 698159     10

glimpse(df)

## Rows: 698,159
## Columns: 10
## $ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…

# Create list as character vectors
classes <- map_chr(df, ~ class(.x)[1])

# Print the values
print(classes)

##            Account_ID Transaction_Timestamp              Factor_A 
##             "numeric"             "POSIXct"             "numeric" 
##              Factor_B              Factor_C              Factor_D 
##             "numeric"           "character"             "numeric" 
##              Factor_E              Response    Transaction_Status 
##           "character"             "numeric"           "character" 
##                 Month 
##           "character"

# Count unique values in each column
unique_values <- map_int(df, ~ n_distinct(.x))

# Print the values
print(unique_values)

##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    15 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11

# Correct the values
df <- df %>%
  mutate(Factor_D = if_else(Factor_D == 26, 25, Factor_D))

# Check number of unique values
unique_values_factor_d <- n_distinct(df$Factor_D)

# Print the result
print(unique_values_factor_d)

## [1] 14

# Count for each level 
level_counts <- df %>% 
  count(Factor_D)

# Print the result 
print(level_counts)

## # A tibble: 14 × 2
##    Factor_D      n
##       <dbl>  <int>
##  1       10   4595
##  2       15   1089
##  3       20 527882
##  4       21  68072
##  5       25  41021
##  6       30   7030
##  7       31    512
##  8       35  25298
##  9       40   2720
## 10       50   3709
## 11       55  15200
## 12       70     54
## 13       85      4
## 14       90    973

# Filter out rows that contain "NULL" 
filtered_df <- df %>%
  filter_at(vars(starts_with("Factor_")), all_vars(. != "NULL"))

# Check number of rows 
filtered_df_count <- nrow(filtered_df)

# Print the result
print(filtered_df_count)

## [1] 489537

# Convert columns (except time stamp) to factors 
df <- df %>% 
  mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
  mutate(Month = factor(Month, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"),
                        ordered = TRUE))

# Check is variables were converted
glimpse(df)

## Rows: 698,159
## Columns: 10
## $ Account_ID            <fct> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <fct> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <fct> VI, VI, VI, VI, VI, MC, VI, VI, VI, MC, AX, DI, …
## $ Factor_D              <fct> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <fct> A, H, NULL, H, B, NULL, H, H, B, NULL, NULL, NUL…
## $ Response              <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month                 <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …

# Check that "Month" is an ordered factor
levels(df$Month)

##  [1] "January"   "February"  "March"     "April"     "May"       "June"     
##  [7] "July"      "August"    "September" "October"   "November"  "December"

# Summarize number of unique values
df %>%
  summarize_if(~ is.numeric(.) | is.factor(.), n_distinct)

## # A tibble: 1 × 9
##   Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
##        <int>    <int>    <int>    <int>    <int>    <int>    <int>
## 1     475413        7        6        4       14       63       42
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>

df %>%
  group_by(Transaction_Status) %>% 
  summarize_if(~ is.numeric(.) | is.factor(.), n_distinct)

## # A tibble: 2 × 9
##   Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
##   <fct>                   <int>    <int>    <int>    <int>    <int>    <int>
## 1 Approved               462655        6        5        4       12       60
## 2 Declined                22013        7        6        4       14       58
## # ℹ 2 more variables: Response <int>, Month <int>

convert_to_qtr <- function(months) {
  case_when(
    months %in% c("Jan", "Feb", "Mar") ~ "Q1",
    months %in% c("Apr", "May", "Jun") ~ "Q2",
    months %in% c("Jul", "Aug", "Sep") ~ "Q3",
    months %in% c("Oct", "Nov", "Dec") ~ "Q4",
    TRUE ~ NA_character_
  )
}

# Test the function 
example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)

## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"

# Use with mutate 
df <- df %>% 
  mutate(Qtr = convert_to_qtr(Month))

df %>%
  group_by(Qtr) %>% 
  summarize(Observation_Count = n())

## # A tibble: 2 × 2
##   Qtr   Observation_Count
##   <chr>             <int>
## 1 Q2                55079
## 2 <NA>             643080

# Extract names 
sw_people %>% 
  map_chr(~ .x[[1]])

##  [1] "Luke Skywalker"        "C-3PO"                 "R2-D2"                
##  [4] "Darth Vader"           "Leia Organa"           "Owen Lars"            
##  [7] "Beru Whitesun lars"    "R5-D4"                 "Biggs Darklighter"    
## [10] "Obi-Wan Kenobi"        "Anakin Skywalker"      "Wilhuff Tarkin"       
## [13] "Chewbacca"             "Han Solo"              "Greedo"               
## [16] "Jabba Desilijic Tiure" "Wedge Antilles"        "Jek Tono Porkins"     
## [19] "Yoda"                  "Palpatine"             "Boba Fett"            
## [22] "IG-88"                 "Bossk"                 "Lando Calrissian"     
## [25] "Lobot"                 "Ackbar"                "Mon Mothma"           
## [28] "Arvel Crynyd"          "Wicket Systri Warrick" "Nien Nunb"            
## [31] "Qui-Gon Jinn"          "Nute Gunray"           "Finis Valorum"        
## [34] "Jar Jar Binks"         "Roos Tarpals"          "Rugor Nass"           
## [37] "Ric Olié"              "Watto"                 "Sebulba"              
## [40] "Quarsh Panaka"         "Shmi Skywalker"        "Darth Maul"           
## [43] "Bib Fortuna"           "Ayla Secura"           "Dud Bolt"             
## [46] "Gasgano"               "Ben Quadinaros"        "Mace Windu"           
## [49] "Ki-Adi-Mundi"          "Kit Fisto"             "Eeth Koth"            
## [52] "Adi Gallia"            "Saesee Tiin"           "Yarael Poof"          
## [55] "Plo Koon"              "Mas Amedda"            "Gregar Typho"         
## [58] "Cordé"                 "Cliegg Lars"           "Poggle the Lesser"    
## [61] "Luminara Unduli"       "Barriss Offee"         "Dormé"                
## [64] "Dooku"                 "Bail Prestor Organa"   "Jango Fett"           
## [67] "Zam Wesell"            "Dexter Jettster"       "Lama Su"              
## [70] "Taun We"               "Jocasta Nu"            "Ratts Tyerell"        
## [73] "R4-P17"                "Wat Tambor"            "San Hill"             
## [76] "Shaak Ti"              "Grievous"              "Tarfful"              
## [79] "Raymus Antilles"       "Sly Moore"             "Tion Medon"           
## [82] "Finn"                  "Rey"                   "Poe Dameron"          
## [85] "BB8"                   "Captain Phasma"        "Padmé Amidala"

first <- sw_people[[1]]
print(first)

## $name
## [1] "Luke Skywalker"
## 
## $height
## [1] "172"
## 
## $mass
## [1] "77"
## 
## $hair_color
## [1] "blond"
## 
## $skin_color
## [1] "fair"
## 
## $eye_color
## [1] "blue"
## 
## $birth_year
## [1] "19BBY"
## 
## $gender
## [1] "male"
## 
## $homeworld
## [1] "http://swapi.co/api/planets/1/"
## 
## $films
## [1] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/3/"
## [3] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/1/"
## [5] "http://swapi.co/api/films/7/"
## 
## $species
## [1] "http://swapi.co/api/species/1/"
## 
## $vehicles
## [1] "http://swapi.co/api/vehicles/14/" "http://swapi.co/api/vehicles/30/"
## 
## $starships
## [1] "http://swapi.co/api/starships/12/" "http://swapi.co/api/starships/22/"
## 
## $created
## [1] "2014-12-09T13:50:51.644000Z"
## 
## $edited
## [1] "2014-12-20T21:17:56.891000Z"
## 
## $url
## [1] "http://swapi.co/api/people/1/"

film_counts<- sw_people %>% 
  map_df(~ tibble(
    Character = .x[[1]], 
    Films = length(.x$films)
  ))

sw_people %>%
  map_chr(~ .x[[1]]) %>%                  # extract the names
  set_names(sw_people, nm = .) %>%                   # set the list names as character names
  map_df(~ tibble(                        # extract number of films and make a data frame
    Character = .x[[1]],                  # Extract character names
    Films = length(.x$films)              # Count the number of films
  ))

## # A tibble: 87 × 2
##    Character          Films
##    <chr>              <int>
##  1 Luke Skywalker         5
##  2 C-3PO                  6
##  3 R2-D2                  7
##  4 Darth Vader            4
##  5 Leia Organa            5
##  6 Owen Lars              3
##  7 Beru Whitesun lars     3
##  8 R5-D4                  1
##  9 Biggs Darklighter      1
## 10 Obi-Wan Kenobi         6
## # ℹ 77 more rows

film_counts <- film_counts %>%
  mutate(Group = cut_number(1:n(), n = 4, labels = paste("Group", 1:4)))


ggplot(film_counts, aes(x = reorder(Character, Films), y = Films)) + 
  geom_point(color = "purple", size = 2) +                            
  labs(title = "Number of Films Star Wars Characters Appeared In", 
       x = "Character", 
       y = "Number of Films") + 
  scale_x_discrete(expand = expansion(add = .8)) +
  theme_minimal() +                     
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, ), 
        plot.title = element_text(size = 16, face = "bold"),   
        axis.title = element_text(size = 14),                       
        axis.text = element_text(size = 12)) + 
  facet_wrap(~ Group, scales = "free_x")

Lab 6

Norah French

2024-10-06