library(readxl)
library(repurrrsive)
library(tidyverse)
# Identify the path
path <- "C:/Users/norah/OneDrive/Desktop/BANA 4080/data_file/lab_6_data"
# Use the path to create a list
files <- list.files(path = path, pattern = "Month-\\d{2}\\.csv", full.names = TRUE)
# Combine the data into one data frame
df <- files %>%
map_dfr(~ read_csv(.))
## Rows: 54535 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 44380 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 53259 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 51033 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 55079 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 59666 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 64268 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 69492 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 71855 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 80277 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Rows: 94315 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Factor_C, Factor_E, Transaction_Status, Month
## dbl (5): Account_ID, Factor_A, Factor_B, Factor_D, Response
## dttm (1): Transaction_Timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Check the dimensions
dim(df)
## [1] 698159 10
glimpse(df)
## Rows: 698,159
## Columns: 10
## $ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
# Create list as character vectors
classes <- map_chr(df, ~ class(.x)[1])
# Print the values
print(classes)
## Account_ID Transaction_Timestamp Factor_A
## "numeric" "POSIXct" "numeric"
## Factor_B Factor_C Factor_D
## "numeric" "character" "numeric"
## Factor_E Response Transaction_Status
## "character" "numeric" "character"
## Month
## "character"
# Count unique values in each column
unique_values <- map_int(df, ~ n_distinct(.x))
# Print the values
print(unique_values)
## Account_ID Transaction_Timestamp Factor_A
## 475413 686538 7
## Factor_B Factor_C Factor_D
## 6 4 15
## Factor_E Response Transaction_Status
## 63 42 2
## Month
## 11
# Correct the values
df <- df %>%
mutate(Factor_D = if_else(Factor_D == 26, 25, Factor_D))
# Check number of unique values
unique_values_factor_d <- n_distinct(df$Factor_D)
# Print the result
print(unique_values_factor_d)
## [1] 14
# Count for each level
level_counts <- df %>%
count(Factor_D)
# Print the result
print(level_counts)
## # A tibble: 14 × 2
## Factor_D n
## <dbl> <int>
## 1 10 4595
## 2 15 1089
## 3 20 527882
## 4 21 68072
## 5 25 41021
## 6 30 7030
## 7 31 512
## 8 35 25298
## 9 40 2720
## 10 50 3709
## 11 55 15200
## 12 70 54
## 13 85 4
## 14 90 973
# Filter out rows that contain "NULL"
filtered_df <- df %>%
filter_at(vars(starts_with("Factor_")), all_vars(. != "NULL"))
# Check number of rows
filtered_df_count <- nrow(filtered_df)
# Print the result
print(filtered_df_count)
## [1] 489537
# Convert columns (except time stamp) to factors
df <- df %>%
mutate_at(vars(-Transaction_Timestamp), as.factor) %>%
mutate(Month = factor(Month, levels = c("January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"),
ordered = TRUE))
# Check is variables were converted
glimpse(df)
## Rows: 698,159
## Columns: 10
## $ Account_ID <fct> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <fct> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <fct> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <fct> VI, VI, VI, VI, VI, MC, VI, VI, VI, MC, AX, DI, …
## $ Factor_D <fct> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <fct> A, H, NULL, H, B, NULL, H, H, B, NULL, NULL, NUL…
## $ Response <fct> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <fct> Approved, Approved, Approved, Approved, Approved…
## $ Month <ord> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
# Check that "Month" is an ordered factor
levels(df$Month)
## [1] "January" "February" "March" "April" "May" "June"
## [7] "July" "August" "September" "October" "November" "December"
# Summarize number of unique values
df %>%
summarize_if(~ is.numeric(.) | is.factor(.), n_distinct)
## # A tibble: 1 × 9
## Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E Response
## <int> <int> <int> <int> <int> <int> <int>
## 1 475413 7 6 4 14 63 42
## # ℹ 2 more variables: Transaction_Status <int>, Month <int>
df %>%
group_by(Transaction_Status) %>%
summarize_if(~ is.numeric(.) | is.factor(.), n_distinct)
## # A tibble: 2 × 9
## Transaction_Status Account_ID Factor_A Factor_B Factor_C Factor_D Factor_E
## <fct> <int> <int> <int> <int> <int> <int>
## 1 Approved 462655 6 5 4 12 60
## 2 Declined 22013 7 6 4 14 58
## # ℹ 2 more variables: Response <int>, Month <int>
convert_to_qtr <- function(months) {
case_when(
months %in% c("Jan", "Feb", "Mar") ~ "Q1",
months %in% c("Apr", "May", "Jun") ~ "Q2",
months %in% c("Jul", "Aug", "Sep") ~ "Q3",
months %in% c("Oct", "Nov", "Dec") ~ "Q4",
TRUE ~ NA_character_
)
}
# Test the function
example_months <- c("Jan", "Mar", "May", "May", "Aug", "Nov", "Nov", "Dec")
convert_to_qtr(example_months)
## [1] "Q1" "Q1" "Q2" "Q2" "Q3" "Q4" "Q4" "Q4"
# Use with mutate
df <- df %>%
mutate(Qtr = convert_to_qtr(Month))
df %>%
group_by(Qtr) %>%
summarize(Observation_Count = n())
## # A tibble: 2 × 2
## Qtr Observation_Count
## <chr> <int>
## 1 Q2 55079
## 2 <NA> 643080
# Extract names
sw_people %>%
map_chr(~ .x[[1]])
## [1] "Luke Skywalker" "C-3PO" "R2-D2"
## [4] "Darth Vader" "Leia Organa" "Owen Lars"
## [7] "Beru Whitesun lars" "R5-D4" "Biggs Darklighter"
## [10] "Obi-Wan Kenobi" "Anakin Skywalker" "Wilhuff Tarkin"
## [13] "Chewbacca" "Han Solo" "Greedo"
## [16] "Jabba Desilijic Tiure" "Wedge Antilles" "Jek Tono Porkins"
## [19] "Yoda" "Palpatine" "Boba Fett"
## [22] "IG-88" "Bossk" "Lando Calrissian"
## [25] "Lobot" "Ackbar" "Mon Mothma"
## [28] "Arvel Crynyd" "Wicket Systri Warrick" "Nien Nunb"
## [31] "Qui-Gon Jinn" "Nute Gunray" "Finis Valorum"
## [34] "Jar Jar Binks" "Roos Tarpals" "Rugor Nass"
## [37] "Ric Olié" "Watto" "Sebulba"
## [40] "Quarsh Panaka" "Shmi Skywalker" "Darth Maul"
## [43] "Bib Fortuna" "Ayla Secura" "Dud Bolt"
## [46] "Gasgano" "Ben Quadinaros" "Mace Windu"
## [49] "Ki-Adi-Mundi" "Kit Fisto" "Eeth Koth"
## [52] "Adi Gallia" "Saesee Tiin" "Yarael Poof"
## [55] "Plo Koon" "Mas Amedda" "Gregar Typho"
## [58] "Cordé" "Cliegg Lars" "Poggle the Lesser"
## [61] "Luminara Unduli" "Barriss Offee" "Dormé"
## [64] "Dooku" "Bail Prestor Organa" "Jango Fett"
## [67] "Zam Wesell" "Dexter Jettster" "Lama Su"
## [70] "Taun We" "Jocasta Nu" "Ratts Tyerell"
## [73] "R4-P17" "Wat Tambor" "San Hill"
## [76] "Shaak Ti" "Grievous" "Tarfful"
## [79] "Raymus Antilles" "Sly Moore" "Tion Medon"
## [82] "Finn" "Rey" "Poe Dameron"
## [85] "BB8" "Captain Phasma" "Padmé Amidala"
first <- sw_people[[1]]
print(first)
## $name
## [1] "Luke Skywalker"
##
## $height
## [1] "172"
##
## $mass
## [1] "77"
##
## $hair_color
## [1] "blond"
##
## $skin_color
## [1] "fair"
##
## $eye_color
## [1] "blue"
##
## $birth_year
## [1] "19BBY"
##
## $gender
## [1] "male"
##
## $homeworld
## [1] "http://swapi.co/api/planets/1/"
##
## $films
## [1] "http://swapi.co/api/films/6/" "http://swapi.co/api/films/3/"
## [3] "http://swapi.co/api/films/2/" "http://swapi.co/api/films/1/"
## [5] "http://swapi.co/api/films/7/"
##
## $species
## [1] "http://swapi.co/api/species/1/"
##
## $vehicles
## [1] "http://swapi.co/api/vehicles/14/" "http://swapi.co/api/vehicles/30/"
##
## $starships
## [1] "http://swapi.co/api/starships/12/" "http://swapi.co/api/starships/22/"
##
## $created
## [1] "2014-12-09T13:50:51.644000Z"
##
## $edited
## [1] "2014-12-20T21:17:56.891000Z"
##
## $url
## [1] "http://swapi.co/api/people/1/"
film_counts<- sw_people %>%
map_df(~ tibble(
Character = .x[[1]],
Films = length(.x$films)
))
sw_people %>%
map_chr(~ .x[[1]]) %>% # extract the names
set_names(sw_people, nm = .) %>% # set the list names as character names
map_df(~ tibble( # extract number of films and make a data frame
Character = .x[[1]], # Extract character names
Films = length(.x$films) # Count the number of films
))
## # A tibble: 87 × 2
## Character Films
## <chr> <int>
## 1 Luke Skywalker 5
## 2 C-3PO 6
## 3 R2-D2 7
## 4 Darth Vader 4
## 5 Leia Organa 5
## 6 Owen Lars 3
## 7 Beru Whitesun lars 3
## 8 R5-D4 1
## 9 Biggs Darklighter 1
## 10 Obi-Wan Kenobi 6
## # ℹ 77 more rows
film_counts <- film_counts %>%
mutate(Group = cut_number(1:n(), n = 4, labels = paste("Group", 1:4)))
ggplot(film_counts, aes(x = reorder(Character, Films), y = Films)) +
geom_point(color = "purple", size = 2) +
labs(title = "Number of Films Star Wars Characters Appeared In",
x = "Character",
y = "Number of Films") +
scale_x_discrete(expand = expansion(add = .8)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, ),
plot.title = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 14),
axis.text = element_text(size = 12)) +
facet_wrap(~ Group, scales = "free_x")
