glimpse (bank_data)
## Rows: 10,127
## Columns: 20
## $ Attrition_Flag <chr> "Existing Customer", "Existing Customer", "Ex…
## $ Customer_Age <dbl> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6…
## $ Gender <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", …
## $ Dependent_count <dbl> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, …
## $ Education_Level <chr> "High School", "Graduate", "Graduate", "High …
## $ Marital_Status <chr> "Married", "Single", "Married", "Unknown", "M…
## $ Income_Category <chr> "$60K - $80K", "Less than $40K", "$80K - $120…
## $ Card_Category <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue…
## $ Months_on_book <dbl> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5…
## $ Total_Relationship_Count <dbl> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, …
## $ Months_Inactive_12_mon <dbl> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, …
## $ Contacts_Count_12_mon <dbl> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, …
## $ Credit_Limit <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010…
## $ Total_Revolving_Bal <dbl> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,…
## $ Avg_Open_To_Buy <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.…
## $ Total_Amt_Chng_Q4_Q1 <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9…
## $ Total_Trans_Amt <dbl> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538…
## $ Total_Trans_Ct <dbl> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2…
## $ Total_Ct_Chng_Q4_Q1 <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7…
## $ Avg_Utilization_Ratio <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0…
Answer: I think Gender, Marital_Status can absolutely be factor
bank_data %>%
mutate(Marital_Status = as.factor(Marital_Status)) -> bank_data
levels(bank_data$Marital_Status)
## [1] "Divorced" "Married" "Single" "Unknown"
bank_data %>%
mutate(Gender = as.factor(Gender)) -> bank_data
levels(bank_data$Gender)
## [1] "F" "M"
Answer: I think Income_Category and Education_Level can absolutely be ordinal factor
bank_data %>%
mutate(Education_Level = as.factor(Education_Level)) -> bank_data
levels(bank_data$Education_Level)
## [1] "College" "Doctorate" "Graduate" "High School"
## [5] "Post-Graduate" "Uneducated" "Unknown"
bank_data$Education_Level <- fct_relevel(bank_data$Education_Level, "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate", "Unknown")
bank_data %>%
mutate(Income_Category = as.factor(Income_Category)) -> bank_data
levels(bank_data$Income_Category)
## [1] "$120K +" "$40K - $60K" "$60K - $80K" "$80K - $120K"
## [5] "Less than $40K" "Unknown"
bank_data$Income_Category <- fct_relevel(bank_data$Income_Category, "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +", "Unknown")
bank_data %>%
group_by(Education_Level) %>%
summarise(avg_util = mean(Avg_Utilization_Ratio, na.rm = TRUE)) %>%
ggplot(aes(x = Education_Level, y = avg_util, fill = Education_Level)) +
geom_col() +
labs(title = "Avg Utilization Ratio by Education Level",
x = "Education Level", y = "Avg Utilization Ratio")
levels(gss_cat$marital)
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
gss_cat %>%
mutate(marital = fct_recode(marital,
"Never married"= "Never married",
"No answer" = "No answer",
"Married" = "Married" ,
"Once Married" = "Separated",
"Once Married" = "Divorced",
"Once Married" = "Widowed"
)) %>%
count(marital)
## # A tibble: 4 × 2
## marital n
## <fct> <int>
## 1 No answer 17
## 2 Never married 5416
## 3 Once Married 5933
## 4 Married 10117
gss_cat %>%
group_by(marital) %>%
summarise(mtvhours = mean(tvhours, na.rm = TRUE)) %>%
ggplot(aes(x = marital, y = mtvhours, fill = marital)) +
geom_col() +
labs(title = "Effect of Marital Status on TV Hours",
x = "Marital Status", y = "Avg TV Hours per Day")
library(nycflights13)
library(dplyr)
flights %>%
inner_join(airports, by = c("dest" = "faa")) %>%
filter(!is.na(tz)) %>%
distinct(tzone) %>%
pull(tzone) %>%
intersect(OlsonNames())
## [1] "America/Chicago" "America/New_York" "America/Los_Angeles"
## [4] "America/Phoenix" "America/Denver" "Pacific/Honolulu"
## [7] "America/Anchorage"
Chicago Dallas Denver Seattle Anchorage Honolulu
location<- c(
"ORD", # Chicago
"DFW", # Dallas
"DEN", # Denver
"SEA", # Seattle
"ANC", # Anchorage
"HNL" # Honolulu
)
airports %>%
filter(faa %in% location) %>%
distinct(faa, name, tzone) %>%
mutate(
nyc_offset = as.numeric(format(now(tzone = "America/New_York"), "%z")) / 100,
local_offset = map_dbl(tzone, ~ as.numeric(format(now(tzone = .x), "%z")) / 100),
diff_from_nyc_hrs = local_offset - nyc_offset
) %>%
dplyr::select(diff_from_nyc_hrs, name, faa, tzone ) %>%
arrange(diff_from_nyc_hrs)
## # A tibble: 6 × 4
## diff_from_nyc_hrs name faa tzone
## <dbl> <chr> <chr> <chr>
## 1 -6 Honolulu Intl HNL Pacific/Honolulu
## 2 -4 Ted Stevens Anchorage Intl ANC America/Anchorage
## 3 -3 Seattle Tacoma Intl SEA America/Los_Angeles
## 4 -2 Denver Intl DEN America/Denver
## 5 -1 Dallas Fort Worth Intl DFW America/Chicago
## 6 -1 Chicago Ohare Intl ORD America/Chicago
glimpse(airports)
## Rows: 1,458
## Columns: 8
## $ faa <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2", "…
## $ name <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaumbur…
## $ lat <dbl> 41.13047, 32.46057, 41.98934, 41.43191, 31.07447, 36.37122, 41.4…
## $ lon <dbl> -80.61958, -85.68003, -88.10124, -74.39156, -81.42778, -82.17342…
## $ alt <dbl> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 409, 875, 10…
## $ tz <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -5, -5, -5, …
## $ dst <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A", "U", "A",…
## $ tzone <chr> "America/New_York", "America/Chicago", "America/Chicago", "Ameri…
Time_difference_NYC <- function(dest) {
airport_info <- airports %>%
filter(faa == dest)
if (nrow(airport_info) == 0) {
stop(paste("Airport code", dest, "not found in the airports dataset."))
}
if (is.na(airport_info$tzone)) {
stop(paste("Timezone not available for airport", dest))
}
nyc_offset <- as.numeric(format(now(tzone = "America/New_York"), "%z")) / 100
local_offset <- as.numeric(format(now(tzone = airport_info$tzone), "%z")) / 100
diff <- local_offset - nyc_offset
cat("Airport :", airport_info$name, "(", dest, ")\n")
cat("Timezone :", airport_info$tzone, "\n")
cat("Diff NYC :", diff, "hour(s)\n")
}
flight_time <- function(dep_time, arr_time, origin, dest) {
origin_tz <- airports %>% filter(faa == origin) %>% pull(tzone)
dest_tz <- airports %>% filter(faa == dest) %>% pull(tzone)
dep_hour <- dep_time %/% 100
dep_min <- dep_time %% 100
arr_hour <- arr_time %/% 100
arr_min <- arr_time %% 100
dep_dt <- make_datetime(2013, 1, 1, dep_hour, dep_min) %>%
force_tz(origin_tz)
arr_dt <- make_datetime(2013, 1, 1, arr_hour, arr_min) %>%
force_tz(dest_tz)
du <- as.numeric(as.duration(arr_dt - dep_dt)) / 60
cat("Origin :", origin, "(", origin_tz, ")\n")
cat("Destination :", dest, "(", dest_tz, ")\n")
cat("Departure :", dep_hour, ":", dep_min, "\n")
cat("Arrival :", arr_hour, ":", arr_min, "\n")
cat("Duration :", du, "mins\n")
}