2. Factors

glimpse (bank_data)
## Rows: 10,127
## Columns: 20
## $ Attrition_Flag           <chr> "Existing Customer", "Existing Customer", "Ex…
## $ Customer_Age             <dbl> 45, 49, 51, 40, 40, 44, 51, 32, 37, 48, 42, 6…
## $ Gender                   <chr> "M", "F", "M", "F", "M", "M", "M", "M", "M", …
## $ Dependent_count          <dbl> 3, 5, 3, 4, 3, 2, 4, 0, 3, 2, 5, 1, 1, 3, 2, …
## $ Education_Level          <chr> "High School", "Graduate", "Graduate", "High …
## $ Marital_Status           <chr> "Married", "Single", "Married", "Unknown", "M…
## $ Income_Category          <chr> "$60K - $80K", "Less than $40K", "$80K - $120…
## $ Card_Category            <chr> "Blue", "Blue", "Blue", "Blue", "Blue", "Blue…
## $ Months_on_book           <dbl> 39, 44, 36, 34, 21, 36, 46, 27, 36, 36, 31, 5…
## $ Total_Relationship_Count <dbl> 5, 6, 4, 3, 5, 3, 6, 2, 5, 6, 5, 6, 3, 5, 5, …
## $ Months_Inactive_12_mon   <dbl> 1, 1, 1, 4, 1, 1, 1, 2, 2, 3, 3, 2, 6, 1, 2, …
## $ Contacts_Count_12_mon    <dbl> 3, 2, 0, 1, 0, 2, 3, 2, 0, 3, 2, 3, 0, 3, 2, …
## $ Credit_Limit             <dbl> 12691.0, 8256.0, 3418.0, 3313.0, 4716.0, 4010…
## $ Total_Revolving_Bal      <dbl> 777, 864, 0, 2517, 0, 1247, 2264, 1396, 2517,…
## $ Avg_Open_To_Buy          <dbl> 11914.0, 7392.0, 3418.0, 796.0, 4716.0, 2763.…
## $ Total_Amt_Chng_Q4_Q1     <dbl> 1.335, 1.541, 2.594, 1.405, 2.175, 1.376, 1.9…
## $ Total_Trans_Amt          <dbl> 1144, 1291, 1887, 1171, 816, 1088, 1330, 1538…
## $ Total_Trans_Ct           <dbl> 42, 33, 20, 20, 28, 24, 31, 36, 24, 32, 42, 2…
## $ Total_Ct_Chng_Q4_Q1      <dbl> 1.625, 3.714, 2.333, 2.333, 2.500, 0.846, 0.7…
## $ Avg_Utilization_Ratio    <dbl> 0.061, 0.105, 0.000, 0.760, 0.000, 0.311, 0.0…

Which features can be regarded as a factor?

Answer: I think Gender, Marital_Status can absolutely be factor

bank_data %>%
  mutate(Marital_Status = as.factor(Marital_Status)) -> bank_data
levels(bank_data$Marital_Status)
## [1] "Divorced" "Married"  "Single"   "Unknown"
bank_data %>%
  mutate(Gender = as.factor(Gender)) -> bank_data
levels(bank_data$Gender)
## [1] "F" "M"

Which features can be regarded as an ordered factor (ordinal)?

Answer: I think Income_Category and Education_Level can absolutely be ordinal factor

bank_data %>%
  mutate(Education_Level = as.factor(Education_Level)) -> bank_data
levels(bank_data$Education_Level)
## [1] "College"       "Doctorate"     "Graduate"      "High School"  
## [5] "Post-Graduate" "Uneducated"    "Unknown"
bank_data$Education_Level <- fct_relevel(bank_data$Education_Level, "Uneducated", "High School", "College", "Graduate", "Post-Graduate", "Doctorate", "Unknown")

bank_data %>%
  mutate(Income_Category = as.factor(Income_Category)) -> bank_data
levels(bank_data$Income_Category)
## [1] "$120K +"        "$40K - $60K"    "$60K - $80K"    "$80K - $120K"  
## [5] "Less than $40K" "Unknown"
bank_data$Income_Category <- fct_relevel(bank_data$Income_Category, "Less than $40K", "$40K - $60K", "$60K - $80K", "$80K - $120K", "$120K +", "Unknown")

Visualize the effect of education level on averag utiliation ratio

bank_data %>%
  group_by(Education_Level) %>%
  summarise(avg_util = mean(Avg_Utilization_Ratio, na.rm = TRUE)) %>%
  ggplot(aes(x = Education_Level, y = avg_util, fill = Education_Level)) +
  geom_col() +
  labs(title = "Avg Utilization Ratio by Education Level",
       x = "Education Level", y = "Avg Utilization Ratio") 

What are the levels of marital variable?

levels(gss_cat$marital)
## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

Combine “Separated”, “Divorced”, “Widowed” into a new category “Once Married”

gss_cat %>%
  mutate(marital = fct_recode(marital,
    "Never married"= "Never married",
    "No answer" =  "No answer",
    "Married" = "Married" ,
    "Once Married" = "Separated",
    "Once Married" = "Divorced",
    "Once Married" = "Widowed"
  )) %>%
  count(marital)
## # A tibble: 4 × 2
##   marital           n
##   <fct>         <int>
## 1 No answer        17
## 2 Never married  5416
## 3 Once Married   5933
## 4 Married       10117

Use the new levels, explore whether there is an effect of martial status on tvhours.

gss_cat %>%
  group_by(marital) %>%
  summarise(mtvhours = mean(tvhours, na.rm = TRUE)) %>%
  ggplot(aes(x = marital, y = mtvhours, fill = marital)) +
  geom_col() +
  labs(title = "Effect of Marital Status on TV Hours",
       x = "Marital Status", y = "Avg TV Hours per Day") 

Date and Time

How many timezones are there for all the destination airports (excluding NA)?

library(nycflights13)
library(dplyr)

flights %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  filter(!is.na(tz)) %>%
  distinct(tzone) %>%
  pull(tzone) %>%
  intersect(OlsonNames())
## [1] "America/Chicago"     "America/New_York"    "America/Los_Angeles"
## [4] "America/Phoenix"     "America/Denver"      "Pacific/Honolulu"   
## [7] "America/Anchorage"

Use the data in flights and airports only along with R code to show the time difference (in hours) between New York City and the following cities:

Chicago Dallas Denver Seattle Anchorage Honolulu

location<- c(
  "ORD",  # Chicago
  "DFW",  # Dallas
  "DEN",  # Denver
  "SEA",  # Seattle
  "ANC",  # Anchorage
  "HNL"   # Honolulu
)

airports %>%
  filter(faa %in% location) %>%
  distinct(faa, name, tzone) %>%
  mutate(
    nyc_offset   = as.numeric(format(now(tzone = "America/New_York"), "%z")) / 100,
    local_offset = map_dbl(tzone, ~ as.numeric(format(now(tzone = .x), "%z")) / 100),
    diff_from_nyc_hrs = local_offset - nyc_offset
  ) %>%
  dplyr::select(diff_from_nyc_hrs, name, faa, tzone ) %>%
  arrange(diff_from_nyc_hrs)
## # A tibble: 6 × 4
##   diff_from_nyc_hrs name                       faa   tzone              
##               <dbl> <chr>                      <chr> <chr>              
## 1                -6 Honolulu Intl              HNL   Pacific/Honolulu   
## 2                -4 Ted Stevens Anchorage Intl ANC   America/Anchorage  
## 3                -3 Seattle Tacoma Intl        SEA   America/Los_Angeles
## 4                -2 Denver Intl                DEN   America/Denver     
## 5                -1 Dallas Fort Worth Intl     DFW   America/Chicago    
## 6                -1 Chicago Ohare Intl         ORD   America/Chicago

Write a function Time_difference_NYC(dest) to compute the time difference (in hours) between any destination airport (in faa code) and New York City.

glimpse(airports)
## Rows: 1,458
## Columns: 8
## $ faa   <chr> "04G", "06A", "06C", "06N", "09J", "0A9", "0G6", "0G7", "0P2", "…
## $ name  <chr> "Lansdowne Airport", "Moton Field Municipal Airport", "Schaumbur…
## $ lat   <dbl> 41.13047, 32.46057, 41.98934, 41.43191, 31.07447, 36.37122, 41.4…
## $ lon   <dbl> -80.61958, -85.68003, -88.10124, -74.39156, -81.42778, -82.17342…
## $ alt   <dbl> 1044, 264, 801, 523, 11, 1593, 730, 492, 1000, 108, 409, 875, 10…
## $ tz    <dbl> -5, -6, -6, -5, -5, -5, -5, -5, -5, -8, -5, -6, -5, -5, -5, -5, …
## $ dst   <chr> "A", "A", "A", "A", "A", "A", "A", "A", "U", "A", "A", "U", "A",…
## $ tzone <chr> "America/New_York", "America/Chicago", "America/Chicago", "Ameri…
Time_difference_NYC <- function(dest) {
  
  airport_info <- airports %>%
    filter(faa == dest)
  
  if (nrow(airport_info) == 0) {
    stop(paste("Airport code", dest, "not found in the airports dataset."))
  }
  
  if (is.na(airport_info$tzone)) {
    stop(paste("Timezone not available for airport", dest))
  }
  
  nyc_offset   <- as.numeric(format(now(tzone = "America/New_York"), "%z")) / 100
  local_offset <- as.numeric(format(now(tzone = airport_info$tzone), "%z")) / 100
  diff         <- local_offset - nyc_offset
  
  cat("Airport  :", airport_info$name, "(", dest, ")\n")
  cat("Timezone :", airport_info$tzone, "\n")
  cat("Diff NYC :", diff, "hour(s)\n")

}

Write a function flight_time(dep_time, arr_time, origin, dest) to compute the actual flight time from the dep_time at the origin airport to the arr_time in the dest airport. dep_time and arr_time should be in the form of an integer such as 830 (8:30am); origin and dest should be in faa codes.

flight_time <- function(dep_time, arr_time, origin, dest) {
  
  origin_tz <- airports %>% filter(faa == origin) %>% pull(tzone)
  dest_tz   <- airports %>% filter(faa == dest)   %>% pull(tzone)
  
  dep_hour <- dep_time %/% 100
  dep_min  <- dep_time %%  100
  arr_hour <- arr_time %/% 100
  arr_min  <- arr_time %%  100
  
  dep_dt <- make_datetime(2013, 1, 1, dep_hour, dep_min) %>%
              force_tz(origin_tz)
  
  arr_dt <- make_datetime(2013, 1, 1, arr_hour, arr_min) %>%
              force_tz(dest_tz)
  
  du <- as.numeric(as.duration(arr_dt - dep_dt)) / 60
  
  cat("Origin       :", origin, "(", origin_tz, ")\n")
  cat("Destination  :", dest,   "(", dest_tz,   ")\n")
  cat("Departure    :", dep_hour, ":", dep_min,  "\n")
  cat("Arrival      :", arr_hour, ":", arr_min,  "\n")
  cat("Duration     :", du, "mins\n")
}