setwd("D:/Class Materials & Work/Summer 2020 practice/2_Tidymodel_Preprocess your data")
getwd()
## [1] "D:/Class Materials & Work/Summer 2020 practice/2_Tidymodel_Preprocess your data"
#Loading packages
require(tidymodels)
## Loading required package: tidymodels
## -- Attaching packages -------------------------------------------------------------------------------- tidymodels 0.1.0 --
## v broom     0.5.6      v recipes   0.1.12
## v dials     0.0.6      v rsample   0.0.6 
## v dplyr     0.8.5      v tibble    3.0.1 
## v ggplot2   3.3.1      v tune      0.1.0 
## v infer     0.5.1      v workflows 0.1.1 
## v parsnip   0.1.1      v yardstick 0.0.6 
## v purrr     0.3.4
## -- Conflicts ----------------------------------------------------------------------------------- tidymodels_conflicts() --
## x purrr::discard()  masks scales::discard()
## x dplyr::filter()   masks stats::filter()
## x dplyr::lag()      masks stats::lag()
## x ggplot2::margin() masks dials::margin()
## x recipes::step()   masks stats::step()
require(nycflights13)
## Loading required package: nycflights13
## Warning: package 'nycflights13' was built under R version 4.0.2
require(skimr)
## Loading required package: skimr
## Warning: package 'skimr' was built under R version 4.0.2
#Prepare the data set----

set.seed(123)


flight_data <- 
  flights %>% 
  
  mutate(
    # Convert the arrival delay to a factor
    arr_delay = ifelse(arr_delay >= 30, "late", "on_time"),
    arr_delay = factor(arr_delay),
    # We will use the date (not date-time) in the recipe below
    date = as.Date(time_hour)
  ) %>% 
  
  # Include the weather data
  inner_join(weather, by = c("origin", "time_hour")) %>% 
  
  # Only retain the specific columns we will use
  select(dep_time, flight, origin, dest, air_time, distance, 
         carrier, date, arr_delay, time_hour) %>% 
  
  # Exclude missing data
  na.omit() %>% 
  
  # For creating models, it is better to have qualitative columns encoded as factors (instead of character strings)
  mutate_if(is.character, as.factor)

#Calculating the proportion of delayed flights

flight_data %>% 
  count(arr_delay) %>% 
  mutate(prop = n/sum(n))
## # A tibble: 2 x 3
##   arr_delay      n  prop
##   <fct>      <int> <dbl>
## 1 late       52540 0.161
## 2 on_time   273279 0.839
#Check variables of the data set
glimpse(flight_data)
## Rows: 325,819
## Columns: 10
## $ dep_time  <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 55...
## $ flight    <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49,...
## $ origin    <fct> EWR, LGA, JFK, JFK, LGA, EWR, EWR, LGA, JFK, LGA, JFK, JF...
## $ dest      <fct> IAH, IAH, MIA, BQN, ATL, ORD, FLL, IAD, MCO, ORD, PBI, TP...
## $ air_time  <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158...
## $ distance  <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 10...
## $ carrier   <fct> UA, UA, AA, B6, DL, UA, B6, EV, B6, AA, B6, B6, UA, UA, A...
## $ date      <date> 2013-01-01, 2013-01-01, 2013-01-01, 2013-01-01, 2013-01-...
## $ arr_delay <fct> on_time, on_time, late, on_time, on_time, on_time, on_tim...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:...
# or str(flight_data) for more details regarding levels.
#Convert "dest" and "carrier" to dummy variables for simple logistic regression. 

#Check the variables of interest.
flight_data %>% 
  skimr::skim(dest, carrier) 
## -- Data Summary ------------------------
##                            Values
## Name                       Piped data
## Number of rows             325819
## Number of columns          10
## _______________________
## Column type frequency:
##   factor                   2
## ________________________
## Group variables            None
## 
## -- Variable type: factor -----------------------------------------------------
##   skim_variable n_missing complete_rate ordered n_unique top_counts
## 1 dest                  0             1 FALSE        104 ATL: 16771, ORD: 16507, LAX: 15942, BOS: 14948
## 2 carrier               0             1 FALSE         16 UA: 57489, B6: 53715, EV: 50868, DL: 47465
#Data splitting----

#We will split the data set into two: a training set and a testing set.

# Fix the random numbers by setting the seed 
# This enables the analysis to be reproducible when random numbers are used 
set.seed(555)

# Put 3/4 of the data into the training set 
data_split <- initial_split(flight_data, prop = 3/4)

# Create data frames for the two sets:
train_data <- training(data_split) #3/4
test_data  <- testing(data_split)  #1/4
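
#Optional sanity check (not part of the original tutorial): the proportion of
#late flights should be roughly similar in the training and test sets.
train_data %>% count(arr_delay) %>% mutate(prop = n/sum(n))
test_data  %>% count(arr_delay) %>% mutate(prop = n/sum(n))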

#Create recipes and roles----

#Initiating a new recipe:
flights_rec <- 
  recipe(arr_delay ~ ., data = train_data) 

flights_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          9
#A formula. Any variable on the left-hand side of the tilde (~) is considered the model outcome (here, arr_delay).
#On the right-hand side of the tilde are the predictors. 
#Variables may be listed by name, or you can use the dot (.) to indicate all other variables as predictors.
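#For example, the same outcome could be modeled with a few named predictors
#(shown for illustration only; the hypothetical object "rec_named_demo" is not
#used again, and we keep the dot formula above so that all of the remaining
#columns stay available to later steps):
rec_named_demo <- recipe(arr_delay ~ dep_time + distance + carrier, data = train_data)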

#The data. A recipe is associated with the data set used to create the model. 
#This will typically be the training set, so "data = train_data" here. 
#Naming a data set doesn't actually change the data itself; it is only used to catalog the names of the variables and their types, like factors, integers, dates, etc.

#Essentially, the recipe records the roles and types of the variables to be used in the analysis, without tampering with the data set itself. 

#Add role
flights_rec <- 
  recipe(arr_delay ~ ., data = train_data) %>% 
  update_role(flight, time_hour, new_role = "ID") 

flights_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          2
##    outcome          1
##  predictor          7
#We are letting the recipe know that "flight" and "time_hour" are to be kept, but not used as predictors.
#We assign "ID" as the role for these two variables. After the model is fit, we may want to investigate poorly predicted values.
#These ID columns will then be available and can be used to understand what went wrong.

#Get current set of variables and roles
summary(flights_rec)
## # A tibble: 10 x 4
##    variable  type    role      source  
##    <chr>     <chr>   <chr>     <chr>   
##  1 dep_time  numeric predictor original
##  2 flight    numeric ID        original
##  3 origin    nominal predictor original
##  4 dest      nominal predictor original
##  5 air_time  numeric predictor original
##  6 distance  numeric predictor original
##  7 carrier   nominal predictor original
##  8 date      date    predictor original
##  9 time_hour date    ID        original
## 10 arr_delay nominal outcome   original
#Create features----

#Adding feature-engineering steps with %>% (the pipe operator) will allow us to make better use of variables like "date" as predictors.
#In this case, "date" is stored in YYYY-MM-DD format; converted to numeric, it simply equals the number of days after a reference date.

flight_data %>% 
  distinct(date) %>% 
  mutate(numeric_date = as.numeric(date)) 
## # A tibble: 364 x 2
##    date       numeric_date
##    <date>            <dbl>
##  1 2013-01-01        15706
##  2 2013-01-02        15707
##  3 2013-01-03        15708
##  4 2013-01-04        15709
##  5 2013-01-05        15710
##  6 2013-01-06        15711
##  7 2013-01-07        15712
##  8 2013-01-08        15713
##  9 2013-01-09        15714
## 10 2013-01-10        15715
## # ... with 354 more rows
#Perhaps the model would benefit from a linear trend between the log-odds of a late arrival and this numeric date.
#HOWEVER, we can likely improve the model by adding terms derived from the "date" variable that capture its meaning.
#Doing so gives us more meaningful predictors (features) than the single numeric date.
#In this case, we derive 1. the day of the week, 2. the month, and 3. whether the date corresponds to a holiday.
#We do so by adding "steps" to our recipe.

flights_rec <- 
  recipe(arr_delay ~ ., data = train_data) %>% 
  update_role(flight, time_hour, new_role = "ID") %>% 
  step_date(date, features = c("dow", "month")) %>%               
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
  step_rm(date)

#For step_date, we create two new factor columns with the appropriate day of the week and month.
#For step_holiday, we create binary indicator columns for whether the date falls on each listed holiday. The argument value timeDate::listHolidays("US") uses the timeDate package to list the 17 standard US holidays.
#For step_rm, we remove the original date variable, since we no longer want it in the model.
#(A quick way to inspect the columns these steps create is sketched after the recipe printout below.)

flights_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          2
##    outcome          1
##  predictor          7
## 
## Operations:
## 
## Date features from date
## Holiday features from date
## Delete terms date
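#A quick optional check of what these steps produce: prep() estimates the recipe
#on the training set and juice() returns the processed training data, so we can
#glimpse the new date-derived columns (this chunk is illustrative and not
#required for the rest of the analysis).
flights_rec %>% 
  prep(training = train_data) %>% 
  juice() %>% 
  select(starts_with("date_")) %>% 
  glimpse()
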
#Depending on the purpose, factor predictors may need to be stored in different ways.
#A factor is kept as-is in a data frame, but for the model equation it is typically converted into a numeric matrix of dummy variables.

#For example, each level of a nominal variable (other than the reference level) gets its own column that equals "1" when that level is present and "0" otherwise.

#HOWEVER, we need to tell the recipe to create dummy variables explicitly: not all models require numeric predictors, and non-dummy versions may be more useful outside modeling (where recipes can also be used).
#Examples of the latter are making tables or plotting graphs.

#Anyway, here it is, the dummification.

flights_rec <- 
  recipe(arr_delay ~ ., data = train_data) %>% 
  update_role(flight, time_hour, new_role = "ID") %>% 
  step_date(date, features = c("dow", "month")) %>% 
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
  step_rm(date) %>% 
  step_dummy(all_nominal(), -all_outcomes())

flights_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          2
##    outcome          1
##  predictor          7
## 
## Operations:
## 
## Date features from date
## Holiday features from date
## Delete terms date
## Dummy variables from all_nominal, -, all_outcomes()
#Here, we did something different than before: instead of applying a step to an individual variable, we used selectors to apply this recipe step to several variables at once.
#The "all_nominal" selects all variables that are either factors or characters.
#The "-all_outcomes" removes any outcome variables from this recipe step.
#Basically, the recipe creates dummy variables from for all of the factor or character columns unless they are outcomes.

#Using selectors means that you don't always have to apply steps to individual variables one at a time. 
#Since a recipe knows the variable type and role of each column, they can also be selected (or dropped) using this information.
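#For illustration only (this step is NOT part of our flights recipe, and the
#hypothetical object "norm_rec_demo" is not used again): selectors can also
#combine types and roles, e.g. centering and scaling every numeric column
#except the ones we flagged with the "ID" role.
norm_rec_demo <- 
  recipe(arr_delay ~ ., data = train_data) %>% 
  update_role(flight, time_hour, new_role = "ID") %>% 
  step_normalize(all_numeric(), -has_role("ID"))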

#Because carrier and dest have some infrequently occurring values, it is possible that dummy variables will be created for levels that don't exist in the training set.
#In other words, the factor levels in the two data sets might not match completely.
#In our case, there is one destination that only appears in the test data set.

test_data %>% 
  distinct(dest) %>% 
  anti_join(train_data)
## Joining, by = "dest"
## # A tibble: 1 x 1
##   dest 
##   <fct>
## 1 LEX
#When the recipe is applied to a data set that contains no LEX flights (here, the training set), a column is still made for this level, but it will contain all zeros: a "zero-variance predictor" that carries no information.
#While some R functions will not produce an error for such predictors, they usually cause warnings and other issues.
#step_zv() removes columns from the data when the training set data have a single value, so it is added to the recipe after step_dummy().

flights_rec <- 
  recipe(arr_delay ~ ., data = train_data) %>% 
  update_role(flight, time_hour, new_role = "ID") %>% 
  step_date(date, features = c("dow", "month")) %>% 
  step_holiday(date, holidays = timeDate::listHolidays("US")) %>% 
  step_rm(date) %>% 
  step_dummy(all_nominal(), -all_outcomes()) %>% 
  step_zv(all_predictors())

flights_rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         ID          2
##    outcome          1
##  predictor          7
## 
## Operations:
## 
## Date features from date
## Holiday features from date
## Delete terms date
## Dummy variables from all_nominal, -, all_outcomes()
## Zero variance filter on all_predictors
#Now we have created the specification (recipe) for our model in reference to our data set, without tampering with the data itself.
#Next, we will use the recipe in our modeling.

#Fit a model with the recipe----

#Referring to the model-building file with parsnip, we will use logistic regression.

lr_mod <- 
  logistic_reg() %>% 
  set_engine("glm")

#We will:
#1.Process the recipe using the training set to determine which predictors should be dummified or filtered out as zero-variance predictors.
#2.Create the final predictor set for the training set using the recipe.
#3.Apply the same finalized recipe to the test set.
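
#For comparison only: a manual sketch of these three steps. The workflow approach
#below handles all of this automatically, so the objects "prepped_rec",
#"train_baked", and "test_baked" are purely illustrative and are not used again.
prepped_rec <- prep(flights_rec, training = train_data)   # 1. estimate the recipe on the training set
train_baked <- bake(prepped_rec, new_data = train_data)   # 2. the finalized training-set predictors
test_baked  <- bake(prepped_rec, new_data = test_data)    # 3. the same transformations applied to the test set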

#We will use "workflow" package to pair both the model and the recipe together for efficiency.

flights_wflow <- 
  workflow() %>% 
  add_model(lr_mod) %>% 
  add_recipe(flights_rec)

flights_wflow
## == Workflow ==============================================================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------------------------------------------------
## 5 Recipe Steps
## 
## * step_date()
## * step_holiday()
## * step_rm()
## * step_dummy()
## * step_zv()
## 
## -- Model -----------------------------------------------------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm
#Fit the model to the training data.

flights_fit <- 
  flights_wflow %>% 
  fit(data = train_data)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * ...
#You may want to extract the model or recipe objects from the workflow to double check.
#For example, here we pull the fitted model object then use the broom::tidy() function to get a tidy tibble of model coefficients:

flights_fit %>% 
  pull_workflow_fit() %>% 
  broom::tidy()
## # A tibble: 157 x 5
##    term                         estimate std.error statistic  p.value
##    <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
##  1 (Intercept)                   3.91    2.73           1.43 1.51e- 1
##  2 dep_time                     -0.00167 0.0000141   -118.   0.      
##  3 air_time                     -0.0439  0.000561     -78.4  0.      
##  4 distance                      0.00686 0.00150        4.57 4.84e- 6
##  5 date_USChristmasDay           1.12    0.173          6.49 8.45e-11
##  6 date_USColumbusDay            0.474   0.159          2.99 2.81e- 3
##  7 date_USCPulaskisBirthday      0.864   0.139          6.21 5.47e-10
##  8 date_USDecorationMemorialDay  0.279   0.110          2.53 1.15e- 2
##  9 date_USElectionDay            0.696   0.169          4.12 3.82e- 5
## 10 date_USGoodFriday             1.28    0.166          7.71 1.27e-14
## # ... with 147 more rows
#Check each step in the recipe.

flights_fit %>% 
  pull_workflow_prepped_recipe() %>% 
  broom::tidy()
## # A tibble: 5 x 6
##   number operation type    trained skip  id           
##    <int> <chr>     <chr>   <lgl>   <lgl> <chr>        
## 1      1 step      date    TRUE    FALSE date_tnlL5   
## 2      2 step      holiday TRUE    FALSE holiday_VjUyh
## 3      3 step      rm      TRUE    FALSE rm_3U2Gi     
## 4      4 step      dummy   TRUE    FALSE dummy_tOYI7  
## 5      5 step      zv      TRUE    FALSE zv_SjkxS
#Use the trained workflow to predict----

#Our goal was to predict whether a plane arrives more than 30 minutes late. We have just:
#1.Built the model (lr_mod),
#2.Created a preprocessing recipe (flights_rec),
#3.Bundled the model and recipe (flights_wflow), and
#4.Trained our workflow using a single call to fit().

#The next step is to use the trained workflow to predict the unseen test data with predict(), which applies the recipe to the new data and passes it to the fitted model.
predict(flights_fit, test_data)
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * ...
## # A tibble: 81,454 x 1
##    .pred_class
##    <fct>      
##  1 on_time    
##  2 on_time    
##  3 on_time    
##  4 on_time    
##  5 on_time    
##  6 on_time    
##  7 on_time    
##  8 on_time    
##  9 on_time    
## 10 on_time    
## # ... with 81,444 more rows
#Because our outcome variable here is a factor, the output from predict() returns the predicted class: late versus on_time. 
#We can specify type = "prob" to get the predicted class probabilities for each flight.
#We'll also bind the output with some variables from the test data and save them together for additional information.

flights_pred <- 
  predict(flights_fit, test_data, type = "prob") %>% 
  bind_cols(test_data %>% select(arr_delay, time_hour, flight)) 
## New names:
## * `` -> ...1
## * `` -> ...2
## * `` -> ...3
## * `` -> ...4
## * `` -> ...5
## * ...
flights_pred
## # A tibble: 81,454 x 5
##    .pred_late .pred_on_time arr_delay time_hour           flight
##         <dbl>         <dbl> <fct>     <dttm>               <int>
##  1     0.0565         0.944 on_time   2013-01-01 05:00:00   1714
##  2     0.0264         0.974 on_time   2013-01-01 06:00:00     79
##  3     0.0481         0.952 on_time   2013-01-01 06:00:00    301
##  4     0.0325         0.967 on_time   2013-01-01 06:00:00     49
##  5     0.0711         0.929 on_time   2013-01-01 06:00:00   1187
##  6     0.0583         0.942 on_time   2013-01-01 06:00:00   4401
##  7     0.0171         0.983 on_time   2013-01-01 06:00:00   1895
##  8     0.0458         0.954 on_time   2013-01-01 06:00:00    135
##  9     0.0221         0.978 on_time   2013-01-01 06:00:00   4646
## 10     0.0502         0.950 on_time   2013-01-01 06:00:00   4144
## # ... with 81,444 more rows
#To evaluate the performance of our workflow, we can use the area under the ROC curve (AUC).
#We could also manually compare the .pred_on_time probabilities to the actual arr_delay outcomes, but... that's impractical.

#To generate an ROC curve, we need the predicted class probabilities for late and on_time, which we just calculated in the code chunk above.
#We can create the ROC curve with these values, using roc_curve() and then piping to the autoplot() method:

#Create the plot
flights_pred %>% 
  roc_curve(truth = arr_delay, .pred_late) %>% 
  autoplot()

#Generate the area under the ROC curve (AUC)
flights_pred %>% 
  roc_auc(truth = arr_delay, .pred_late)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.765
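
#Optional extra check (not part of the original tutorial): hard-class predictions
#can also be scored with other yardstick metrics, e.g. overall accuracy.
#The object "flights_class_pred" is new and purely illustrative.
flights_class_pred <- 
  predict(flights_fit, test_data) %>% 
  bind_cols(test_data %>% select(arr_delay))

flights_class_pred %>% 
  accuracy(truth = arr_delay, estimate = .pred_class)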