# STEP 1: PERFORM STANDARD SETUP
# Remove every visible object from the current workspace so the script starts
# from a clean slate. Note that this does not detach packages that are
# already attached, nor does it reset options().
rm(list = ls())
# Record the elapsed (wall-clock) time at start-up. proc.time() returns the
# user, system and elapsed times; element 3 is the elapsed time in seconds,
# so subtracting startTime from a later reading gives the script's run time.
startTime <- proc.time()[3]
# Fix the random seed so that every user obtains the same results.
set.seed(123)
# Call all the libraries below:
library(tidyverse); library(gridExtra); library(grid); library(ggplot2); library(lattice);
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1     v purrr   0.3.2
v tibble  2.1.1     v dplyr   0.8.1
v tidyr   0.8.3     v stringr 1.4.0
v readr   1.3.1     v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::combine()  masks randomForest::combine()
x dplyr::filter()   masks stats::filter()
x dplyr::lag()      masks stats::lag()
x ggplot2::margin() masks randomForest::margin()

Attaching package: ‘gridExtra’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:randomForest’:

    combine
library(dplyr); library(sqldf); library(data.table); library(readr); library(modelr); library(naniar);
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
data.table 1.12.2 using 4 threads (see ?getDTthreads).  Latest news: r-datatable.com

Attaching package: ‘data.table’

The following objects are masked from ‘package:dplyr’:

    between, first, last

The following object is masked from ‘package:purrr’:

    transpose
library(knitr); library(markdown); library(rmarkdown); library(survey); library(sandwich);
Loading required package: Matrix

Attaching package: ‘Matrix’

The following object is masked from ‘package:tidyr’:

    expand

Loading required package: survival

Attaching package: ‘survey’

The following object is masked from ‘package:graphics’:

    dotchart
library(plyr); library(lmtest); library(randomForest); library(bigrquery); library(tokenizers);
---------------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
---------------------------------------------------------------------------------------------------

Attaching package: ‘plyr’

The following objects are masked from ‘package:dplyr’:

    arrange, count, desc, failwith, id, mutate, rename, summarise, summarize

The following object is masked from ‘package:purrr’:

    compact

Loading required package: zoo

Attaching package: ‘zoo’

The following objects are masked from ‘package:base’:

    as.Date, as.Date.numeric
library(factoextra); library(jpeg); library(rpart); library(corrplot); library(RColorBrewer);
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
corrplot 0.84 loaded
library(GGally); library(ggfortify); library(factoextra); library(rpart.plot); library(psych);
Registered S3 method overwritten by 'GGally':
  method from   
  +.gg   ggplot2

Attaching package: ‘GGally’

The following object is masked from ‘package:dplyr’:

    nasa


Attaching package: ‘psych’

The following object is masked from ‘package:modelr’:

    heights

The following objects are masked from ‘package:ggplot2’:

    %+%, alpha

The following object is masked from ‘package:randomForest’:

    outlier
library(GPArotation); library(lubridate); library(matrixStats); library(png); library(grid)

Attaching package: ‘lubridate’

The following object is masked from ‘package:plyr’:

    here

The following objects are masked from ‘package:data.table’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year

The following object is masked from ‘package:base’:

    date


Attaching package: ‘matrixStats’

The following object is masked from ‘package:plyr’:

    count

The following object is masked from ‘package:dplyr’:

    count
#########################################################################################################
# STEP 2: UNDERSTAND THE DATA
# Load the data from Ride_Share.csv; the path is relative to the current
# working directory, so the file must sit alongside the script.
Ride_Share <- read.csv("Ride_Share.csv")
class(Ride_Share) # Confirm the class of the loaded object
[1] "data.frame"
c(nrow(Ride_Share), ncol(Ride_Share)) # Number of rows, then number of columns
[1] 98 52
colnames(Ride_Share) # List every column name in the data frame
 [1] "Age"                                                                                                                                                                                                                                              
 [2] "Parent"                                                                                                                                                                                                                                           
 [3] "AnnInc"                                                                                                                                                                                                                                           
 [4] "DriveStat"                                                                                                                                                                                                                                        
 [5] "Platform"                                                                                                                                                                                                                                         
 [6] "PF.Comp"                                                                                                                                                                                                                                          
 [7] "PF.TimeFlex"                                                                                                                                                                                                                                      
 [8] "PF.GeoFlex"                                                                                                                                                                                                                                       
 [9] "PF.Autonomy"                                                                                                                                                                                                                                      
[10] "PF.Brand"                                                                                                                                                                                                                                         
[11] "PF.EasyStart"                                                                                                                                                                                                                                     
[12] "PF.Other"                                                                                                                                                                                                                                         
[13] "EmployerLikeFactor"                                                                                                                                                                                                                               
[14] "MultipleCo"                                                                                                                                                                                                                                       
[15] "WhichApp.Habit"                                                                                                                                                                                                                                   
[16] "WhichApp.Demand"                                                                                                                                                                                                                                  
[17] "WhichApp.Bonus"                                                                                                                                                                                                                                   
[18] "WhichApp.Surge"                                                                                                                                                                                                                                   
[19] "WhichApp.Riders"                                                                                                                                                                                                                                  
[20] "WhichApp.Traffic"                                                                                                                                                                                                                                 
[21] "WhichApp.Other"                                                                                                                                                                                                                                   
[22] "TimetoFRide"                                                                                                                                                                                                                                      
[23] "FullTimer"                                                                                                                                                                                                                                        
[24] "HoursAvail"                                                                                                                                                                                                                                       
[25] "KidsMoreLess"                                                                                                                                                                                                                                     
[26] "ComfortLvl"                                                                                                                                                                                                                                       
[27] "KidsEZR.Car"                                                                                                                                                                                                                                      
[28] "KidsEZR.CPR"                                                                                                                                                                                                                                      
[29] "KidsEZR.Toys"                                                                                                                                                                                                                                     
[30] "KidsEZR.Cln"                                                                                                                                                                                                                                      
[31] "KidsEZR.Liability"                                                                                                                                                                                                                                
[32] "KidsEZR.Rltnshp"                                                                                                                                                                                                                                  
[33] "KidsEZR.Other"                                                                                                                                                                                                                                    
[34] "ZumYN"                                                                                                                                                                                                                                            
[35] "Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not."
[36] "PreventNotConsidered"                                                                                                                                                                                                                             
[37] "PreventTime"                                                                                                                                                                                                                                      
[38] "PreventSafety"                                                                                                                                                                                                                                    
[39] "PreventNoCar"                                                                                                                                                                                                                                     
[40] "PreventOther"                                                                                                                                                                                                                                     
[41] "DropOff"                                                                                                                                                                                                                                          
[42] "DropWComp"                                                                                                                                                                                                                                        
[43] "DriveWhnAvail"                                                                                                                                                                                                                                    
[44] "WhyNotDriveKids"                                                                                                                                                                                                                                  
[45] "WeekdayAvail"                                                                                                                                                                                                                                     
[46] "PaxAgeGapComf"                                                                                                                                                                                                                                    
[47] "Sex"                                                                                                                                                                                                                                              
[48] "AgeCat"                                                                                                                                                                                                                                           
[49] "Race"                                                                                                                                                                                                                                             
[50] "EmployStat"                                                                                                                                                                                                                                       
[51] "SocialMed"                                                                                                                                                                                                                                        
[52] "X"                                                                                                                                                                                                                                                
utils::str(Ride_Share) # Compact overview of each column's type and first values
'data.frame':   98 obs. of  52 variables:
 $ Age                                                                                                                                                                                                                                              : int  33 35 28 29 35 28 26 31 54 30 ...
 $ Parent                                                                                                                                                                                                                                           : int  0 0 0 0 1 1 1 1 1 1 ...
 $ AnnInc                                                                                                                                                                                                                                           : int  2 1 2 2 1 1 3 1 2 3 ...
 $ DriveStat                                                                                                                                                                                                                                        : int  1 1 1 1 1 1 1 1 1 1 ...
 $ Platform                                                                                                                                                                                                                                         : int  1 1 1 2 2 1 1 1 1 1 ...
 $ PF.Comp                                                                                                                                                                                                                                          : int  3 1 1 2 1 1 2 2 2 5 ...
 $ PF.TimeFlex                                                                                                                                                                                                                                      : int  1 3 3 3 4 2 3 1 1 1 ...
 $ PF.GeoFlex                                                                                                                                                                                                                                       : int  2 4 4 1 3 4 4 5 6 2 ...
 $ PF.Autonomy                                                                                                                                                                                                                                      : int  4 5 5 4 5 5 1 3 5 6 ...
 $ PF.Brand                                                                                                                                                                                                                                         : int  5 6 6 5 2 6 5 4 3 3 ...
 $ PF.EasyStart                                                                                                                                                                                                                                     : int  6 2 2 6 6 3 6 6 4 4 ...
 $ PF.Other                                                                                                                                                                                                                                         : int  7 7 7 7 7 7 7 7 7 7 ...
 $ EmployerLikeFactor                                                                                                                                                                                                                               : int  1 2 1 6 1 2 4 6 2 1 ...
 $ MultipleCo                                                                                                                                                                                                                                       : int  1 0 0 0 0 0 0 0 1 1 ...
 $ WhichApp.Habit                                                                                                                                                                                                                                   : int  2 NA NA NA NA NA NA NA 5 5 ...
 $ WhichApp.Demand                                                                                                                                                                                                                                  : int  4 NA NA NA NA NA NA NA 2 1 ...
 $ WhichApp.Bonus                                                                                                                                                                                                                                   : int  1 NA NA NA NA NA NA NA 4 2 ...
 $ WhichApp.Surge                                                                                                                                                                                                                                   : int  5 NA NA NA NA NA NA NA 1 3 ...
 $ WhichApp.Riders                                                                                                                                                                                                                                  : int  3 NA NA NA NA NA NA NA 6 4 ...
 $ WhichApp.Traffic                                                                                                                                                                                                                                 : int  6 NA NA NA NA NA NA NA 3 6 ...
 $ WhichApp.Other                                                                                                                                                                                                                                   : int  7 NA NA NA NA NA NA NA 7 7 ...
 $ TimetoFRide                                                                                                                                                                                                                                      : int  3 2 2 1 1 3 2 2 1 2 ...
 $ FullTimer                                                                                                                                                                                                                                        : int  0 1 1 1 0 1 1 1 1 1 ...
 $ HoursAvail                                                                                                                                                                                                                                       : Factor w/ 11 levels "","1","1,2","1,2,3",..: 7 11 9 3 7 10 9 9 7 4 ...
 $ KidsMoreLess                                                                                                                                                                                                                                     : int  3 2 3 2 1 3 3 2 1 1 ...
 $ ComfortLvl                                                                                                                                                                                                                                       : int  4 10 7 0 4 5 5 1 3 7 ...
 $ KidsEZR.Car                                                                                                                                                                                                                                      : int  1 1 3 1 3 3 1 3 4 5 ...
 $ KidsEZR.CPR                                                                                                                                                                                                                                      : int  2 3 5 2 5 1 3 1 6 1 ...
 $ KidsEZR.Toys                                                                                                                                                                                                                                     : int  3 4 4 3 1 5 4 2 1 2 ...
 $ KidsEZR.Cln                                                                                                                                                                                                                                      : int  4 5 6 4 2 6 5 6 2 3 ...
 $ KidsEZR.Liability                                                                                                                                                                                                                                : int  5 2 1 5 4 2 6 5 3 4 ...
 $ KidsEZR.Rltnshp                                                                                                                                                                                                                                  : int  6 6 2 6 6 4 2 4 5 6 ...
 $ KidsEZR.Other                                                                                                                                                                                                                                    : int  7 7 7 7 7 7 7 7 7 7 ...
 $ ZumYN                                                                                                                                                                                                                                            : int  1 1 1 1 1 1 1 0 1 1 ...
 $ Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not.: Factor w/ 98 levels "Definitely. I like the family related vibe. ",..: 86 25 19 37 45 46 67 56 3 93 ...
 $ PreventNotConsidered                                                                                                                                                                                                                             : int  NA NA NA NA NA NA NA NA NA NA ...
 $ PreventTime                                                                                                                                                                                                                                      : int  NA NA NA NA NA NA NA NA NA NA ...
 $ PreventSafety                                                                                                                                                                                                                                    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ PreventNoCar                                                                                                                                                                                                                                     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ PreventOther                                                                                                                                                                                                                                     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ DropOff                                                                                                                                                                                                                                          : int  NA NA NA NA NA NA NA NA NA NA ...
 $ DropWComp                                                                                                                                                                                                                                        : int  NA NA NA NA NA NA NA NA NA NA ...
 $ DriveWhnAvail                                                                                                                                                                                                                                    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ WhyNotDriveKids                                                                                                                                                                                                                                  : int  NA NA NA NA NA NA NA NA NA NA ...
 $ WeekdayAvail                                                                                                                                                                                                                                     : int  NA NA NA NA NA NA NA NA NA NA ...
 $ PaxAgeGapComf                                                                                                                                                                                                                                    : int  NA NA NA NA NA NA NA NA NA NA ...
 $ Sex                                                                                                                                                                                                                                              : int  2 2 1 1 1 2 1 1 2 2 ...
 $ AgeCat                                                                                                                                                                                                                                           : int  2 3 2 2 3 2 2 2 4 2 ...
 $ Race                                                                                                                                                                                                                                             : int  3 1 1 1 1 2 1 1 3 4 ...
 $ EmployStat                                                                                                                                                                                                                                       : int  3 3 3 3 3 3 3 3 4 3 ...
 $ SocialMed                                                                                                                                                                                                                                        : int  1 1 3 3 1 6 1 1 2 1 ...
 $ X                                                                                                                                                                                                                                                : logi  NA NA NA NA NA NA ...
Ride_Share %>% glimpse() # One line per column: name and type (the dplyr view of the structure)
Observations: 98
Variables: 52
$ Age                                                                                                                                                                                                                                               <int> ...
$ Parent                                                                                                                                                                                                                                            <int> ...
$ AnnInc                                                                                                                                                                                                                                            <int> ...
$ DriveStat                                                                                                                                                                                                                                         <int> ...
$ Platform                                                                                                                                                                                                                                          <int> ...
$ PF.Comp                                                                                                                                                                                                                                           <int> ...
$ PF.TimeFlex                                                                                                                                                                                                                                       <int> ...
$ PF.GeoFlex                                                                                                                                                                                                                                        <int> ...
$ PF.Autonomy                                                                                                                                                                                                                                       <int> ...
$ PF.Brand                                                                                                                                                                                                                                          <int> ...
$ PF.EasyStart                                                                                                                                                                                                                                      <int> ...
$ PF.Other                                                                                                                                                                                                                                          <int> ...
$ EmployerLikeFactor                                                                                                                                                                                                                                <int> ...
$ MultipleCo                                                                                                                                                                                                                                        <int> ...
$ WhichApp.Habit                                                                                                                                                                                                                                    <int> ...
$ WhichApp.Demand                                                                                                                                                                                                                                   <int> ...
$ WhichApp.Bonus                                                                                                                                                                                                                                    <int> ...
$ WhichApp.Surge                                                                                                                                                                                                                                    <int> ...
$ WhichApp.Riders                                                                                                                                                                                                                                   <int> ...
$ WhichApp.Traffic                                                                                                                                                                                                                                  <int> ...
$ WhichApp.Other                                                                                                                                                                                                                                    <int> ...
$ TimetoFRide                                                                                                                                                                                                                                       <int> ...
$ FullTimer                                                                                                                                                                                                                                         <int> ...
$ HoursAvail                                                                                                                                                                                                                                        <fct> ...
$ KidsMoreLess                                                                                                                                                                                                                                      <int> ...
$ ComfortLvl                                                                                                                                                                                                                                        <int> ...
$ KidsEZR.Car                                                                                                                                                                                                                                       <int> ...
$ KidsEZR.CPR                                                                                                                                                                                                                                       <int> ...
$ KidsEZR.Toys                                                                                                                                                                                                                                      <int> ...
$ KidsEZR.Cln                                                                                                                                                                                                                                       <int> ...
$ KidsEZR.Liability                                                                                                                                                                                                                                 <int> ...
$ KidsEZR.Rltnshp                                                                                                                                                                                                                                   <int> ...
$ KidsEZR.Other                                                                                                                                                                                                                                     <int> ...
$ ZumYN                                                                                                                                                                                                                                             <int> ...
$ Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not. <fct> ...
$ PreventNotConsidered                                                                                                                                                                                                                              <int> ...
$ PreventTime                                                                                                                                                                                                                                       <int> ...
$ PreventSafety                                                                                                                                                                                                                                     <int> ...
$ PreventNoCar                                                                                                                                                                                                                                      <int> ...
$ PreventOther                                                                                                                                                                                                                                      <int> ...
$ DropOff                                                                                                                                                                                                                                           <int> ...
$ DropWComp                                                                                                                                                                                                                                         <int> ...
$ DriveWhnAvail                                                                                                                                                                                                                                     <int> ...
$ WhyNotDriveKids                                                                                                                                                                                                                                   <int> ...
$ WeekdayAvail                                                                                                                                                                                                                                      <int> ...
$ PaxAgeGapComf                                                                                                                                                                                                                                     <int> ...
$ Sex                                                                                                                                                                                                                                               <int> ...
$ AgeCat                                                                                                                                                                                                                                            <int> ...
$ Race                                                                                                                                                                                                                                              <int> ...
$ EmployStat                                                                                                                                                                                                                                        <int> ...
$ SocialMed                                                                                                                                                                                                                                         <int> ...
$ X                                                                                                                                                                                                                                                 <lgl> ...
Ride_Share %>% summary() # Per-column summary statistics, including NA counts
      Age            Parent           AnnInc        DriveStat        Platform      PF.Comp    
 Min.   :25.00   Min.   :0.0000   Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.00  
 1st Qu.:30.00   1st Qu.:1.0000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:1.00  
 Median :33.00   Median :1.0000   Median :2.000   Median :2.000   Median :1.0   Median :2.00  
 Mean   :35.31   Mean   :0.9592   Mean   :2.459   Mean   :1.643   Mean   :1.2   Mean   :2.01  
 3rd Qu.:39.00   3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:1.0   3rd Qu.:3.00  
 Max.   :57.00   Max.   :1.0000   Max.   :4.000   Max.   :2.000   Max.   :2.0   Max.   :6.00  
                                                                  NA's   :63                  
  PF.TimeFlex      PF.GeoFlex     PF.Autonomy       PF.Brand     PF.EasyStart      PF.Other    
 Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.000   1st Qu.:3.000   1st Qu.:4.000   1st Qu.:5.00   1st Qu.:3.000   1st Qu.:7.000  
 Median :2.000   Median :4.000   Median :5.000   Median :5.00   Median :4.000   Median :7.000  
 Mean   :2.204   Mean   :3.582   Mean   :4.296   Mean   :5.02   Mean   :4.051   Mean   :6.837  
 3rd Qu.:3.000   3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:6.00   3rd Qu.:6.000   3rd Qu.:7.000  
 Max.   :6.000   Max.   :6.000   Max.   :6.000   Max.   :7.00   Max.   :7.000   Max.   :7.000  
                                                                                               
 EmployerLikeFactor   MultipleCo     WhichApp.Habit WhichApp.Demand WhichApp.Bonus WhichApp.Surge 
 Min.   :1.0        Min.   :0.0000   Min.   :1.00   Min.   :1.000   Min.   :1.00   Min.   :1.000  
 1st Qu.:1.5        1st Qu.:0.0000   1st Qu.:1.75   1st Qu.:2.750   1st Qu.:2.00   1st Qu.:2.500  
 Median :2.0        Median :0.0000   Median :2.50   Median :3.500   Median :2.00   Median :4.500  
 Mean   :2.4        Mean   :0.2286   Mean   :2.75   Mean   :3.125   Mean   :3.00   Mean   :3.625  
 3rd Qu.:2.5        3rd Qu.:0.0000   3rd Qu.:3.50   3rd Qu.:4.000   3rd Qu.:4.25   3rd Qu.:5.000  
 Max.   :6.0        Max.   :1.0000   Max.   :5.00   Max.   :4.000   Max.   :6.00   Max.   :5.000  
 NA's   :63         NA's   :63       NA's   :90     NA's   :90      NA's   :90     NA's   :90     
 WhichApp.Riders WhichApp.Traffic WhichApp.Other  TimetoFRide    FullTimer        HoursAvail
 Min.   :1.00    Min.   :1.00     Min.   :7      Min.   :1.0   Min.   :0.0000          :63  
 1st Qu.:3.00    1st Qu.:2.75     1st Qu.:7      1st Qu.:1.0   1st Qu.:1.0000   3      :13  
 Median :4.00    Median :6.00     Median :7      Median :2.0   Median :1.0000   2      : 6  
 Mean   :4.00    Mean   :4.50     Mean   :7      Mean   :1.8   Mean   :0.8286   4      : 6  
 3rd Qu.:5.25    3rd Qu.:6.00     3rd Qu.:7      3rd Qu.:2.0   3rd Qu.:1.0000   3,4    : 3  
 Max.   :6.00    Max.   :6.00     Max.   :7      Max.   :3.0   Max.   :1.0000   2,3,4  : 2  
 NA's   :90      NA's   :90       NA's   :90     NA's   :63    NA's   :63       (Other): 5  
  KidsMoreLess     ComfortLvl      KidsEZR.Car     KidsEZR.CPR     KidsEZR.Toys    KidsEZR.Cln   
 Min.   :1.000   Min.   : 0.000   Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.000   1st Qu.: 4.000   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:3.000   1st Qu.:4.000  
 Median :2.000   Median : 5.000   Median :3.000   Median :3.000   Median :4.000   Median :5.000  
 Mean   :2.029   Mean   : 5.514   Mean   :3.143   Mean   :3.031   Mean   :3.694   Mean   :4.612  
 3rd Qu.:3.000   3rd Qu.: 7.000   3rd Qu.:4.000   3rd Qu.:4.000   3rd Qu.:5.000   3rd Qu.:6.000  
 Max.   :3.000   Max.   :10.000   Max.   :6.000   Max.   :6.000   Max.   :6.000   Max.   :7.000  
 NA's   :63      NA's   :63                                                                      
 KidsEZR.Liability KidsEZR.Rltnshp KidsEZR.Other       ZumYN       
 Min.   :1.000     Min.   :1.000   Min.   :1.000   Min.   :0.0000  
 1st Qu.:1.000     1st Qu.:2.000   1st Qu.:7.000   1st Qu.:1.0000  
 Median :2.000     Median :4.000   Median :7.000   Median :1.0000  
 Mean   :2.755     Mean   :3.878   Mean   :6.888   Mean   :0.7653  
 3rd Qu.:4.000     3rd Qu.:6.000   3rd Qu.:7.000   3rd Qu.:1.0000  
 Max.   :6.000     Max.   :6.000   Max.   :7.000   Max.   :1.0000  
                                                                   
                                                                                                                                                                  Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not.
 Definitely. I like the family related vibe.                                                                                                                                                                                                                                               : 1                                                                                                                     
 Depends on the pay pretty much and if it is worth my time.                                                                                                                                                                                                                                : 1                                                                                                                     
 good driving\n                                                                                                                                                                                                                                                                            : 1                                                                                                                     
 I'm not sure.  It seems like a great idea, but an awful lot of liability.  I also think it takes a special person to work exclusively with children.  I think it's going to be very hard to ensure the safety of the children and I would be very fearful of being accused of something.  : 1                                                                                                                     
 I am already fine driving for Uber. There is no reason for me to drive for Zum.                                                                                                                                                                                                           : 1                                                                                                                     
 I am not sure. Driving someone elses children is a responsibility and a privilige.  If I knew the parents, I may.  If I know they needed a favor I would do it and not expect to be paid. It also would depend on the child :)                                                            : 1                                                                                                                     
 (Other)                                                                                                                                                                                                                                                                                   :92                                                                                                                     
 PreventNotConsidered  PreventTime     PreventSafety     PreventNoCar      PreventOther    
 Min.   :0.00000      Min.   :0.0000   Min.   :0.0000   Min.   :0.00000   Min.   :0.00000  
 1st Qu.:0.00000      1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.00000  
 Median :0.00000      Median :1.0000   Median :0.0000   Median :0.00000   Median :0.00000  
 Mean   :0.01587      Mean   :0.5714   Mean   :0.2698   Mean   :0.07937   Mean   :0.06349  
 3rd Qu.:0.00000      3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:0.00000  
 Max.   :1.00000      Max.   :1.0000   Max.   :1.0000   Max.   :1.00000   Max.   :1.00000  
 NA's   :35           NA's   :35       NA's   :35       NA's   :35        NA's   :35       
    DropOff         DropWComp     DriveWhnAvail WhyNotDriveKids  WeekdayAvail   PaxAgeGapComf  
 Min.   :0.0000   Min.   :1.000   Min.   :1     Min.   :1.000   Min.   :1.000   Min.   :1.000  
 1st Qu.:1.0000   1st Qu.:1.000   1st Qu.:1     1st Qu.:2.000   1st Qu.:2.000   1st Qu.:1.000  
 Median :1.0000   Median :1.000   Median :1     Median :3.000   Median :2.000   Median :1.000  
 Mean   :0.8254   Mean   :1.615   Mean   :1     Mean   :2.833   Mean   :2.263   Mean   :1.239  
 3rd Qu.:1.0000   3rd Qu.:2.000   3rd Qu.:1     3rd Qu.:4.000   3rd Qu.:3.000   3rd Qu.:1.000  
 Max.   :1.0000   Max.   :4.000   Max.   :1     Max.   :4.000   Max.   :3.000   Max.   :2.000  
 NA's   :35       NA's   :46      NA's   :87    NA's   :92      NA's   :41      NA's   :52     
      Sex            AgeCat           Race         EmployStat     SocialMed        X          
 Min.   :1.000   Min.   :2.000   Min.   :1.000   Min.   :1.00   Min.   :1.000   Mode:logical  
 1st Qu.:1.000   1st Qu.:2.000   1st Qu.:1.000   1st Qu.:3.00   1st Qu.:1.000   NA's:98       
 Median :1.000   Median :2.000   Median :1.000   Median :3.00   Median :2.000                 
 Mean   :1.388   Mean   :2.612   Mean   :1.551   Mean   :3.02   Mean   :2.663                 
 3rd Qu.:2.000   3rd Qu.:3.000   3rd Qu.:1.000   3rd Qu.:3.00   3rd Qu.:3.750                 
 Max.   :2.000   Max.   :5.000   Max.   :6.000   Max.   :4.00   Max.   :7.000                 
                                                                                              
# Inspect both ends of the data set, 15 rows each
Ride_Share %>% head(n = 15)
Ride_Share %>% tail(n = 15)
# Number of distinct values recorded in the AnnInc column
Ride_Share$AnnInc %>% unique() %>% length()
[1] 4
#########################################################################################################

#########################################################################################################
# STEP 3: VISUALIZE THE DATA (PART I)
# View histogram for select numeric variables to get a sense of the distribution for each variable.
# Age: histogram, followed by the exact number of respondents at each age
hist(Ride_Share[["Age"]], xlab = "Age", main = "Distribution of Age")
table(Ride_Share[["Age"]])

25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44 45 46 47 48 49 50 52 53 54 55 57 
 5  3  2  9  3 11  8  5  5  5  5  4  2  4  3  2  3  1  3  1  1  2  3  2  1  1  1  1  1  1 
# Parent: histogram, followed by the count for each recorded value
hist(Ride_Share[["Parent"]], xlab = "Parent or Not?",
     main = "Distribution of Parenthood")
table(Ride_Share[["Parent"]])

 0  1 
 4 94 
# AnnInc: histogram (8 suggested breaks), followed by the count per bracket
hist(Ride_Share[["AnnInc"]], breaks = 8, xlab = "Income Bracket",
     main = "Distribution of Income")
table(Ride_Share[["AnnInc"]])

 1  2  3  4 
19 33 28 18 
# EmployStat: histogram, followed by the count for each recorded value
hist(Ride_Share[["EmployStat"]], xlab = "Employment Status",
     main = "Distribution of Employment Status")
table(Ride_Share[["EmployStat"]])

 1  2  3  4 
 3  4 79 12 
# SocialMed: histogram (10 suggested breaks), followed by the count per value
hist(Ride_Share[["SocialMed"]], breaks = 10, xlab = "Social Media Platform",
     main = "Distribution of Social Media")
table(Ride_Share[["SocialMed"]])

 1  2  3  4  6  7 
45 18 10  2 19  4 
########################################################################################################

#########################################################################################################
# STEP 4: VISUALIZE THE DATA (PART II)
# What factors matter to those who drive for a ride-share company or would consider doing so?
# PF.Comp: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.Comp"]], breaks = 10, xlab = "Ranking of Compensation",
     main = "Distribution of Ranking of Compensation")
table(Ride_Share[["PF.Comp"]])

 1  2  3  4  5  6 
46 25 15  6  4  2 
# PF.TimeFlex: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.TimeFlex"]], breaks = 10,
     xlab = "Ranking of Time Flexibility",
     main = "Distribution of Ranking of Time Flexibility")
table(Ride_Share[["PF.TimeFlex"]])

 1  2  3  4  5  6 
31 38 14  9  5  1 
# PF.GeoFlex: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.GeoFlex"]], breaks = 10,
     xlab = "Ranking of Geographic Flexibility",
     main = "Distribution of Ranking of Geographic Flexibility")
table(Ride_Share[["PF.GeoFlex"]])

 1  2  3  4  5  6 
 6 11 30 31 11  9 
# PF.Autonomy: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.Autonomy"]], breaks = 10, xlab = "Ranking of Autonomy",
     main = "Distribution of Ranking of Autonomy")
table(Ride_Share[["PF.Autonomy"]])

 1  2  3  4  5  6 
 4  4 14 26 37 13 
# PF.Brand: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.Brand"]], breaks = 10, xlab = "Ranking of Brand",
     main = "Distribution of Ranking of Brand")
table(Ride_Share[["PF.Brand"]])

 1  2  3  4  5  6  7 
 3  5  7  7 28 46  2 
# PF.EasyStart: histogram (10 suggested breaks), followed by the count per rank
hist(Ride_Share[["PF.EasyStart"]], breaks = 10,
     xlab = "Ranking of Ease of Getting Started",
     main = "Distribution of Ranking of Ease of Getting Started")
table(Ride_Share[["PF.EasyStart"]])

 1  2  3  4  5  6  7 
 7 15 16 19 12 27  2 
#########################################################################################################

#########################################################################################################
# STEP 5: VISUALIZE THE DATA (PART III)
# View scatter plot of select pairs of variables.
# For each predictor (Age, Parent, AnnInc, Sex): scatter ZumYN against it and
# overlay the least-squares line in red.
# The formulas use bare column names so that `data = Ride_Share` is what
# supplies the variables. A formula written with `Ride_Share$...` terms
# bypasses `data` entirely and yields a fit that cannot be reused with
# predict(newdata = ...). The fitted lines themselves are unchanged.
plot(Ride_Share$Age, Ride_Share$ZumYN,
     main = "Regression for Age on Whether the Person Would Drive for Zum",
     xlab = "Age", ylab = "Would You Drive for Zum?")
abline(lm(ZumYN ~ Age, data = Ride_Share), col = "red")

plot(Ride_Share$Parent, Ride_Share$ZumYN,
     main = "Regression for Parenthood on Whether the Person Would Drive for Zum",
     xlab = "Is the Person a Parent?", ylab = "Would You Drive for Zum?")
abline(lm(ZumYN ~ Parent, data = Ride_Share), col = "red")

plot(Ride_Share$AnnInc, Ride_Share$ZumYN,
     main = "Regression for Annual Income on Whether the Person Would Drive for Zum",
     xlab = "Annual Income", ylab = "Would You Drive for Zum?")
abline(lm(ZumYN ~ AnnInc, data = Ride_Share), col = "red")

plot(Ride_Share$Sex, Ride_Share$ZumYN,
     main = "Regression for Sex on Whether the Person Would Drive for Zum",
     xlab = "Sex", ylab = "Would You Drive for Zum?")
abline(lm(ZumYN ~ Sex, data = Ride_Share), col = "red")

#########################################################################################################

#########################################################################################################
# STEP 7: DETERMINE CORRELATIONS AMONG INDEPENDENT VARIABLES
# Visualize correlations among the independent variables.  
# Correlation plot of Ride_Share5 with 4 colour breaks and the "RdGy" palette
ggcorr(Ride_Share5, palette = "RdGy", nbreaks = 4)
# Alternative method for visualizing correlations: compute the correlation
# matrix explicitly, then hand it to corrplot
allcorrelations1 <- cor(Ride_Share5)
corrplot(allcorrelations1)

# For the multivariate regression analysis, we should omit variables that have high correlation with
# other variables. In other words, for each strongly correlated (+/-) pair of variables, keep only one
# of the two variables. Doing so will help to overcome the problem of multi-collinearity.
# In this case, the explanatory variables Age and AgeCat are strongly correlated with each other (not
# surprisingly). So, let's create a data frame without one of those variables. Specifically, we can get
# rid of AgeCat.
# Drop columns by NAME rather than by position, so the intended columns are
# removed even if the column order of Ride_Share4 changes upstream. The
# stopifnot() guards turn a missing/renamed column into an error instead of a
# silent no-op.
# NOTE(review): assumes the column previously removed by position (20) is
# AgeCat, and that the two "Other" columns (positions 11 and 18) are PF.Other
# and KidsEZR.Other, as implied by the comments here and by the predictors
# listed in the model2 summary -- confirm against the step that builds
# Ride_Share4.
stopifnot("AgeCat" %in% names(Ride_Share4))
# Note: this data frame includes the y variable.
Ride_Share6 <- Ride_Share4[, setdiff(names(Ride_Share4), "AgeCat")]
# Also, let's get rid of the two "Other" columns, since they are virtually useless, given the lack of
# variation of responses for them.
stopifnot(all(c("PF.Other", "KidsEZR.Other") %in% names(Ride_Share6)))
Ride_Share7 <- Ride_Share6[, setdiff(names(Ride_Share6),
                                     c("PF.Other", "KidsEZR.Other"))]
#########################################################################################################
#########################################################################################################
# STEP 9: BUILD PREDICTION MODELS
#-------------------------------------------------------------------------------------------------------#
# Univariate Regression
# Fit ZumYN on Age alone using the training rows, then print the fit summary
model1 <- lm(ZumYN ~ Age, data = training)
summary(model1)

Call:
lm(formula = ZumYN ~ Age, data = training)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.8231 -0.2654  0.2433  0.2678  0.2888 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)  
(Intercept) 0.623840   0.267117   2.335   0.0231 *
Age         0.003495   0.007478   0.467   0.6420  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4422 on 57 degrees of freedom
Multiple R-squared:  0.003818,  Adjusted R-squared:  -0.01366 
F-statistic: 0.2185 on 1 and 57 DF,  p-value: 0.642
# According to this model, for every one unit increase in Age, the dependent variable increases by
# ~0.0035 units (the Age coefficient above). However, the result is not statistically significant
# (p = 0.642).
# Visualize the regression: training points in red with model1's fitted line
plot(ZumYN ~ Age, data = training, col = "red")
abline(model1)

# Score model1 on the validation rows; errors are actual minus predicted
prediction1 <- predict(model1, newdata = validation)
validationErrors1 <- validation$ZumYN - prediction1
# Root-mean-square error over the validation rows
validationRMSE1 <- sqrt(mean(validationErrors1^2))
cat("Validation RMSE = ", validationRMSE1, "\n")
Validation RMSE =  0.2963254 
# The goal is to minimize RMSE without overfitting. The RMSE is in the same units as the y variable.
#--------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------#
# Multivariate Regression
# Fit ZumYN on every other column of `training` (the `.` on the right-hand
# side), then print the fit summary. One coefficient (Parent) is reported as
# NA "because of singularities" in the summary printed below.
model2 <- lm(ZumYN ~ ., data = training)
summary(model2)

Call:
lm(formula = ZumYN ~ ., data = training)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.74668 -0.26779  0.09332  0.27193  0.62306 

Coefficients: (1 not defined because of singularities)
                   Estimate Std. Error t value Pr(>|t|)  
(Intercept)        1.001671   3.025857   0.331   0.7424  
Age               -0.003869   0.009741  -0.397   0.6934  
Parent                   NA         NA      NA       NA  
AnnInc            -0.020526   0.061269  -0.335   0.7394  
DriveStat          0.194189   0.149799   1.296   0.2025  
PF.Comp           -0.082994   0.101253  -0.820   0.4174  
PF.TimeFlex       -0.221208   0.110768  -1.997   0.0528 .
PF.GeoFlex        -0.180741   0.112367  -1.608   0.1158  
PF.Autonomy       -0.154005   0.112415  -1.370   0.1785  
PF.Brand          -0.144544   0.098865  -1.462   0.1517  
PF.EasyStart      -0.186519   0.088724  -2.102   0.0420 *
KidsEZR.Car        0.165629   0.155280   1.067   0.2927  
KidsEZR.CPR        0.214801   0.149624   1.436   0.1591  
KidsEZR.Toys       0.160223   0.138975   1.153   0.2560  
KidsEZR.Cln        0.106503   0.150285   0.709   0.4827  
KidsEZR.Liability  0.190436   0.150804   1.263   0.2142  
KidsEZR.Rltnshp    0.143406   0.141956   1.010   0.3186  
Sex                0.146017   0.144750   1.009   0.3193  
Race              -0.010620   0.065297  -0.163   0.8716  
EmployStat        -0.137979   0.129572  -1.065   0.2935  
SocialMed         -0.002775   0.033308  -0.083   0.9340  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4551 on 39 degrees of freedom
Multiple R-squared:  0.2779,    Adjusted R-squared:  -0.07384 
F-statistic: 0.7901 on 19 and 39 DF,  p-value: 0.704
# Score the validation set with the full model. The Parent coefficient is NA (singular fit), which is
# why predict() reports that the prediction comes from a rank-deficient fit.
prediction2 = predict(model2, newdata = validation)
prediction from a rank-deficient fit may be misleading
# Validation RMSE of the full multivariate model.
validationErrors2 <- validation$ZumYN - prediction2
validationRMSE2 <- sqrt(mean(validationErrors2^2))
cat("Validation RMSE = ", validationRMSE2, "\n")
Validation RMSE =  0.3884708 
#--------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------#
# Stepwise regression: backward elimination starting from the full model (model2), removing one
# term per step for as long as the removal lowers the AIC.
# step() has no `data` argument (it was silently absorbed by `...`); the refits re-use the data
# recorded in model2's call, so the argument is dropped here.
model3 <- step(model2, direction = "backward")
Start:  AIC=-77.32
ZumYN ~ Age + Parent + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + 
    PF.GeoFlex + PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + 
    KidsEZR.CPR + KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + 
    KidsEZR.Rltnshp + Sex + Race + EmployStat + SocialMed


Step:  AIC=-77.32
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + 
    PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + 
    KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + 
    Sex + Race + EmployStat + SocialMed

                    Df Sum of Sq    RSS     AIC
- SocialMed          1   0.00144 8.0788 -79.309
- Race               1   0.00548 8.0828 -79.280
- AnnInc             1   0.02325 8.1006 -79.150
- Age                1   0.03267 8.1100 -79.082
- KidsEZR.Cln        1   0.10402 8.1814 -78.565
- PF.Comp            1   0.13915 8.2165 -78.312
- Sex                1   0.21075 8.2881 -77.800
- KidsEZR.Rltnshp    1   0.21136 8.2887 -77.796
- EmployStat         1   0.23486 8.3122 -77.629
- KidsEZR.Car        1   0.23564 8.3130 -77.623
- KidsEZR.Toys       1   0.27528 8.3526 -77.343
<none>                           8.0774 -77.320
- KidsEZR.Liability  1   0.33028 8.4076 -76.955
- DriveStat          1   0.34805 8.4254 -76.831
- PF.Autonomy        1   0.38871 8.4661 -76.547
- KidsEZR.CPR        1   0.42685 8.5042 -76.282
- PF.Brand           1   0.44272 8.5201 -76.172
- PF.GeoFlex         1   0.53585 8.6132 -75.530
- PF.TimeFlex        1   0.82600 8.9034 -73.575
- PF.EasyStart       1   0.91531 8.9927 -72.986

Step:  AIC=-79.31
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + 
    PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + 
    KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + 
    Sex + Race + EmployStat

                    Df Sum of Sq    RSS     AIC
- Race               1   0.00469 8.0835 -81.275
- AnnInc             1   0.02210 8.1009 -81.148
- Age                1   0.03184 8.1106 -81.077
- KidsEZR.Cln        1   0.11418 8.1930 -80.481
- PF.Comp            1   0.13775 8.2166 -80.312
- Sex                1   0.21664 8.2954 -79.748
- KidsEZR.Rltnshp    1   0.22740 8.3062 -79.672
- EmployStat         1   0.25021 8.3290 -79.510
- KidsEZR.Car        1   0.26100 8.3398 -79.433
<none>                           8.0788 -79.309
- KidsEZR.Toys       1   0.28757 8.3664 -79.246
- DriveStat          1   0.34694 8.4257 -78.828
- KidsEZR.Liability  1   0.36219 8.4410 -78.722
- PF.Autonomy        1   0.38745 8.4663 -78.546
- PF.Brand           1   0.44204 8.5208 -78.166
- KidsEZR.CPR        1   0.44868 8.5275 -78.120
- PF.GeoFlex         1   0.54367 8.6225 -77.467
- PF.TimeFlex        1   0.85069 8.9295 -75.402
- PF.EasyStart       1   0.92189 9.0007 -74.934

Step:  AIC=-81.28
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + 
    PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + 
    KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + 
    Sex + EmployStat

                    Df Sum of Sq    RSS     AIC
- AnnInc             1   0.02108 8.1046 -83.121
- Age                1   0.03180 8.1153 -83.043
- KidsEZR.Cln        1   0.12581 8.2093 -82.364
- PF.Comp            1   0.13835 8.2218 -82.274
- Sex                1   0.21233 8.2958 -81.745
- KidsEZR.Rltnshp    1   0.23538 8.3189 -81.582
- EmployStat         1   0.24592 8.3294 -81.507
<none>                           8.0835 -81.275
- KidsEZR.Car        1   0.28408 8.3676 -81.237
- KidsEZR.Toys       1   0.29524 8.3787 -81.159
- DriveStat          1   0.34358 8.4271 -80.819
- PF.Autonomy        1   0.38789 8.4714 -80.510
- KidsEZR.Liability  1   0.38812 8.4716 -80.508
- PF.Brand           1   0.43747 8.5210 -80.165
- KidsEZR.CPR        1   0.45016 8.5337 -80.078
- PF.GeoFlex         1   0.54668 8.6302 -79.414
- PF.TimeFlex        1   0.85041 8.9339 -77.373
- PF.EasyStart       1   0.91895 9.0024 -76.922

Step:  AIC=-83.12
ZumYN ~ Age + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + 
    PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + 
    KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + 
    Sex + EmployStat

                    Df Sum of Sq    RSS     AIC
- Age                1   0.02964 8.1342 -84.906
- KidsEZR.Cln        1   0.12014 8.2247 -84.253
- PF.Comp            1   0.14790 8.2525 -84.054
- Sex                1   0.21950 8.3241 -83.545
- KidsEZR.Rltnshp    1   0.23034 8.3349 -83.468
- EmployStat         1   0.23483 8.3394 -83.436
- KidsEZR.Car        1   0.27924 8.3838 -83.123
<none>                           8.1046 -83.121
- KidsEZR.Toys       1   0.28249 8.3871 -83.100
- DriveStat          1   0.32629 8.4309 -82.793
- KidsEZR.Liability  1   0.37905 8.4836 -82.425
- PF.Autonomy        1   0.38841 8.4930 -82.360
- PF.Brand           1   0.43423 8.5388 -82.042
- KidsEZR.CPR        1   0.43915 8.5437 -82.008
- PF.GeoFlex         1   0.53771 8.6423 -81.331
- PF.TimeFlex        1   0.85619 8.9608 -79.196
- PF.EasyStart       1   0.95506 9.0596 -78.549

Step:  AIC=-84.91
ZumYN ~ DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + Sex + 
    EmployStat

                    Df Sum of Sq    RSS     AIC
- KidsEZR.Cln        1   0.09785 8.2321 -86.201
- PF.Comp            1   0.14043 8.2746 -85.896
- KidsEZR.Rltnshp    1   0.20185 8.3361 -85.460
- Sex                1   0.20601 8.3402 -85.430
- EmployStat         1   0.21568 8.3499 -85.362
- KidsEZR.Car        1   0.25242 8.3866 -85.103
- KidsEZR.Toys       1   0.26180 8.3960 -85.037
<none>                           8.1342 -84.906
- DriveStat          1   0.29969 8.4339 -84.771
- KidsEZR.Liability  1   0.35564 8.4898 -84.381
- PF.Autonomy        1   0.36086 8.4951 -84.345
- KidsEZR.CPR        1   0.41146 8.5457 -83.995
- PF.Brand           1   0.42364 8.5579 -83.911
- PF.GeoFlex         1   0.54487 8.6791 -83.081
- PF.TimeFlex        1   0.82710 8.9613 -81.193
- PF.EasyStart       1   0.93263 9.0668 -80.502

Step:  AIC=-86.2
ZumYN ~ DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Liability + KidsEZR.Rltnshp + Sex + EmployStat

                    Df Sum of Sq    RSS     AIC
- PF.Comp            1   0.10638 8.3384 -87.443
- EmployStat         1   0.16557 8.3976 -87.026
- KidsEZR.Rltnshp    1   0.16795 8.4000 -87.009
- Sex                1   0.23329 8.4654 -86.552
- KidsEZR.Toys       1   0.23358 8.4656 -86.550
- KidsEZR.Car        1   0.23982 8.4719 -86.506
- PF.Autonomy        1   0.28133 8.5134 -86.218
<none>                           8.2321 -86.201
- DriveStat          1   0.29949 8.5315 -86.092
- PF.Brand           1   0.33405 8.5661 -85.854
- PF.GeoFlex         1   0.46547 8.6975 -84.955
- KidsEZR.Liability  1   0.59866 8.8307 -84.059
- KidsEZR.CPR        1   0.65628 8.8883 -83.675
- PF.TimeFlex        1   0.73320 8.9653 -83.167
- PF.EasyStart       1   0.83876 9.0708 -82.476

Step:  AIC=-87.44
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Liability + KidsEZR.Rltnshp + Sex + EmployStat

                    Df Sum of Sq    RSS     AIC
- EmployStat         1   0.16409 8.5025 -88.293
- PF.Autonomy        1   0.17653 8.5150 -88.207
- KidsEZR.Rltnshp    1   0.19038 8.5288 -88.111
- Sex                1   0.19551 8.5340 -88.076
- KidsEZR.Car        1   0.20976 8.5482 -87.977
- PF.Brand           1   0.22790 8.5663 -87.852
- KidsEZR.Toys       1   0.25723 8.5957 -87.650
<none>                           8.3384 -87.443
- PF.GeoFlex         1   0.37099 8.7094 -86.875
- DriveStat          1   0.49261 8.8311 -86.057
- KidsEZR.CPR        1   0.56608 8.9045 -85.568
- KidsEZR.Liability  1   0.60034 8.9388 -85.341
- PF.TimeFlex        1   0.74728 9.0857 -84.379
- PF.EasyStart       1   0.93489 9.2733 -83.173

Step:  AIC=-88.29
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Liability + KidsEZR.Rltnshp + Sex

                    Df Sum of Sq    RSS     AIC
- Sex                1   0.15393 8.6565 -89.235
- KidsEZR.Rltnshp    1   0.19986 8.7024 -88.922
- PF.Autonomy        1   0.23621 8.7387 -88.677
- PF.Brand           1   0.24207 8.7446 -88.637
- KidsEZR.Car        1   0.24866 8.7512 -88.593
- KidsEZR.Toys       1   0.25339 8.7559 -88.561
<none>                           8.5025 -88.293
- PF.GeoFlex         1   0.43285 8.9354 -87.364
- DriveStat          1   0.44443 8.9470 -87.287
- KidsEZR.CPR        1   0.50460 9.0071 -86.892
- KidsEZR.Liability  1   0.51785 9.0204 -86.805
- PF.TimeFlex        1   0.84184 9.3444 -84.723
- PF.EasyStart       1   1.01543 9.5180 -83.637

Step:  AIC=-89.23
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Liability + KidsEZR.Rltnshp

                    Df Sum of Sq    RSS     AIC
- KidsEZR.Rltnshp    1   0.13927 8.7957 -90.293
- PF.Brand           1   0.15463 8.8111 -90.190
- PF.Autonomy        1   0.16935 8.8258 -90.092
- KidsEZR.Car        1   0.19779 8.8543 -89.902
- KidsEZR.Toys       1   0.20498 8.8615 -89.854
<none>                           8.6565 -89.235
- PF.GeoFlex         1   0.33529 8.9918 -88.992
- KidsEZR.CPR        1   0.44433 9.1008 -88.281
- DriveStat          1   0.46121 9.1177 -88.172
- KidsEZR.Liability  1   0.49073 9.1472 -87.981
- PF.TimeFlex        1   0.74345 9.3999 -86.373
- PF.EasyStart       1   0.92099 9.5775 -85.269

Step:  AIC=-90.29
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys + 
    KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- KidsEZR.Car        1   0.07340 8.8691 -91.803
- KidsEZR.Toys       1   0.07368 8.8694 -91.801
- PF.Autonomy        1   0.18993 8.9857 -91.032
- PF.Brand           1   0.23174 9.0275 -90.759
<none>                           8.7957 -90.293
- KidsEZR.CPR        1   0.30506 9.1008 -90.281
- KidsEZR.Liability  1   0.35313 9.1489 -89.971
- PF.GeoFlex         1   0.38628 9.1820 -89.757
- DriveStat          1   0.47252 9.2683 -89.206
- PF.TimeFlex        1   0.84660 9.6423 -86.871
- PF.EasyStart       1   0.93634 9.7321 -86.324

Step:  AIC=-91.8
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.CPR + KidsEZR.Toys + KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- KidsEZR.Toys       1   0.03464 8.9038 -93.573
- PF.Autonomy        1   0.14176 9.0109 -92.867
- KidsEZR.CPR        1   0.23388 9.1030 -92.267
- PF.Brand           1   0.26128 9.1304 -92.090
- KidsEZR.Liability  1   0.27990 9.1490 -91.969
<none>                           8.8691 -91.803
- PF.GeoFlex         1   0.32857 9.1977 -91.656
- DriveStat          1   0.47506 9.3442 -90.724
- PF.TimeFlex        1   0.79483 9.6640 -88.739
- PF.EasyStart       1   0.95018 9.8193 -87.798

Step:  AIC=-93.57
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy + 
    PF.Brand + PF.EasyStart + KidsEZR.CPR + KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- PF.Autonomy        1   0.14039 9.0442 -94.650
- KidsEZR.CPR        1   0.22987 9.1336 -94.069
- PF.Brand           1   0.23578 9.1396 -94.031
- KidsEZR.Liability  1   0.24674 9.1505 -93.960
<none>                           8.9038 -93.573
- PF.GeoFlex         1   0.31512 9.2189 -93.521
- DriveStat          1   0.52327 9.4270 -92.203
- PF.TimeFlex        1   0.78100 9.6848 -90.612
- PF.EasyStart       1   0.91662 9.8204 -89.791

Step:  AIC=-94.65
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Brand + PF.EasyStart + 
    KidsEZR.CPR + KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- PF.Brand           1   0.11188 9.1561 -95.924
- KidsEZR.CPR        1   0.17583 9.2200 -95.514
- PF.GeoFlex         1   0.17957 9.2237 -95.490
<none>                           9.0442 -94.650
- KidsEZR.Liability  1   0.35555 9.3997 -94.375
- DriveStat          1   0.49303 9.5372 -93.518
- PF.TimeFlex        1   0.72742 9.7716 -92.085
- PF.EasyStart       1   0.82052 9.8647 -91.526

Step:  AIC=-95.92
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.EasyStart + 
    KidsEZR.CPR + KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- PF.GeoFlex         1   0.08722 9.2433 -97.365
- KidsEZR.CPR        1   0.26166 9.4177 -96.262
<none>                           9.1561 -95.924
- KidsEZR.Liability  1   0.38497 9.5410 -95.494
- DriveStat          1   0.38738 9.5434 -95.479
- PF.TimeFlex        1   0.61640 9.7725 -94.080
- PF.EasyStart       1   0.74896 9.9050 -93.285

Step:  AIC=-97.36
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart + KidsEZR.CPR + 
    KidsEZR.Liability

                    Df Sum of Sq    RSS     AIC
- KidsEZR.CPR        1   0.24500 9.4883 -97.821
- KidsEZR.Liability  1   0.31546 9.5587 -97.385
<none>                           9.2433 -97.365
- DriveStat          1   0.37496 9.6182 -97.019
- PF.TimeFlex        1   0.55108 9.7944 -95.948
- PF.EasyStart       1   0.67357 9.9168 -95.215

Step:  AIC=-97.82
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart + KidsEZR.Liability

                    Df Sum of Sq     RSS     AIC
- KidsEZR.Liability  1   0.25894  9.7472 -98.233
<none>                            9.4883 -97.821
- DriveStat          1   0.36943  9.8577 -97.568
- PF.TimeFlex        1   0.57150 10.0598 -96.371
- PF.EasyStart       1   0.65098 10.1393 -95.906

Step:  AIC=-98.23
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart

               Df Sum of Sq     RSS     AIC
- DriveStat     1   0.28910 10.0363 -98.508
<none>                       9.7472 -98.233
- PF.TimeFlex   1   0.56957 10.3168 -96.882
- PF.EasyStart  1   0.57015 10.3174 -96.879

Step:  AIC=-98.51
ZumYN ~ PF.TimeFlex + PF.EasyStart

               Df Sum of Sq    RSS     AIC
<none>                      10.036 -98.508
- PF.TimeFlex   1   0.51885 10.555 -97.534
- PF.EasyStart  1   0.67155 10.708 -96.687
summary(model3)

Call:
lm(formula = ZumYN ~ PF.TimeFlex + PF.EasyStart, data = training)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.8887 -0.2015  0.1587  0.2684  0.5007 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)   1.18338    0.18177   6.510 2.22e-08 ***
PF.TimeFlex  -0.07742    0.04550  -1.701   0.0944 .  
PF.EasyStart -0.06240    0.03224  -1.936   0.0580 .  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4233 on 56 degrees of freedom
Multiple R-squared:  0.1028,    Adjusted R-squared:  0.07077 
F-statistic: 3.209 on 2 and 56 DF,  p-value: 0.04794
# Validation RMSE of the model selected by backward elimination.
prediction3 <- predict(model3, newdata = validation)
validationErrors3 <- validation$ZumYN - prediction3
validationRMSE3 <- sqrt(mean(validationErrors3^2))
cat("Validation RMSE = ", validationRMSE3, "\n")
Validation RMSE =  0.3006132 
#--------------------------------------------------------------------------------------------------------#

#-------------------------------------------------------------------------------------------------------#
# Tree: fit on all predictors, draw it, then score the validation set.
model4 <- rpart(ZumYN ~ ., training)
prp(
  model4,
  extra = 1, fallen.leaves = TRUE, type = 1, box.col = rainbow(80),
  varlen = 0, digits = 2, faclen = 0
)
prediction4 <- predict(model4, newdata = validation)
validationErrors4 <- validation$ZumYN - prediction4
validationRMSE4 <- sqrt(mean(validationErrors4^2))
cat("Validation RMSE = ", validationRMSE4, "\n")
Validation RMSE =  0.367987 
#-------------------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------------------#
# Gradient Boosting
# library() stops immediately if gbm is not installed; require() would only return FALSE and let
# the script fail later at the gbm() call.
library(gbm)
# Boosted trees with bernoulli loss and 5-fold cross-validation.
model5 <- gbm(ZumYN ~ .,
              data = training,
              cv.folds = 5,
              distribution = "bernoulli",
              n.trees = 5,
              shrinkage = 0.1,
              interaction.depth = 3)
variable 2: Parent has no variation.
model5
gbm(formula = ZumYN ~ ., distribution = "bernoulli", data = training, 
    n.trees = 5, interaction.depth = 3, shrinkage = 0.1, cv.folds = 5)
A gradient boosted model with bernoulli loss function.
5 iterations were performed.
The best cross-validation iteration was 5.
There were 20 predictors of which 4 had non-zero influence.
summary(model5)

# For a bernoulli gbm, predict() defaults to the link (log-odds) scale. Request type = "response"
# so the predictions are probabilities, i.e. on the same scale as the 0/1 outcome and as the
# predictions behind the other models' RMSEs.
prediction5 <- predict(model5, validation, n.trees = 5, type = "response")
validationErrors5 <- validation$ZumYN - prediction5
validationRMSE5 <- sqrt(mean(validationErrors5^2))
cat("Validation RMSE = ", validationRMSE5, "\n")
Validation RMSE =  0.4593657 
#-------------------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------------------#
# Random forest fitted on all predictors of the training data.
model6 <- randomForest(ZumYN ~ ., training)
The response has five or fewer unique values.  Are you sure you want to do regression?
# Note: summary() on this object only lists the length, class and mode of each stored component.
summary(model6)
                Length Class  Mode     
call              3    -none- call     
type              1    -none- character
predicted        59    -none- numeric  
mse             500    -none- numeric  
rsq             500    -none- numeric  
oob.times        59    -none- numeric  
importance       20    -none- numeric  
importanceSD      0    -none- NULL     
localImportance   0    -none- NULL     
proximity         0    -none- NULL     
ntree             1    -none- numeric  
mtry              1    -none- numeric  
forest           11    -none- list     
coefs             0    -none- NULL     
y                59    -none- numeric  
test              0    -none- NULL     
inbag             0    -none- NULL     
terms             3    terms  call     
# Validation RMSE of the random forest.
prediction6 <- predict(model6, newdata = validation)
validationErrors6 <- validation$ZumYN - prediction6
validationRMSE6 <- sqrt(mean(validationErrors6^2))
cat("Validation RMSE = ", validationRMSE6, "\n")
Validation RMSE =  0.2796911 
# Random Forest, by principle, randomizes the variable selection during each tree split, so it's not as
# prone to overfitting as some of the other models.
#########################################################################################################
# STEP 10: TEST ON THE TEST DATASET
# Check out the different RMSEs.
validationRMSE1
[1] 0.2963254
validationRMSE2
[1] 0.3884708
validationRMSE3
[1] 0.3006132
validationRMSE4
[1] 0.367987
validationRMSE5
[1] 0.4593657
validationRMSE6
[1] 0.2796911
# We choose model 6 because it has consistently proven to be the better predictive model, given that it
# typically generates a lower RMSE than the other models while maintaining a low risk for overfitting.
# Score the held-out test set with the chosen model.
predictiontest <- predict(model6, newdata = test)
test <- test %>% add_predictions(model6) # Append the model predictions to the test set.
# Rename the appended `pred` column. The unqualified rename(test, c(pred = "Predicted_Outcome"))
# only works when a plyr/reshape-style rename() masks dplyr's, so the result depended on package
# load order. Call dplyr explicitly (new_name = old_name) to make the rename unambiguous.
test <- dplyr::rename(test, Predicted_Outcome = pred)
test[, 21:22] # View only the last two columns
# Test RMSE: root mean squared difference between the observed outcome and the prediction.
testErrors <- test$ZumYN - predictiontest
testRMSE <- sqrt(mean(testErrors^2))
cat("Test RMSE = ", testRMSE, "\n")
Test RMSE =  0.5347391 
#########################################################################################################

#########################################################################################################
# STEP 11: CONDUCT DIMENSIONALITY REDUCTION WITH PCA
# Run an unrotated PCA on the explanatory variables (the first 20 columns of Ride_Share8).
Ride_Share9 <- Ride_Share8[, 1:20]
PCA1 <- principal(Ride_Share9, rotate = "none") # Use as much data as possible to come up with the PCs
# How many components (factors) should we focus on? Consider the eigenvalues to answer this question.
# Keep the components whose eigenvalues are greater than one. Beyond that point, the cost of
# additional complexity outweighs the value of additional insight from adding the next dimension.
# Plot the eigenvalues to make this decision easy. The second command adds a dashed horizontal
# reference line at y = 1. If you don't see big eigenvalues in your plot, that tells you that the
# PCA is not really useful.
plot(
  PCA1$values,
  main = "Eigenvalues", ylab = "Eigenvalues", xlab = "Number of Principal Components",
  col = "blue", type = "b", pch = 19
)
abline(h = 1, lty = "dashed")
# Eigenvalue of specific PCAx / Total eigenvalues = Percent of variation in the data explained by PCAx
# As is apparent, the number of principal components is equivalent to the number of explanatory
# variables in the data frame.
# Nine eigenvalues exceed 1, so rerun the PCA keeping nine components.
PCA2 <- principal(Ride_Share9, nfactors = 9, rotate = "none")
PCA2$values # Display only the eigenvalues
 [1] 2.73388824 1.96484372 1.66366342 1.48147389 1.43070799 1.31491913 1.25265100 1.09749895
 [9] 1.04596955 0.91676106 0.87070943 0.81229907 0.71068267 0.66952589 0.58773444 0.48813952
[17] 0.46094309 0.41679483 0.05331187 0.02748224
#########################################################################################################
#########################################################################################################
# STEP 12: INTERPRET THE PRINCIPAL COMPONENTS
# What do the selected components stand for? We need to interpret them. Based on the loadings, we can
# appropriately name each column (a rather subjective process). The loadings give the weights of the
# variables in each of the principal components. We can see the loadings using the following code:
fa.sort(PCA2$loadings) # The loadings tell you how strong the relationship is between each variable

Loadings:
                  PC1    PC2    PC3    PC4    PC5    PC6    PC7    PC8    PC9   
DriveStat          0.624  0.171  0.186                0.215 -0.138  0.252       
KidsEZR.Cln        0.617  0.157 -0.345 -0.146               -0.201 -0.246       
KidsEZR.Liability -0.607                0.176        -0.171 -0.526         0.177
KidsEZR.Toys       0.601  0.447  0.154  0.103        -0.218                     
PF.Brand           0.584  0.156 -0.470         0.144  0.178                0.194
KidsEZR.Rltnshp   -0.529  0.233 -0.122 -0.237  0.318  0.306  0.352  0.427       
PF.Comp           -0.396 -0.260 -0.123  0.256  0.321  0.373        -0.370       
KidsEZR.Car        0.138 -0.730  0.122 -0.211 -0.349  0.190                     
PF.EasyStart      -0.330  0.587  0.306  0.168 -0.282  0.165        -0.160       
PF.GeoFlex         0.237 -0.557  0.329  0.123        -0.218 -0.309  0.267       
AnnInc                    0.277  0.502 -0.101 -0.229         0.110 -0.209  0.349
PF.TimeFlex               0.190  0.277 -0.576  0.237 -0.331 -0.186        -0.409
SocialMed                 0.301 -0.175  0.573 -0.229                0.368 -0.161
EmployStat                              0.504  0.308                0.147 -0.473
KidsEZR.CPR        0.131 -0.111  0.367  0.451  0.197 -0.238  0.383 -0.431       
Age                0.426 -0.134  0.152         0.481  0.437  0.122              
Parent             0.192 -0.171  0.417  0.147 -0.198  0.422 -0.205  0.227  0.132
PF.Autonomy        0.112 -0.316 -0.343        -0.418 -0.249  0.542  0.110       
Race                             0.341         0.343 -0.373  0.290  0.310  0.434
Sex                             -0.284  0.206  0.363 -0.238 -0.233         0.428

                 PC1   PC2   PC3   PC4   PC5   PC6   PC7   PC8   PC9
SS loadings    2.734 1.965 1.664 1.481 1.431 1.315 1.253 1.097 1.046
Proportion Var 0.137 0.098 0.083 0.074 0.072 0.066 0.063 0.055 0.052
Cumulative Var 0.137 0.235 0.318 0.392 0.464 0.529 0.592 0.647 0.699
                       # (constituting the component) with the component itself.
# A very low communality flags an original column that is poorly represented by the nine
# retained factors.
round(data.frame(PCA2$communality), digits = 1)
#########################################################################################################
#########################################################################################################
# STEP 13: TRANSFORM THE DATA USING THE PRINCIPAL COMPONENTS
# After doing PCA and narrowing down to the key components, our data is transformed from 20 columns to
# 9 columns. Moreover, now that we have interpreted the new columns, we need to fill them up. For every
# observation, we need to know its value for PC1 through PC9.
# principal() stores the component scores in `$scores`; spell the name out in full rather than
# relying on partial matching of `$score`.
score1 <- data.frame(PCA2$scores)
# Take a look at the transformed data.
head(score1)

# Notice the lack of correlation among the different components.
allcorrelations2 <- cor(score1)
corrplot(allcorrelations2)

# Notice the mean and standard deviation for each component.
round(mean(score1[,1]),1) # Mean of principal component one.
[1] 0
round(sd(score1[,1]),1) # Standard deviation of principal component one.
[1] 1
round(mean(score1[,2]),1) # Mean of principal component two.
[1] 0
round(sd(score1[,2]),1) # Standard deviation of principal component two.
[1] 1
# More efficient way to find mean and sd of each principal component:
round(apply(score1, 2, FUN = mean),1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 
  0   0   0   0   0   0   0   0   0 
round(apply(score1, 2, FUN = sd),1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 
  1   1   1   1   1   1   1   1   1 
# Even more efficient way to find the mean of each principal component:
round(colMeans(score1), 1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 
  0   0   0   0   0   0   0   0   0 
#########################################################################################################
# STEP 14: PERFORM REGRESSION USING PRINCIPAL COMPONENTS
# Regress the dependent variable on the components. Because the component scores are mutually
# uncorrelated, the coefficients can be interpreted without worrying about multicollinearity.
reg <- lm(Ride_Share8$ZumYN ~ ., data = score1)
summary(reg)

Call:
lm(formula = Ride_Share8$ZumYN ~ ., data = score1)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91321 -0.00112  0.18567  0.27277  0.52012 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.765306   0.043609  17.549   <2e-16 ***
PC1         -0.022015   0.043833  -0.502    0.617    
PC2         -0.053372   0.043833  -1.218    0.227    
PC3         -0.032021   0.043833  -0.731    0.467    
PC4         -0.010944   0.043833  -0.250    0.803    
PC5          0.071028   0.043833   1.620    0.109    
PC6          0.011533   0.043833   0.263    0.793    
PC7          0.005747   0.043833   0.131    0.896    
PC8          0.030861   0.043833   0.704    0.483    
PC9          0.041794   0.043833   0.953    0.343    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4317 on 88 degrees of freedom
Multiple R-squared:  0.06827,   Adjusted R-squared:  -0.02702 
F-statistic: 0.7164 on 9 and 88 DF,  p-value: 0.6925
# In the lm command, a "." tells R to pick all the columns in the table specified after "data = ", in
# this case, the score table. So, the above command regresses ZumYN on all nine components.
#########################################################################################################
# STEP 15: IMPROVE INTERPRETATION OF PRINCIPAL COMPONENTS
# We can take additional steps to better align our components with the original columns so they can be
# interpreted more easily.
# Rotating the Components
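# For this purpose, we use an option in PCA called the "varimax" rotation. This rotation maximizes the
# variance of the squared loadings within each component, so each variable tends to load strongly on
# only a few components. Applying this rotation will not change the total amount of variation
# explained by the retained components; it only redistributes that variation among them.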
# Rotated PCA
# Refit the nine-component PCA, this time with the varimax rotation.
PCA3 <- principal(Ride_Share9, nfactors = 9, rotate = "varimax")
# Inspect the sorted loadings to interpret the rotated components.
fa.sort(PCA3$loadings)

Loadings:
                  RC1    RC2    RC6    RC5    RC7    RC4    RC3    RC8    RC9   
Age                0.739  0.143 -0.171         0.182                0.140       
KidsEZR.Liability -0.704        -0.233         0.258  0.137        -0.212  0.256
PF.Brand           0.586 -0.196  0.117 -0.273 -0.162               -0.265  0.319
DriveStat          0.539  0.272  0.317 -0.118  0.189  0.235  0.105              
PF.GeoFlex                0.763  0.146                      -0.216  0.143  0.161
Parent             0.200  0.606                       0.201  0.295 -0.127 -0.136
KidsEZR.Car               0.579 -0.176 -0.132 -0.391 -0.331 -0.110        -0.336
PF.Comp                         -0.799         0.140                0.117  0.143
KidsEZR.Toys       0.284         0.576 -0.232         0.210  0.274  0.142  0.201
KidsEZR.Rltnshp          -0.412 -0.179  0.727               -0.118 -0.328 -0.190
Race                      0.111  0.281  0.660        -0.210  0.103  0.185  0.368
KidsEZR.Cln        0.312 -0.137  0.283 -0.640                      -0.105  0.174
PF.Autonomy                      0.121        -0.859        -0.139              
PF.TimeFlex       -0.111 -0.120  0.460         0.506 -0.458 -0.204        -0.255
SocialMed                        0.122                0.800                     
AnnInc                           0.136               -0.132  0.726              
PF.EasyStart      -0.249 -0.275                0.230  0.287  0.604        -0.257
KidsEZR.CPR                                                  0.129  0.875       
EmployStat                                     0.207  0.410 -0.392  0.478       
Sex                                                         -0.132         0.734

                 RC1   RC2   RC6   RC5   RC7   RC4   RC3   RC8   RC9
SS loadings    1.999 1.722 1.695 1.562 1.471 1.466 1.422 1.376 1.272
Proportion Var 0.100 0.086 0.085 0.078 0.074 0.073 0.071 0.069 0.064
Cumulative Var 0.100 0.186 0.271 0.349 0.422 0.496 0.567 0.636 0.699
# score2 holds the rotated component scores, i.e. the transformed data.
# Use the full element name `$scores` rather than relying on partial matching of `$score`.
score2 <- data.frame(PCA3$scores)
head(score2)
# Efficient way to find mean and sd of each rotated component:
round(apply(score2, 2, FUN = mean),1)
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9 
  0   0   0   0   0   0   0   0   0 
round(apply(score2, 2, FUN = sd),1)
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9 
  1   1   1   1   1   1   1   1   1 
# Regress the dependent variable on the rotated components.
reg <- lm(Ride_Share8$ZumYN ~ ., data = score2)
summary(reg)

Call:
lm(formula = Ride_Share8$ZumYN ~ ., data = score2)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.91321 -0.00112  0.18567  0.27277  0.52012 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.76531    0.04361  17.549   <2e-16 ***
RC1          0.02231    0.04383   0.509    0.612    
RC2          0.01010    0.04383   0.230    0.818    
RC6         -0.04343    0.04383  -0.991    0.324    
RC5          0.05719    0.04383   1.305    0.195    
RC7         -0.00693    0.04383  -0.158    0.875    
RC4         -0.02763    0.04383  -0.630    0.530    
RC3         -0.04746    0.04383  -1.083    0.282    
RC8         -0.02230    0.04383  -0.509    0.612    
RC9          0.05541    0.04383   1.264    0.210    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.4317 on 88 degrees of freedom
Multiple R-squared:  0.06827,   Adjusted R-squared:  -0.02702 
F-statistic: 0.7164 on 9 and 88 DF,  p-value: 0.6925
#########################################################################################################
# STEP 16: RUN CLUSTER ANALYSIS ON ROTATED COMPONENTS
# Conduct a cluster analysis based on the factor scores. First create the distance.
d1 = dist(score2, method="euclidean")
# Run the command to conduct hierarchical cluster analysis
hc = hclust(d1, method = "ward")
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# Check out the height at which the clusters were merged, given that height is the measure of
# dissimilarity within clusters. Naturally, larger clusters will merge at a larger height.
# hc, the output from hclust command, has the height stored in it. Reverse it and look at the first 10
# elements.
plot(rev(hc$height)[1:10], type = "b", col = "blue", xlab = "Number of Clusters")

# It looks like going from three clusters to two clusters took a big compromise, compared to other
# mergers of clusters. This tells us that the clusters in a three-cluster solution are very dissimilar.
# The more clusters you choose, the more the clusters seem similar to one another.
# Visualize how the algorithm progressed.
plot(cut(as.dendrogram(hc),5)$upper, main = "Dendrogram for Cluster Analysis (Zoomed-In)", 
     leaflab = "none")

# Attach the cluster identities to the original data frame.
Ride_Share9$clus = cutree(hc,3) # Cut tree at three clusters.
# How many observations fall into each of the clusters?
table(Ride_Share9$clus)

 1  2  3 
31 34 33 
# Inspect cluster means.
clusterMeans <- round(aggregate(score2, by = list(Ride_Share9$clus), FUN = mean), digits=2)
clusterMeans
# Consider the difference in the means for each RC to determine for which RCs you should build a
# histogram. Focus on the RCs with the biggest difference.
apply(clusterMeans, 2, diff)
     Group.1   RC1   RC2   RC6   RC5   RC7   RC4   RC3   RC8   RC9
[1,]       1 -0.18 -0.01  0.46  0.07  0.92 -0.67 -0.51  1.38 -0.21
[2,]       1  1.14 -0.03 -0.79 -0.62 -0.74 -0.15  0.71 -0.04 -0.05
# Given the differences, choose RC1, RC6, RC7, and RC8.

#########################################################################################################
# STEP 17: DISTRIBUTION OF THE ROTATED COMPONENTS ACROSS THE CLUSTERS
# Histograms with overlaid distributions
hist(score2$RC1[Ride_Share9$clus==1] , xlab = "RC1", ylab = "# of Observations", col = rgb(1,0,0,0.5),
     xlim = c(-3,3), ylim = c(0,15), main = "Histogram RC1")
hist(score2$RC1[Ride_Share9$clus==2], col = rgb(0,0,1,0.5), add = T)
hist(score2$RC1[Ride_Share9$clus==3], col = rgb(0,1,0,0.5), add = T)
legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"),
       col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5), rgb(0,1,0,0.5)), lwd = 10)
hist(score2$RC6[Ride_Share9$clus==1] , xlab = "RC6", ylab = "# of Observations", col = rgb(1,0,0,0.5),
     xlim = c(-5,5), ylim = c(0,10), main = "Histogram RC6")

hist(score2$RC6[Ride_Share9$clus==2], col = rgb(0,0,1,0.5), add = T)
hist(score2$RC6[Ride_Share9$clus==3], col = rgb(0,1,0,0.5), add = T)
legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"),
       col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5), rgb(0,1,0,0.5)), lwd = 10)

hist(score2$RC7[Ride_Share9$clus==1] , xlab = "RC7", ylab = "# of Observations", col = rgb(1,0,0,0.5),
     xlim = c(-3,4), ylim = c(0,20), main = "Histogram RC7")
hist(score2$RC7[Ride_Share9$clus==2], col = rgb(0,0,1,0.5), add = T)
hist(score2$RC7[Ride_Share9$clus==3], col = rgb(0,1,0,0.5), add = T)
legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"),
       col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5), rgb(0,1,0,0.5)), lwd = 10)

hist(score2$RC8[Ride_Share9$clus==1] , xlab = "RC8", ylab = "# of Observations", col = rgb(1,0,0,0.5),
     xlim = c(-3,4), ylim = c(0,15), main = "Histogram RC8")
hist(score2$RC8[Ride_Share9$clus==2], col = rgb(0,0,1,0.5), add = T)
hist(score2$RC8[Ride_Share9$clus==3], col = rgb(0,1,0,0.5), add = T)
legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"),
       col = c(rgb(1,0,0,0.5), rgb(0,0,1,0.5), rgb(0,1,0,0.5)), lwd = 10)

# Calculate the average Age across the various clusters.
aggregate(Ride_Share9$Age, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average AnnInc across the various clusters.
aggregate(Ride_Share9$AnnInc, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average PFComp across the various clusters.
aggregate(Ride_Share9$PF.Comp, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average EmployStat across the various clusters.
aggregate(Ride_Share9$EmployStat, by = list(Ride_Share9$clus), FUN = mean)
#########################################################################################################
# STEP 18: MEASURE TOTAL TIME
# The final phase is to report the time it took to run the code.
endTime=proc.time()[3]  # Records current time to calculate overall code's run-time
cat("This code took ", endTime-startTime, " seconds\n")
This code took  270.86  seconds
---
title: "Marketing Research Final Project"
output: html_notebook
---

```{r}
#########################################################################################################
# STEP 1: PERFORM STANDARD SETUP

# Start from an empty workspace by removing every object in the global environment.
rm(list = ls())

# Remember when the run began. Element 3 of proc.time() is the elapsed (wall-clock) time in seconds;
# STEP 18 subtracts this value from the time at the end of the run.
startTime <- proc.time()[3]

# Fix the random seed, so that every user receives the same results.
set.seed(123)

# Attach the libraries, one per line, in the original order (repeated calls removed). The order
# matters: a package attached later masks same-named functions of packages attached earlier.
library(tidyverse)
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)
library(dplyr)
library(sqldf)
library(data.table)
library(readr)
library(modelr)
library(naniar)
library(knitr)
library(markdown)
library(rmarkdown)
library(survey)
library(sandwich)
library(plyr)
library(lmtest)
library(randomForest)
library(bigrquery)
library(tokenizers)
library(factoextra)
library(jpeg)
library(rpart)
library(corrplot)
library(RColorBrewer)
library(GGally)
library(ggfortify)
library(rpart.plot)
library(psych)
library(GPArotation)
library(lubridate)
library(matrixStats)
library(png)
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 2: UNDERSTAND THE DATA

# Read the data set from the working directory.
Ride_Share <- read.csv("Ride_Share.csv")

# Object class, dimensions, column names and two views of the internal structure (base R and dplyr).
class(Ride_Share)
dim(Ride_Share)
names(Ride_Share)
str(Ride_Share)
glimpse(Ride_Share)

# Per-column summary statistics.
summary(Ride_Share)

# The first and the last 15 rows.
head(Ride_Share, n = 15)
tail(Ride_Share, n = 15)

# Number of distinct values in the AnnInc column.
length(unique(Ride_Share$AnnInc))
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 3: VISUALIZE THE DATA (PART I)

# For a handful of numeric variables, draw a histogram to get a sense of the distribution and print a
# frequency table of the underlying values.
hist(Ride_Share[["Age"]], main = "Distribution of Age", xlab = "Age")
table(Ride_Share[["Age"]])

hist(Ride_Share[["Parent"]], main = "Distribution of Parenthood", xlab = "Parent or Not?")
table(Ride_Share[["Parent"]])

hist(Ride_Share[["AnnInc"]], main = "Distribution of Income", xlab = "Income Bracket", breaks = 8)
table(Ride_Share[["AnnInc"]])

hist(Ride_Share[["EmployStat"]], main = "Distribution of Employment Status",
     xlab = "Employment Status")
table(Ride_Share[["EmployStat"]])

hist(Ride_Share[["SocialMed"]], main = "Distribution of Social Media",
     xlab = "Social Media Platform", breaks = 10)
table(Ride_Share[["SocialMed"]])
########################################################################################################
```

```{r}
#########################################################################################################
# STEP 4: VISUALIZE THE DATA (PART II)

# What factors matter to those who drive for a ride-share company or would consider doing so?
# Every PF.* ranking column gets the same treatment: a histogram followed by a frequency table.
# The vector maps each column name to the wording used in its plot title and axis label.
pf_labels <- c(
  PF.Comp      = "Compensation",
  PF.TimeFlex  = "Time Flexibility",
  PF.GeoFlex   = "Geographic Flexibility",
  PF.Autonomy  = "Autonomy",
  PF.Brand     = "Brand",
  PF.EasyStart = "Ease of Getting Started"
)

for (pf_col in names(pf_labels)) {
  ranking_label <- paste("Ranking of", pf_labels[[pf_col]])
  hist(Ride_Share[[pf_col]], main = paste("Distribution of", ranking_label),
       xlab = ranking_label, breaks = 10)
  print(table(Ride_Share[[pf_col]])) # Inside a loop the table must be printed explicitly.
}
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 5: VISUALIZE THE DATA (PART III)

# Scatter plots of ZumYN against selected variables, each overlaid with the least-squares line (red)
# from a one-variable linear regression of ZumYN on that variable.
with(Ride_Share, plot(Age, ZumYN,
                      main = "Regression for Age on Whether the Person Would Drive for Zum",
                      xlab = "Age", ylab = "Would You Drive for Zum?"))
abline(lm(ZumYN ~ Age, data = Ride_Share), col = "red")

with(Ride_Share, plot(Parent, ZumYN,
                      main = "Regression for Parenthood on Whether the Person Would Drive for Zum",
                      xlab = "Is the Person a Parent?", ylab = "Would You Drive for Zum?"))
abline(lm(ZumYN ~ Parent, data = Ride_Share), col = "red")

with(Ride_Share, plot(AnnInc, ZumYN,
                      main = "Regression for Annual Income on Whether the Person Would Drive for Zum",
                      xlab = "Annual Income", ylab = "Would You Drive for Zum?"))
abline(lm(ZumYN ~ AnnInc, data = Ride_Share), col = "red")

with(Ride_Share, plot(Sex, ZumYN,
                      main = "Regression for Sex on Whether the Person Would Drive for Zum",
                      xlab = "Sex", ylab = "Would You Drive for Zum?"))
abline(lm(ZumYN ~ Sex, data = Ride_Share), col = "red")
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 6: CLEAN / TIDY THE DATA

# Drop the column labeled "X".
Ride_Share[["X"]] <- NULL

# Reorder the columns by position so that the columns containing non-numeric data (34, 24 and 35)
# come last, then round-trip the result through Ride_Share2.csv.
Ride_Share_Reorder <- Ride_Share[, c(1:23, 25:33, 36:51, 34, 24, 35)]
write.csv(Ride_Share_Reorder, file = "Ride_Share2.csv")
Ride_Share2 <- read.csv("Ride_Share2.csv")
Ride_Share2[["X"]] <- NULL # write.csv() stored the row names; read.csv() brought them back as "X".

# The first 49 columns are the numeric variables.
Ride_Share3 <- data.frame(Ride_Share2)[, 1:49]

# Keep only the columns in which every response is present (no NA anywhere in the column).
Ride_Share4 <- Ride_Share3[, !vapply(Ride_Share3, anyNA, logical(1))]

# Data frame without the dependent variable.
Ride_Share5 <- Ride_Share4[, 1:23]
#########################################################################################################
```


```{r}
#########################################################################################################
# STEP 7: DETERMINE CORRELATIONS AMONG INDEPENDENT VARIABLES

# Correlation heat map of the independent variables.
ggcorr(Ride_Share5, nbreaks = 4, palette = "RdGy")

# The same information drawn from an explicit correlation matrix.
allcorrelations1 <- cor(Ride_Share5)
corrplot(allcorrelations1)

# For the multivariate regression analysis, variables that are highly correlated with other variables
# should be omitted: of each strongly correlated (+/-) pair, keep only one. Doing so helps to overcome
# the problem of multi-collinearity.

# Here the explanatory variables Age and AgeCat are (not surprisingly) strongly correlated, so AgeCat
# (column 20) is dropped. Note: this data frame includes the y variable.
Ride_Share6 <- Ride_Share4[, setdiff(1:24, 20)]

# The two "Other" columns (11 and 18) show almost no variation in their responses, which makes them
# virtually useless; drop them as well.
Ride_Share7 <- Ride_Share6[, setdiff(1:23, c(11, 18))]
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 8: PARTITION THE DATA INTO TRAINING, VALIDATION, AND TEST SETS

# Shuffle the rows; sample(n) returns a random permutation of 1..n.
n <- nrow(Ride_Share7)
Ride_Share8 <- Ride_Share7[sample(n), ]

# Row positions of the three consecutive slices of the shuffled data.
train_inds <- 1:round(n * .6)                   # first 60%: training
valid_inds <- (round(n * .6) + 1):round(n * .8) # next 20%: validation
test_inds  <- (round(n * .8) + 1):n             # last 20%: test

# Models are fitted on the training set and compared by their RMSE on the validation set. Only the
# model judged best is then applied to the test set.
training   <- Ride_Share8[train_inds, ]
validation <- Ride_Share8[valid_inds, ]
test       <- Ride_Share8[test_inds, ]
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 9: BUILD PREDICTION MODELS

#-------------------------------------------------------------------------------------------------------#
# Univariate Regression: ZumYN explained by Age alone.
model1 <- lm(ZumYN ~ Age, data = training)
summary(model1)

# According to this model, for every one unit increase in Age, the dependent variable increases by
# ~0.01 units. However, the result is not statistically significant.

# Training observations (red) with the fitted regression line.
plot(ZumYN ~ Age, data = training, col = 'red')
abline(model1)

# Predict the validation rows and measure how far the predictions are from the observed values.
prediction1 <- predict(model1, newdata = validation)
validationErrors1 <- validation$ZumYN - prediction1

# Root mean squared error on the validation data.
validationRMSE1 <- sqrt(mean(validationErrors1^2))
cat('Validation RMSE = ', validationRMSE1,'\n')

# The goal is to minimize RMSE without overfitting. The RMSE is in the same units as the y variable.
#--------------------------------------------------------------------------------------------------------#
```

```{r}
#--------------------------------------------------------------------------------------------------------#
# Multivariate Regression: ZumYN explained by every other column of the training set.
model2 <- lm(ZumYN ~ ., data = training)
summary(model2)

# Validation-set predictions and their errors.
prediction2 <- predict(model2, newdata = validation)
validationErrors2 <- validation$ZumYN - prediction2

# Root mean squared error on the validation data.
validationRMSE2 <- sqrt(mean(validationErrors2^2))
cat('Validation RMSE = ', validationRMSE2,'\n')
#--------------------------------------------------------------------------------------------------------#
```

```{r}
#--------------------------------------------------------------------------------------------------------#
# Stepwise Regression: backward elimination starting from the full model (model2). step() refits the
# model from model2's own call, so the training data does not have to be passed again.
model3 <- step(model2, direction = "backward")
summary(model3)

# Validation-set predictions and their errors.
prediction3 <- predict(model3, newdata = validation)
validationErrors3 <- validation$ZumYN - prediction3

# Root mean squared error on the validation data.
validationRMSE3 <- sqrt(mean(validationErrors3^2))
cat('Validation RMSE = ', validationRMSE3,'\n')
#--------------------------------------------------------------------------------------------------------#
```

```{r}
#-------------------------------------------------------------------------------------------------------#
# Tree: recursive-partitioning model of ZumYN on every other column of the training set.

model4 <- rpart(ZumYN ~ ., data = training)

# Draw the fitted tree.
prp(model4, type = 1, extra = 1, fallen.leaves = TRUE, box.col = rainbow(80), varlen = 0,
    faclen = 0, digits = 2)

# Validation-set predictions and their errors.
prediction4 <- predict(model4, newdata = validation)
validationErrors4 <- validation$ZumYN - prediction4

# Root mean squared error on the validation data.
validationRMSE4 <- sqrt(mean(validationErrors4^2))
cat('Validation RMSE = ', validationRMSE4,'\n')
#-------------------------------------------------------------------------------------------------------#
```

```{r}
#-------------------------------------------------------------------------------------------------------#
# Gradient Boosting

# library() stops with an error when gbm is not installed; require() would only return FALSE and let
# the chunk fail later at the gbm() call.
library(gbm)

# Boosted model of ZumYN on every other column of the training set: 5 trees, 5-fold cross-validation,
# Bernoulli loss (which expects a 0/1 response).
model5 <- gbm(ZumYN ~ .,
              data = training,
              cv.folds = 5,
              distribution = "bernoulli",
              n.trees = 5,
              shrinkage = 0.1,
              interaction.depth = 3)
model5
summary(model5) # Relative influence of each predictor.

# For a Bernoulli model predict() returns values on the link (log-odds) scale unless
# type = "response" is given. Request probabilities so that the errors, and therefore the RMSE, are on
# the same scale as ZumYN and comparable with the RMSEs of the other models.
prediction5 <- predict(model5, validation, n.trees = 5, type = "response")
validationErrors5 <- validation$ZumYN - prediction5

# Root mean squared error on the validation data.
validationRMSE5 <- sqrt(mean(validationErrors5^2))
cat('Validation RMSE = ', validationRMSE5,'\n')
#-------------------------------------------------------------------------------------------------------#
```

```{r}
#-------------------------------------------------------------------------------------------------------#
# Random Forest: ZumYN modelled on every other column of the training set.

model6 <- randomForest(ZumYN ~ ., data = training)
summary(model6)

# Validation-set predictions and their errors.
prediction6 <- predict(model6, newdata = validation)
validationErrors6 <- validation$ZumYN - prediction6

# Root mean squared error on the validation data.
validationRMSE6 <- sqrt(mean(validationErrors6^2))
cat('Validation RMSE = ', validationRMSE6,'\n')

# Random Forest, by principle, randomizes the variable selection during each tree split, so it's not as
# prone to overfitting as some of the other models.
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 10: TEST ON THE TEST DATASET

# Check out the different RMSEs.
validationRMSE1
validationRMSE2
validationRMSE3
validationRMSE4
validationRMSE5
validationRMSE6

# We choose model 6 because it has consistently proven to be the better predictive model, given that it
# typically generates a lower RMSE than the other models while maintaining a low risk for overfitting.
predictiontest <- predict(model6, newdata = test)

# Append the model predictions to the test set. The column is named directly through `var`, so no
# follow-up rename() is needed; a bare rename(test, c(pred = ...)) works only while plyr's rename()
# masks dplyr's, i.e. it depends on the order in which the libraries were attached.
test <- add_predictions(test, model6, var = "Predicted_Outcome")

# View only the last two columns (the column before the predictions, and the predictions).
test[, (ncol(test) - 1):ncol(test)]

# Root mean squared error on the test data.
testErrors <- test$ZumYN - predictiontest
testRMSE <- sqrt(mean(testErrors^2))
cat('Test RMSE = ', testRMSE,'\n')
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 11: CONDUCT DIMENSIONALITY REDUCTION WITH PCA

# Run PCA on the explanatory variables (the first 20 columns of the shuffled data). All rows are used,
# so that as much data as possible goes into finding the principal components.
Ride_Share9 <- Ride_Share8[, 1:20]
PCA1 <- principal(Ride_Share9, rotate = "none")

# How many components (factors) should we focus on? Consider the eigenvalues to answer this question:
# keep the components whose eigenvalue is greater than one. Beyond that point, the cost of additional
# complexity outweighs the value of additional insight from adding the next dimension.

# Plot the eigenvalues to make this decision easy, with a dashed horizontal reference line at y = 1.
# If you don't see big eigenvalues in your plot, that tells you that the PCA is not really useful.
plot(PCA1$values, type = "b", pch = 19, col = "blue", main = "Eigenvalues",
     xlab = "Number of Principal Components", ylab = "Eigenvalues")
abline(h = 1, lty = "dashed")

# Eigenvalue of specific PCAx / Total eigenvalues = Percent of variation in the data explained by PCAx

# As is apparent, the number of principal components is equivalent to the number of explanatory
# variables in the data frame.

# There are nine factors whose eigenvalues are greater than 1. Rerun PCA to keep those components.
PCA2 <- principal(Ride_Share9, nfactors = 9, rotate = "none")
PCA2$values # Display only the eigenvalues
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 12: INTERPRET THE PRINCIPAL COMPONENTS

# What do the selected components stand for? They have to be interpreted, and naming them from the
# loadings is a rather subjective process. The loadings give the weight of each variable in each
# principal component, i.e. how strong the relationship is between the variable and the component.
# Show the loadings, sorted:
fa.sort(PCA2$loadings)

# If any of the original columns are not represented in the nine factors, we would see the communality
# for that column to be very low.
round(data.frame(PCA2$communality), digits = 1)
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 13: TRANSFORM THE DATA USING THE PRINCIPAL COMPONENTS

# After doing PCA and narrowing down to the key components, our data is transformed from 20 columns to
# 9 columns. Now that the new columns have been interpreted, they need values: for every observation,
# we need to know its score on PC1 through PC9.

score1 <- data.frame(PCA2$scores)

# Take a look at the transformed data.
head(score1)

# Notice the lack of correlation among the different components.
allcorrelations2 <- cor(score1)
corrplot(allcorrelations2)

# Notice the mean and standard deviation for each component, one at a time.
round(mean(score1[[1]]), 1) # Mean of principal component one.
round(sd(score1[[1]]), 1)   # Standard deviation of principal component one.
round(mean(score1[[2]]), 1) # Mean of principal component two.
round(sd(score1[[2]]), 1)   # Standard deviation of principal component two.

# More efficient: the mean and sd of every principal component in one call each.
round(vapply(score1, mean, numeric(1)), 1)
round(vapply(score1, sd, numeric(1)), 1)

# Even more efficient way to find the mean of each principal component:
round(colMeans(score1), 1)
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 14: PERFORM REGRESSION USING PRINCIPAL COMPONENTS

# Regress the dependent variable on the components. Furthermore, we can interpret the coefficients
# without worrying about multicollinearity.
reg <- lm(Ride_Share8$ZumYN ~ ., data = score1)
summary(reg)

# In the lm command a "." tells R to pick all the columns in the table specified after "data = ", in
# this case the score table. So the above command regresses ZumYN on all nine components.
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 15: IMPROVE INTERPRETATION OF PRINCIPAL COMPONENTS

# We can take additional steps to better align our components with the original columns so they can be
# interpreted more easily.

# Rotating the Components
# For this purpose, we use an option in PCA called the "varimax" rotation. The rotation redistributes
# the loadings so that each variable tends to load strongly on only a few components. Applying it does
# not change the total amount of variation explained by the retained components.

# Rotated PCA

# Run a new nine-component PCA, this time with rotation.
PCA3 <- principal(Ride_Share9, nfactors = 9, rotate = "varimax")

# Now, consider the loadings to interpret the components.
fa.sort(PCA3$loadings)

# score2 holds the new scores, i.e. the data transformed onto the rotated components.
score2 <- data.frame(PCA3$scores)
head(score2)

# Mean and sd of every rotated component in one call each.
round(vapply(score2, mean, numeric(1)), 1)
round(vapply(score2, sd, numeric(1)), 1)

# Regress the dependent variable on the rotated components.
reg <- lm(Ride_Share8$ZumYN ~ ., data = score2)
summary(reg)
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 16: RUN CLUSTER ANALYSIS ON ROTATED COMPONENTS

# Conduct a cluster analysis based on the factor scores. First create the distance.
d1 <- dist(score2, method = "euclidean")

# Run the hierarchical cluster analysis. "ward.D" is the current name of the method that used to be
# called "ward"; spelling it out produces the same clustering without the rename message. ("ward.D2"
# is a different method and could produce different clusters.)
hc <- hclust(d1, method = "ward.D")

# Check out the height at which the clusters were merged, given that height is the measure of
# dissimilarity within clusters. Naturally, larger clusters will merge at a larger height.

# hc, the output from hclust command, has the height stored in it. Reverse it and look at the first 10
# elements.
plot(rev(hc$height)[1:10], type = "b", col = "blue", xlab = "Number of Clusters")

# It looks like going from three clusters to two clusters took a big compromise, compared to other
# mergers of clusters. This tells us that the clusters in a three-cluster solution are very dissimilar.
# The more clusters you choose, the more the clusters seem similar to one another.

# Visualize how the algorithm progressed (only the part of the dendrogram above height 5).
plot(cut(as.dendrogram(hc), 5)$upper, main = "Dendrogram for Cluster Analysis (Zoomed-In)",
     leaflab = "none")

# Attach the cluster identities to the original data frame.
Ride_Share9$clus <- cutree(hc, 3) # Cut tree at three clusters.

# How many observations fall into each of the clusters?
table(Ride_Share9$clus)

# Inspect cluster means.
clusterMeans <- round(aggregate(score2, by = list(Ride_Share9$clus), FUN = mean), digits = 2)
clusterMeans

# Consider the difference in the means for each RC to determine for which RCs you should build a
# histogram. Focus on the RCs with the biggest difference. The first column of clusterMeans (Group.1,
# the cluster id) is left out because differencing ids is meaningless. Row 1 of the result is
# cluster 2 minus cluster 1; row 2 is cluster 3 minus cluster 2.
apply(as.matrix(clusterMeans[, -1]), 2, diff)

# Given the differences, choose RC1, RC6, RC7, and RC8.
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 17: DISTRIBUTION OF THE ROTATED COMPONENTS ACROSS THE CLUSTERS

# Semi-transparent red, blue and green for clusters 1, 2 and 3, used for the bars and the legend.
cluster_cols <- c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5), rgb(0, 1, 0, 0.5))

# Draw the overlaid histograms of one rotated component: one histogram per cluster plus a legend.
#   scores     numeric vector with the component score of every observation
#   clusters   cluster id (1, 2 or 3) of every observation, parallel to `scores`
#   component  name of the component; used as x-axis label and in the title
#   xlim, ylim axis limits of the plot
#   cols       fill colours for clusters 1, 2 and 3, in that order
# Called for its plotting side effect; returns NULL invisibly.
plot_cluster_hist <- function(scores, clusters, component, xlim, ylim, cols = cluster_cols) {
  # The first histogram sets up the axes; the other two are drawn on top of it.
  hist(scores[clusters == 1], xlab = component, ylab = "# of Observations", col = cols[1],
       xlim = xlim, ylim = ylim, main = paste("Histogram", component))
  hist(scores[clusters == 2], col = cols[2], add = TRUE)
  hist(scores[clusters == 3], col = cols[3], add = TRUE)
  legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"), col = cols, lwd = 10)
  invisible(NULL)
}

# Histograms with overlaid distributions for the components chosen in STEP 16.
plot_cluster_hist(score2$RC1, Ride_Share9$clus, "RC1", xlim = c(-3, 3), ylim = c(0, 15))
plot_cluster_hist(score2$RC6, Ride_Share9$clus, "RC6", xlim = c(-5, 5), ylim = c(0, 10))
plot_cluster_hist(score2$RC7, Ride_Share9$clus, "RC7", xlim = c(-3, 4), ylim = c(0, 20))
plot_cluster_hist(score2$RC8, Ride_Share9$clus, "RC8", xlim = c(-3, 4), ylim = c(0, 15))

# Calculate the average Age across the various clusters.
aggregate(Ride_Share9$Age, by = list(Ride_Share9$clus), FUN = mean)

# Calculate the average AnnInc across the various clusters.
aggregate(Ride_Share9$AnnInc, by = list(Ride_Share9$clus), FUN = mean)

# Calculate the average PF.Comp across the various clusters.
aggregate(Ride_Share9$PF.Comp, by = list(Ride_Share9$clus), FUN = mean)

# Calculate the average EmployStat across the various clusters.
aggregate(Ride_Share9$EmployStat, by = list(Ride_Share9$clus), FUN = mean)
#########################################################################################################
```

```{r}
#########################################################################################################
# STEP 18: MEASURE TOTAL TIME

# Report how long the whole run took: the elapsed time now minus the elapsed time stored in startTime
# during STEP 1.
endTime <- proc.time()[3]
cat("This code took ", endTime - startTime, " seconds\n")
#########################################################################################################
```
