# STEP 1: PERFORM STANDARD SETUP
# Remove user-created objects from the global environment. Note that this does not detach
# packages or reset options, so it is not a substitute for starting a fresh R session.
rm(list = ls())
# Record the elapsed (wall-clock) time reported by proc.time(); element 3 is the elapsed
# time, not CPU time. It can be subtracted from a later reading to measure run time.
startTime <- proc.time()[3]
# Set the seed, so that every user receives the same results.
set.seed(123)
# Attach the libraries.
# plyr is attached first on purpose: if it is attached after dplyr (which tidyverse attaches),
# plyr masks dplyr's arrange(), count(), desc(), mutate(), rename(), summarise() and
# summarize(), and plyr itself warns that this "is likely to cause problems".
library(plyr)
library(tidyverse); library(gridExtra); library(grid); library(ggplot2); library(lattice)
Registered S3 methods overwritten by 'ggplot2':
method from
[.quosures rlang
c.quosures rlang
print.quosures rlang
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.1 v purrr  0.3.2
v tibble  2.1.1 v dplyr  0.8.1
v tidyr  0.8.3 v stringr 1.4.0
v readr  1.3.1 v forcats 0.4.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::combine() masks randomForest::combine()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x ggplot2::margin() masks randomForest::margin()
Attaching package: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:randomForest’:
combine
library(dplyr); library(sqldf); library(data.table); library(readr); library(modelr); library(naniar);
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
data.table 1.12.2 using 4 threads (see ?getDTthreads). Latest news: r-datatable.com
Attaching package: ‘data.table’
The following objects are masked from ‘package:dplyr’:
between, first, last
The following object is masked from ‘package:purrr’:
transpose
library(knitr); library(markdown); library(rmarkdown); library(survey); library(sandwich);
Loading required package: Matrix
Attaching package: ‘Matrix’
The following object is masked from ‘package:tidyr’:
expand
Loading required package: survival
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
library(plyr); library(lmtest); library(randomForest); library(bigrquery); library(tokenizers);
---------------------------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
---------------------------------------------------------------------------------------------------
Attaching package: ‘plyr’
The following objects are masked from ‘package:dplyr’:
arrange, count, desc, failwith, id, mutate, rename, summarise, summarize
The following object is masked from ‘package:purrr’:
compact
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
library(factoextra); library(jpeg); library(rpart); library(corrplot); library(RColorBrewer);
Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
corrplot 0.84 loaded
library(GGally); library(ggfortify); library(factoextra); library(rpart.plot); library(psych);
Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
Attaching package: ‘GGally’
The following object is masked from ‘package:dplyr’:
nasa
Attaching package: ‘psych’
The following object is masked from ‘package:modelr’:
heights
The following objects are masked from ‘package:ggplot2’:
%+%, alpha
The following object is masked from ‘package:randomForest’:
outlier
library(GPArotation); library(lubridate); library(matrixStats); library(png); library(grid)
Attaching package: ‘lubridate’
The following object is masked from ‘package:plyr’:
here
The following objects are masked from ‘package:data.table’:
hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year
The following object is masked from ‘package:base’:
date
Attaching package: ‘matrixStats’
The following object is masked from ‘package:plyr’:
count
The following object is masked from ‘package:dplyr’:
count
#########################################################################################################
# STEP 2: UNDERSTAND THE DATA
# Read the survey responses from the CSV file in the working directory.
Ride_Share <- read.csv(file = "Ride_Share.csv")
# Report the class of the loaded object.
class(Ride_Share)
[1] "data.frame"
# Show the number of rows and columns in the dataset.
dim(Ride_Share)
[1] 98 52
# List the column names.
colnames(Ride_Share)
[1] "Age"
[2] "Parent"
[3] "AnnInc"
[4] "DriveStat"
[5] "Platform"
[6] "PF.Comp"
[7] "PF.TimeFlex"
[8] "PF.GeoFlex"
[9] "PF.Autonomy"
[10] "PF.Brand"
[11] "PF.EasyStart"
[12] "PF.Other"
[13] "EmployerLikeFactor"
[14] "MultipleCo"
[15] "WhichApp.Habit"
[16] "WhichApp.Demand"
[17] "WhichApp.Bonus"
[18] "WhichApp.Surge"
[19] "WhichApp.Riders"
[20] "WhichApp.Traffic"
[21] "WhichApp.Other"
[22] "TimetoFRide"
[23] "FullTimer"
[24] "HoursAvail"
[25] "KidsMoreLess"
[26] "ComfortLvl"
[27] "KidsEZR.Car"
[28] "KidsEZR.CPR"
[29] "KidsEZR.Toys"
[30] "KidsEZR.Cln"
[31] "KidsEZR.Liability"
[32] "KidsEZR.Rltnshp"
[33] "KidsEZR.Other"
[34] "ZumYN"
[35] "Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not."
[36] "PreventNotConsidered"
[37] "PreventTime"
[38] "PreventSafety"
[39] "PreventNoCar"
[40] "PreventOther"
[41] "DropOff"
[42] "DropWComp"
[43] "DriveWhnAvail"
[44] "WhyNotDriveKids"
[45] "WeekdayAvail"
[46] "PaxAgeGapComf"
[47] "Sex"
[48] "AgeCat"
[49] "Race"
[50] "EmployStat"
[51] "SocialMed"
[52] "X"
# Compact display of the data frame's internal structure (type and first values per column).
str(Ride_Share)
'data.frame': 98 obs. of 52 variables:
$ Age : int 33 35 28 29 35 28 26 31 54 30 ...
$ Parent : int 0 0 0 0 1 1 1 1 1 1 ...
$ AnnInc : int 2 1 2 2 1 1 3 1 2 3 ...
$ DriveStat : int 1 1 1 1 1 1 1 1 1 1 ...
$ Platform : int 1 1 1 2 2 1 1 1 1 1 ...
$ PF.Comp : int 3 1 1 2 1 1 2 2 2 5 ...
$ PF.TimeFlex : int 1 3 3 3 4 2 3 1 1 1 ...
$ PF.GeoFlex : int 2 4 4 1 3 4 4 5 6 2 ...
$ PF.Autonomy : int 4 5 5 4 5 5 1 3 5 6 ...
$ PF.Brand : int 5 6 6 5 2 6 5 4 3 3 ...
$ PF.EasyStart : int 6 2 2 6 6 3 6 6 4 4 ...
$ PF.Other : int 7 7 7 7 7 7 7 7 7 7 ...
$ EmployerLikeFactor : int 1 2 1 6 1 2 4 6 2 1 ...
$ MultipleCo : int 1 0 0 0 0 0 0 0 1 1 ...
$ WhichApp.Habit : int 2 NA NA NA NA NA NA NA 5 5 ...
$ WhichApp.Demand : int 4 NA NA NA NA NA NA NA 2 1 ...
$ WhichApp.Bonus : int 1 NA NA NA NA NA NA NA 4 2 ...
$ WhichApp.Surge : int 5 NA NA NA NA NA NA NA 1 3 ...
$ WhichApp.Riders : int 3 NA NA NA NA NA NA NA 6 4 ...
$ WhichApp.Traffic : int 6 NA NA NA NA NA NA NA 3 6 ...
$ WhichApp.Other : int 7 NA NA NA NA NA NA NA 7 7 ...
$ TimetoFRide : int 3 2 2 1 1 3 2 2 1 2 ...
$ FullTimer : int 0 1 1 1 0 1 1 1 1 1 ...
$ HoursAvail : Factor w/ 11 levels "","1","1,2","1,2,3",..: 7 11 9 3 7 10 9 9 7 4 ...
$ KidsMoreLess : int 3 2 3 2 1 3 3 2 1 1 ...
$ ComfortLvl : int 4 10 7 0 4 5 5 1 3 7 ...
$ KidsEZR.Car : int 1 1 3 1 3 3 1 3 4 5 ...
$ KidsEZR.CPR : int 2 3 5 2 5 1 3 1 6 1 ...
$ KidsEZR.Toys : int 3 4 4 3 1 5 4 2 1 2 ...
$ KidsEZR.Cln : int 4 5 6 4 2 6 5 6 2 3 ...
$ KidsEZR.Liability : int 5 2 1 5 4 2 6 5 3 4 ...
$ KidsEZR.Rltnshp : int 6 6 2 6 6 4 2 4 5 6 ...
$ KidsEZR.Other : int 7 7 7 7 7 7 7 7 7 7 ...
$ ZumYN : int 1 1 1 1 1 1 1 0 1 1 ...
$ Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not.: Factor w/ 98 levels "Definitely. I like the family related vibe. ",..: 86 25 19 37 45 46 67 56 3 93 ...
$ PreventNotConsidered : int NA NA NA NA NA NA NA NA NA NA ...
$ PreventTime : int NA NA NA NA NA NA NA NA NA NA ...
$ PreventSafety : int NA NA NA NA NA NA NA NA NA NA ...
$ PreventNoCar : int NA NA NA NA NA NA NA NA NA NA ...
$ PreventOther : int NA NA NA NA NA NA NA NA NA NA ...
$ DropOff : int NA NA NA NA NA NA NA NA NA NA ...
$ DropWComp : int NA NA NA NA NA NA NA NA NA NA ...
$ DriveWhnAvail : int NA NA NA NA NA NA NA NA NA NA ...
$ WhyNotDriveKids : int NA NA NA NA NA NA NA NA NA NA ...
$ WeekdayAvail : int NA NA NA NA NA NA NA NA NA NA ...
$ PaxAgeGapComf : int NA NA NA NA NA NA NA NA NA NA ...
$ Sex : int 2 2 1 1 1 2 1 1 2 2 ...
$ AgeCat : int 2 3 2 2 3 2 2 2 4 2 ...
$ Race : int 3 1 1 1 1 2 1 1 3 4 ...
$ EmployStat : int 3 3 3 3 3 3 3 3 4 3 ...
$ SocialMed : int 1 1 3 3 1 6 1 1 2 1 ...
$ X : logi NA NA NA NA NA NA ...
# Same structural overview, using glimpse() instead of str().
glimpse(Ride_Share)
Observations: 98
Variables: 52
$ Age <int> ...
$ Parent <int> ...
$ AnnInc <int> ...
$ DriveStat <int> ...
$ Platform <int> ...
$ PF.Comp <int> ...
$ PF.TimeFlex <int> ...
$ PF.GeoFlex <int> ...
$ PF.Autonomy <int> ...
$ PF.Brand <int> ...
$ PF.EasyStart <int> ...
$ PF.Other <int> ...
$ EmployerLikeFactor <int> ...
$ MultipleCo <int> ...
$ WhichApp.Habit <int> ...
$ WhichApp.Demand <int> ...
$ WhichApp.Bonus <int> ...
$ WhichApp.Surge <int> ...
$ WhichApp.Riders <int> ...
$ WhichApp.Traffic <int> ...
$ WhichApp.Other <int> ...
$ TimetoFRide <int> ...
$ FullTimer <int> ...
$ HoursAvail <fct> ...
$ KidsMoreLess <int> ...
$ ComfortLvl <int> ...
$ KidsEZR.Car <int> ...
$ KidsEZR.CPR <int> ...
$ KidsEZR.Toys <int> ...
$ KidsEZR.Cln <int> ...
$ KidsEZR.Liability <int> ...
$ KidsEZR.Rltnshp <int> ...
$ KidsEZR.Other <int> ...
$ ZumYN <int> ...
$ Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not. <fct> ...
$ PreventNotConsidered <int> ...
$ PreventTime <int> ...
$ PreventSafety <int> ...
$ PreventNoCar <int> ...
$ PreventOther <int> ...
$ DropOff <int> ...
$ DropWComp <int> ...
$ DriveWhnAvail <int> ...
$ WhyNotDriveKids <int> ...
$ WeekdayAvail <int> ...
$ PaxAgeGapComf <int> ...
$ Sex <int> ...
$ AgeCat <int> ...
$ Race <int> ...
$ EmployStat <int> ...
$ SocialMed <int> ...
$ X <lgl> ...
# Per-column summary statistics (quantiles, means, NA counts, factor level counts).
summary(Ride_Share)
Age Parent AnnInc DriveStat Platform PF.Comp
Min. :25.00 Min. :0.0000 Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.00
1st Qu.:30.00 1st Qu.:1.0000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:1.00
Median :33.00 Median :1.0000 Median :2.000 Median :2.000 Median :1.0 Median :2.00
Mean :35.31 Mean :0.9592 Mean :2.459 Mean :1.643 Mean :1.2 Mean :2.01
3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:1.0 3rd Qu.:3.00
Max. :57.00 Max. :1.0000 Max. :4.000 Max. :2.000 Max. :2.0 Max. :6.00
NA's :63
PF.TimeFlex PF.GeoFlex PF.Autonomy PF.Brand PF.EasyStart PF.Other
Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.00 Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.:3.000 1st Qu.:4.000 1st Qu.:5.00 1st Qu.:3.000 1st Qu.:7.000
Median :2.000 Median :4.000 Median :5.000 Median :5.00 Median :4.000 Median :7.000
Mean :2.204 Mean :3.582 Mean :4.296 Mean :5.02 Mean :4.051 Mean :6.837
3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:6.00 3rd Qu.:6.000 3rd Qu.:7.000
Max. :6.000 Max. :6.000 Max. :6.000 Max. :7.00 Max. :7.000 Max. :7.000
EmployerLikeFactor MultipleCo WhichApp.Habit WhichApp.Demand WhichApp.Bonus WhichApp.Surge
Min. :1.0 Min. :0.0000 Min. :1.00 Min. :1.000 Min. :1.00 Min. :1.000
1st Qu.:1.5 1st Qu.:0.0000 1st Qu.:1.75 1st Qu.:2.750 1st Qu.:2.00 1st Qu.:2.500
Median :2.0 Median :0.0000 Median :2.50 Median :3.500 Median :2.00 Median :4.500
Mean :2.4 Mean :0.2286 Mean :2.75 Mean :3.125 Mean :3.00 Mean :3.625
3rd Qu.:2.5 3rd Qu.:0.0000 3rd Qu.:3.50 3rd Qu.:4.000 3rd Qu.:4.25 3rd Qu.:5.000
Max. :6.0 Max. :1.0000 Max. :5.00 Max. :4.000 Max. :6.00 Max. :5.000
NA's :63 NA's :63 NA's :90 NA's :90 NA's :90 NA's :90
WhichApp.Riders WhichApp.Traffic WhichApp.Other TimetoFRide FullTimer HoursAvail
Min. :1.00 Min. :1.00 Min. :7 Min. :1.0 Min. :0.0000 :63
1st Qu.:3.00 1st Qu.:2.75 1st Qu.:7 1st Qu.:1.0 1st Qu.:1.0000 3 :13
Median :4.00 Median :6.00 Median :7 Median :2.0 Median :1.0000 2 : 6
Mean :4.00 Mean :4.50 Mean :7 Mean :1.8 Mean :0.8286 4 : 6
3rd Qu.:5.25 3rd Qu.:6.00 3rd Qu.:7 3rd Qu.:2.0 3rd Qu.:1.0000 3,4 : 3
Max. :6.00 Max. :6.00 Max. :7 Max. :3.0 Max. :1.0000 2,3,4 : 2
NA's :90 NA's :90 NA's :90 NA's :63 NA's :63 (Other): 5
KidsMoreLess ComfortLvl KidsEZR.Car KidsEZR.CPR KidsEZR.Toys KidsEZR.Cln
Min. :1.000 Min. : 0.000 Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.000 1st Qu.: 4.000 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:4.000
Median :2.000 Median : 5.000 Median :3.000 Median :3.000 Median :4.000 Median :5.000
Mean :2.029 Mean : 5.514 Mean :3.143 Mean :3.031 Mean :3.694 Mean :4.612
3rd Qu.:3.000 3rd Qu.: 7.000 3rd Qu.:4.000 3rd Qu.:4.000 3rd Qu.:5.000 3rd Qu.:6.000
Max. :3.000 Max. :10.000 Max. :6.000 Max. :6.000 Max. :6.000 Max. :7.000
NA's :63 NA's :63
KidsEZR.Liability KidsEZR.Rltnshp KidsEZR.Other ZumYN
Min. :1.000 Min. :1.000 Min. :1.000 Min. :0.0000
1st Qu.:1.000 1st Qu.:2.000 1st Qu.:7.000 1st Qu.:1.0000
Median :2.000 Median :4.000 Median :7.000 Median :1.0000
Mean :2.755 Mean :3.878 Mean :6.888 Mean :0.7653
3rd Qu.:4.000 3rd Qu.:6.000 3rd Qu.:7.000 3rd Qu.:1.0000
Max. :6.000 Max. :6.000 Max. :7.000 Max. :1.0000
Zum.is.a.rideshare.service.that.enables.families.and.schools.to.arrange.rides.for.their.children.for.traveling.to.and.from.school.or.other.related.activities..such.as.ballet..soccer..or.music.lessons..Would.you.drive.for.Zum..Why.or.why.not.
Definitely. I like the family related vibe. : 1
Depends on the pay pretty much and if it is worth my time. : 1
good driving\n : 1
I'm not sure. It seems like a great idea, but an awful lot of liability. I also think it takes a special person to work exclusively with children. I think it's going to be very hard to ensure the safety of the children and I would be very fearful of being accused of something. : 1
I am already fine driving for Uber. There is no reason for me to drive for Zum. : 1
I am not sure. Driving someone elses children is a responsibility and a privilige. If I knew the parents, I may. If I know they needed a favor I would do it and not expect to be paid. It also would depend on the child :) : 1
(Other) :92
PreventNotConsidered PreventTime PreventSafety PreventNoCar PreventOther
Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
Median :0.00000 Median :1.0000 Median :0.0000 Median :0.00000 Median :0.00000
Mean :0.01587 Mean :0.5714 Mean :0.2698 Mean :0.07937 Mean :0.06349
3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
NA's :35 NA's :35 NA's :35 NA's :35 NA's :35
DropOff DropWComp DriveWhnAvail WhyNotDriveKids WeekdayAvail PaxAgeGapComf
Min. :0.0000 Min. :1.000 Min. :1 Min. :1.000 Min. :1.000 Min. :1.000
1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:1 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:1.000
Median :1.0000 Median :1.000 Median :1 Median :3.000 Median :2.000 Median :1.000
Mean :0.8254 Mean :1.615 Mean :1 Mean :2.833 Mean :2.263 Mean :1.239
3rd Qu.:1.0000 3rd Qu.:2.000 3rd Qu.:1 3rd Qu.:4.000 3rd Qu.:3.000 3rd Qu.:1.000
Max. :1.0000 Max. :4.000 Max. :1 Max. :4.000 Max. :3.000 Max. :2.000
NA's :35 NA's :46 NA's :87 NA's :92 NA's :41 NA's :52
Sex AgeCat Race EmployStat SocialMed X
Min. :1.000 Min. :2.000 Min. :1.000 Min. :1.00 Min. :1.000 Mode:logical
1st Qu.:1.000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:3.00 1st Qu.:1.000 NA's:98
Median :1.000 Median :2.000 Median :1.000 Median :3.00 Median :2.000
Mean :1.388 Mean :2.612 Mean :1.551 Mean :3.02 Mean :2.663
3rd Qu.:2.000 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.:3.00 3rd Qu.:3.750
Max. :2.000 Max. :5.000 Max. :6.000 Max. :4.00 Max. :7.000
# Print the first 15 rows, then the last 15 rows.
head(Ride_Share, 15)
tail(Ride_Share, 15)
# Number of distinct values recorded in the AnnInc column.
length(unique(Ride_Share[["AnnInc"]]))
[1] 4
#########################################################################################################

#########################################################################################################
# STEP 3: VISUALIZE THE DATA (PART I)
# Plot a histogram and print a frequency table for selected numeric variables to get a
# sense of how each one is distributed. First variable: Age.
hist(x = Ride_Share[["Age"]], xlab = "Age", main = "Distribution of Age")
table(Ride_Share[["Age"]])
25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 42 43 44 45 46 47 48 49 50 52 53 54 55 57
5 3 2 9 3 11 8 5 5 5 5 4 2 4 3 2 3 1 3 1 1 2 3 2 1 1 1 1 1 1
# Parent column: histogram, then the count of each recorded value.
hist(x = Ride_Share[["Parent"]], xlab = "Parent or Not?", main = "Distribution of Parenthood")
table(Ride_Share[["Parent"]])
0 1
4 94
# AnnInc column: histogram (8 suggested breaks), then the count of each recorded value.
hist(
  x = Ride_Share[["AnnInc"]],
  breaks = 8,
  xlab = "Income Bracket",
  main = "Distribution of Income"
)
table(Ride_Share[["AnnInc"]])
1 2 3 4
19 33 28 18
# EmployStat column: histogram, then the count of each recorded value.
hist(
  x = Ride_Share[["EmployStat"]],
  xlab = "Employment Status",
  main = "Distribution of Employment Status"
)
table(Ride_Share[["EmployStat"]])
1 2 3 4
3 4 79 12
# SocialMed column: histogram (10 suggested breaks), then the count of each recorded value.
hist(
  x = Ride_Share[["SocialMed"]],
  breaks = 10,
  xlab = "Social Media Platform",
  main = "Distribution of Social Media"
)
table(Ride_Share[["SocialMed"]])
1 2 3 4 6 7
45 18 10 2 19 4
########################################################################################################

#########################################################################################################
# STEP 4: VISUALIZE THE DATA (PART II)
# What factors matter to those who drive for a ride-share company or would consider doing so?
# PF.Comp (ranking of compensation): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.Comp"]],
  breaks = 10,
  xlab = "Ranking of Compensation",
  main = "Distribution of Ranking of Compensation"
)
table(Ride_Share[["PF.Comp"]])
1 2 3 4 5 6
46 25 15 6 4 2
# PF.TimeFlex (ranking of time flexibility): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.TimeFlex"]],
  breaks = 10,
  xlab = "Ranking of Time Flexibility",
  main = "Distribution of Ranking of Time Flexibility"
)
table(Ride_Share[["PF.TimeFlex"]])
1 2 3 4 5 6
31 38 14 9 5 1
# PF.GeoFlex (ranking of geographic flexibility): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.GeoFlex"]],
  breaks = 10,
  xlab = "Ranking of Geographic Flexibility",
  main = "Distribution of Ranking of Geographic Flexibility"
)
table(Ride_Share[["PF.GeoFlex"]])
1 2 3 4 5 6
6 11 30 31 11 9
# PF.Autonomy (ranking of autonomy): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.Autonomy"]],
  breaks = 10,
  xlab = "Ranking of Autonomy",
  main = "Distribution of Ranking of Autonomy"
)
table(Ride_Share[["PF.Autonomy"]])
1 2 3 4 5 6
4 4 14 26 37 13
# PF.Brand (ranking of brand): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.Brand"]],
  breaks = 10,
  xlab = "Ranking of Brand",
  main = "Distribution of Ranking of Brand"
)
table(Ride_Share[["PF.Brand"]])
1 2 3 4 5 6 7
3 5 7 7 28 46 2
# PF.EasyStart (ranking of ease of getting started): histogram, then the count of each rank.
hist(
  x = Ride_Share[["PF.EasyStart"]],
  breaks = 10,
  xlab = "Ranking of Ease of Getting Started",
  main = "Distribution of Ranking of Ease of Getting Started"
)
table(Ride_Share[["PF.EasyStart"]])
1 2 3 4 5 6 7
7 15 16 19 12 27 2
#########################################################################################################

#########################################################################################################
# STEP 5: VISUALIZE THE DATA (PART III)
# View scatter plot of select pairs of variables, each with a fitted least-squares line.

# Draw a scatter plot of ZumYN against one predictor column and overlay, in red, the line
# fitted by lm(ZumYN ~ <predictor>).
#
# data      : data frame containing a ZumYN column and the predictor column.
# predictor : name of the predictor column, as a character string.
# main      : plot title.
# xlab      : x-axis label.
# Returns the fitted lm object invisibly.
#
# The formula is built from bare column names and evaluated against `data`, rather than
# embedding `data$column` terms in the formula (which makes the `data` argument redundant
# and gives the model terms unusable names).
plotZumVersus <- function(data, predictor, main, xlab) {
  plot(data[[predictor]], data[["ZumYN"]],
       main = main, xlab = xlab, ylab = "Would You Drive for Zum?")
  fit <- lm(reformulate(predictor, response = "ZumYN"), data = data)
  abline(fit, col = "red")
  invisible(fit)
}

plotZumVersus(Ride_Share, "Age",
              main = "Regression for Age on Whether the Person Would Drive for Zum",
              xlab = "Age")

plotZumVersus(Ride_Share, "Parent",
              main = "Regression for Parenthood on Whether the Person Would Drive for Zum",
              xlab = "Is the Person a Parent?")

plotZumVersus(Ride_Share, "AnnInc",
              main = "Regression for Annual Income on Whether the Person Would Drive for Zum",
              xlab = "Annual Income")

plotZumVersus(Ride_Share, "Sex",
              main = "Regression for Sex on Whether the Person Would Drive for Zum",
              xlab = "Sex")

#########################################################################################################


#########################################################################################################
# STEP 7: DETERMINE CORRELATIONS AMONG INDEPENDENT VARIABLES
# Plot the pairwise correlations among the independent variables.
ggcorr(Ride_Share5, palette = "RdGy", nbreaks = 4)
# Second view of the same information: compute the correlation matrix explicitly and
# hand it to corrplot().
allcorrelations1 <- cor(Ride_Share5)
corrplot(allcorrelations1)
# Multi-collinearity: when two explanatory variables are strongly correlated (positively or
# negatively), only one of them should enter the multivariate regression.
# Here Age and AgeCat are strongly correlated with each other (unsurprisingly), so AgeCat is
# dropped. The selection keeps columns 1-19 and 21-24 of Ride_Share4, i.e. it omits column 20
# (presumably AgeCat -- confirm against Ride_Share4's column order).
# Note: the resulting data frame still includes the y variable.
Ride_Share6 <- Ride_Share4[, c(seq(1, 19), seq(21, 24))]
# The two "Other" columns are also removed because their responses show almost no variation.
# The selection omits columns 11 and 18 of Ride_Share6.
Ride_Share7 <- Ride_Share6[, c(seq(1, 10), seq(12, 17), seq(19, 23))]
#########################################################################################################
#########################################################################################################
# STEP 9: BUILD PREDICTION MODELS
#-------------------------------------------------------------------------------------------------------#
# Univariate regression: regress ZumYN on Age alone, using the training data,
# then print the fit summary.
model1 <- lm(formula = ZumYN ~ Age, data = training)
summary(model1)
Call:
lm(formula = ZumYN ~ Age, data = training)
Residuals:
Min 1Q Median 3Q Max
-0.8231 -0.2654 0.2433 0.2678 0.2888
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.623840 0.267117 2.335 0.0231 *
Age 0.003495 0.007478 0.467 0.6420
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4422 on 57 degrees of freedom
Multiple R-squared: 0.003818, Adjusted R-squared: -0.01366
F-statistic: 0.2185 on 1 and 57 DF, p-value: 0.642
# According to this model, for every one unit increase in Age, the dependent variable increases by
# ~0.0035 units (the estimated Age coefficient is 0.003495). However, the result is not
# statistically significant (p = 0.642).
# Scatter plot of the training data (red points) with model1's fitted line overlaid.
plot(ZumYN ~ Age, data = training, col = "red")
abline(model1)

# Predict ZumYN for the validation rows and take the difference from the observed values.
prediction1 <- predict(model1, newdata = validation)
validationErrors1 <- validation[["ZumYN"]] - prediction1
# Root-mean-squared error on the validation data.
validationRMSE1 <- sqrt(mean(validationErrors1 ^ 2))
cat("Validation RMSE = ", validationRMSE1, "\n")
Validation RMSE = 0.2963254
# The goal is to minimize RMSE without overfitting. The RMSE is in the same units as the y variable.
#--------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------#
# Multivariate regression: regress ZumYN on every other column of the training data
# (the `.` on the right-hand side), then print the fit summary.
model2 <- lm(formula = ZumYN ~ ., data = training)
summary(model2)
Call:
lm(formula = ZumYN ~ ., data = training)
Residuals:
Min 1Q Median 3Q Max
-0.74668 -0.26779 0.09332 0.27193 0.62306
Coefficients: (1 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.001671 3.025857 0.331 0.7424
Age -0.003869 0.009741 -0.397 0.6934
Parent NA NA NA NA
AnnInc -0.020526 0.061269 -0.335 0.7394
DriveStat 0.194189 0.149799 1.296 0.2025
PF.Comp -0.082994 0.101253 -0.820 0.4174
PF.TimeFlex -0.221208 0.110768 -1.997 0.0528 .
PF.GeoFlex -0.180741 0.112367 -1.608 0.1158
PF.Autonomy -0.154005 0.112415 -1.370 0.1785
PF.Brand -0.144544 0.098865 -1.462 0.1517
PF.EasyStart -0.186519 0.088724 -2.102 0.0420 *
KidsEZR.Car 0.165629 0.155280 1.067 0.2927
KidsEZR.CPR 0.214801 0.149624 1.436 0.1591
KidsEZR.Toys 0.160223 0.138975 1.153 0.2560
KidsEZR.Cln 0.106503 0.150285 0.709 0.4827
KidsEZR.Liability 0.190436 0.150804 1.263 0.2142
KidsEZR.Rltnshp 0.143406 0.141956 1.010 0.3186
Sex 0.146017 0.144750 1.009 0.3193
Race -0.010620 0.065297 -0.163 0.8716
EmployStat -0.137979 0.129572 -1.065 0.2935
SocialMed -0.002775 0.033308 -0.083 0.9340
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4551 on 39 degrees of freedom
Multiple R-squared: 0.2779, Adjusted R-squared: -0.07384
F-statistic: 0.7901 on 19 and 39 DF, p-value: 0.704
# Predict ZumYN for the validation rows with the multivariate model. predict() warns here
# because model2 is rank-deficient (one coefficient could not be estimated).
prediction2 <- predict(object = model2, newdata = validation)
prediction from a rank-deficient fit may be misleading
# Validation residuals for model2 and the corresponding root-mean-squared error.
validationErrors2 <- validation[["ZumYN"]] - prediction2
validationRMSE2 <- sqrt(mean(validationErrors2 ^ 2))
cat("Validation RMSE = ", validationRMSE2, "\n")
Validation RMSE = 0.3884708
#--------------------------------------------------------------------------------------------------------#
#--------------------------------------------------------------------------------------------------------#
# Stepwise Regression
# Backward elimination starting from the full model (model2): at each step, step() evaluates
# dropping each remaining term and removes the one giving the lowest AIC, stopping when no
# removal improves on the current model's AIC. The trace of each step is printed.
# No `data` argument is passed: step() has no such parameter (it would be silently absorbed
# by `...`); the refits reuse the data recorded in model2's call.
model3 <- step(model2, direction = "backward")
Start: AIC=-77.32
ZumYN ~ Age + Parent + AnnInc + DriveStat + PF.Comp + PF.TimeFlex +
PF.GeoFlex + PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car +
KidsEZR.CPR + KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability +
KidsEZR.Rltnshp + Sex + Race + EmployStat + SocialMed
Step: AIC=-77.32
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex +
PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR +
KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp +
Sex + Race + EmployStat + SocialMed
Df Sum of Sq RSS AIC
- SocialMed 1 0.00144 8.0788 -79.309
- Race 1 0.00548 8.0828 -79.280
- AnnInc 1 0.02325 8.1006 -79.150
- Age 1 0.03267 8.1100 -79.082
- KidsEZR.Cln 1 0.10402 8.1814 -78.565
- PF.Comp 1 0.13915 8.2165 -78.312
- Sex 1 0.21075 8.2881 -77.800
- KidsEZR.Rltnshp 1 0.21136 8.2887 -77.796
- EmployStat 1 0.23486 8.3122 -77.629
- KidsEZR.Car 1 0.23564 8.3130 -77.623
- KidsEZR.Toys 1 0.27528 8.3526 -77.343
<none> 8.0774 -77.320
- KidsEZR.Liability 1 0.33028 8.4076 -76.955
- DriveStat 1 0.34805 8.4254 -76.831
- PF.Autonomy 1 0.38871 8.4661 -76.547
- KidsEZR.CPR 1 0.42685 8.5042 -76.282
- PF.Brand 1 0.44272 8.5201 -76.172
- PF.GeoFlex 1 0.53585 8.6132 -75.530
- PF.TimeFlex 1 0.82600 8.9034 -73.575
- PF.EasyStart 1 0.91531 8.9927 -72.986
Step: AIC=-79.31
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex +
PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR +
KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp +
Sex + Race + EmployStat
Df Sum of Sq RSS AIC
- Race 1 0.00469 8.0835 -81.275
- AnnInc 1 0.02210 8.1009 -81.148
- Age 1 0.03184 8.1106 -81.077
- KidsEZR.Cln 1 0.11418 8.1930 -80.481
- PF.Comp 1 0.13775 8.2166 -80.312
- Sex 1 0.21664 8.2954 -79.748
- KidsEZR.Rltnshp 1 0.22740 8.3062 -79.672
- EmployStat 1 0.25021 8.3290 -79.510
- KidsEZR.Car 1 0.26100 8.3398 -79.433
<none> 8.0788 -79.309
- KidsEZR.Toys 1 0.28757 8.3664 -79.246
- DriveStat 1 0.34694 8.4257 -78.828
- KidsEZR.Liability 1 0.36219 8.4410 -78.722
- PF.Autonomy 1 0.38745 8.4663 -78.546
- PF.Brand 1 0.44204 8.5208 -78.166
- KidsEZR.CPR 1 0.44868 8.5275 -78.120
- PF.GeoFlex 1 0.54367 8.6225 -77.467
- PF.TimeFlex 1 0.85069 8.9295 -75.402
- PF.EasyStart 1 0.92189 9.0007 -74.934
Step: AIC=-81.28
ZumYN ~ Age + AnnInc + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex +
PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR +
KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp +
Sex + EmployStat
Df Sum of Sq RSS AIC
- AnnInc 1 0.02108 8.1046 -83.121
- Age 1 0.03180 8.1153 -83.043
- KidsEZR.Cln 1 0.12581 8.2093 -82.364
- PF.Comp 1 0.13835 8.2218 -82.274
- Sex 1 0.21233 8.2958 -81.745
- KidsEZR.Rltnshp 1 0.23538 8.3189 -81.582
- EmployStat 1 0.24592 8.3294 -81.507
<none> 8.0835 -81.275
- KidsEZR.Car 1 0.28408 8.3676 -81.237
- KidsEZR.Toys 1 0.29524 8.3787 -81.159
- DriveStat 1 0.34358 8.4271 -80.819
- PF.Autonomy 1 0.38789 8.4714 -80.510
- KidsEZR.Liability 1 0.38812 8.4716 -80.508
- PF.Brand 1 0.43747 8.5210 -80.165
- KidsEZR.CPR 1 0.45016 8.5337 -80.078
- PF.GeoFlex 1 0.54668 8.6302 -79.414
- PF.TimeFlex 1 0.85041 8.9339 -77.373
- PF.EasyStart 1 0.91895 9.0024 -76.922
Step: AIC=-83.12
ZumYN ~ Age + DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex +
PF.Autonomy + PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR +
KidsEZR.Toys + KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp +
Sex + EmployStat
Df Sum of Sq RSS AIC
- Age 1 0.02964 8.1342 -84.906
- KidsEZR.Cln 1 0.12014 8.2247 -84.253
- PF.Comp 1 0.14790 8.2525 -84.054
- Sex 1 0.21950 8.3241 -83.545
- KidsEZR.Rltnshp 1 0.23034 8.3349 -83.468
- EmployStat 1 0.23483 8.3394 -83.436
- KidsEZR.Car 1 0.27924 8.3838 -83.123
<none> 8.1046 -83.121
- KidsEZR.Toys 1 0.28249 8.3871 -83.100
- DriveStat 1 0.32629 8.4309 -82.793
- KidsEZR.Liability 1 0.37905 8.4836 -82.425
- PF.Autonomy 1 0.38841 8.4930 -82.360
- PF.Brand 1 0.43423 8.5388 -82.042
- KidsEZR.CPR 1 0.43915 8.5437 -82.008
- PF.GeoFlex 1 0.53771 8.6423 -81.331
- PF.TimeFlex 1 0.85619 8.9608 -79.196
- PF.EasyStart 1 0.95506 9.0596 -78.549
Step: AIC=-84.91
ZumYN ~ DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Cln + KidsEZR.Liability + KidsEZR.Rltnshp + Sex +
EmployStat
Df Sum of Sq RSS AIC
- KidsEZR.Cln 1 0.09785 8.2321 -86.201
- PF.Comp 1 0.14043 8.2746 -85.896
- KidsEZR.Rltnshp 1 0.20185 8.3361 -85.460
- Sex 1 0.20601 8.3402 -85.430
- EmployStat 1 0.21568 8.3499 -85.362
- KidsEZR.Car 1 0.25242 8.3866 -85.103
- KidsEZR.Toys 1 0.26180 8.3960 -85.037
<none> 8.1342 -84.906
- DriveStat 1 0.29969 8.4339 -84.771
- KidsEZR.Liability 1 0.35564 8.4898 -84.381
- PF.Autonomy 1 0.36086 8.4951 -84.345
- KidsEZR.CPR 1 0.41146 8.5457 -83.995
- PF.Brand 1 0.42364 8.5579 -83.911
- PF.GeoFlex 1 0.54487 8.6791 -83.081
- PF.TimeFlex 1 0.82710 8.9613 -81.193
- PF.EasyStart 1 0.93263 9.0668 -80.502
Step: AIC=-86.2
ZumYN ~ DriveStat + PF.Comp + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Liability + KidsEZR.Rltnshp + Sex + EmployStat
Df Sum of Sq RSS AIC
- PF.Comp 1 0.10638 8.3384 -87.443
- EmployStat 1 0.16557 8.3976 -87.026
- KidsEZR.Rltnshp 1 0.16795 8.4000 -87.009
- Sex 1 0.23329 8.4654 -86.552
- KidsEZR.Toys 1 0.23358 8.4656 -86.550
- KidsEZR.Car 1 0.23982 8.4719 -86.506
- PF.Autonomy 1 0.28133 8.5134 -86.218
<none> 8.2321 -86.201
- DriveStat 1 0.29949 8.5315 -86.092
- PF.Brand 1 0.33405 8.5661 -85.854
- PF.GeoFlex 1 0.46547 8.6975 -84.955
- KidsEZR.Liability 1 0.59866 8.8307 -84.059
- KidsEZR.CPR 1 0.65628 8.8883 -83.675
- PF.TimeFlex 1 0.73320 8.9653 -83.167
- PF.EasyStart 1 0.83876 9.0708 -82.476
Step: AIC=-87.44
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Liability + KidsEZR.Rltnshp + Sex + EmployStat
Df Sum of Sq RSS AIC
- EmployStat 1 0.16409 8.5025 -88.293
- PF.Autonomy 1 0.17653 8.5150 -88.207
- KidsEZR.Rltnshp 1 0.19038 8.5288 -88.111
- Sex 1 0.19551 8.5340 -88.076
- KidsEZR.Car 1 0.20976 8.5482 -87.977
- PF.Brand 1 0.22790 8.5663 -87.852
- KidsEZR.Toys 1 0.25723 8.5957 -87.650
<none> 8.3384 -87.443
- PF.GeoFlex 1 0.37099 8.7094 -86.875
- DriveStat 1 0.49261 8.8311 -86.057
- KidsEZR.CPR 1 0.56608 8.9045 -85.568
- KidsEZR.Liability 1 0.60034 8.9388 -85.341
- PF.TimeFlex 1 0.74728 9.0857 -84.379
- PF.EasyStart 1 0.93489 9.2733 -83.173
Step: AIC=-88.29
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Liability + KidsEZR.Rltnshp + Sex
Df Sum of Sq RSS AIC
- Sex 1 0.15393 8.6565 -89.235
- KidsEZR.Rltnshp 1 0.19986 8.7024 -88.922
- PF.Autonomy 1 0.23621 8.7387 -88.677
- PF.Brand 1 0.24207 8.7446 -88.637
- KidsEZR.Car 1 0.24866 8.7512 -88.593
- KidsEZR.Toys 1 0.25339 8.7559 -88.561
<none> 8.5025 -88.293
- PF.GeoFlex 1 0.43285 8.9354 -87.364
- DriveStat 1 0.44443 8.9470 -87.287
- KidsEZR.CPR 1 0.50460 9.0071 -86.892
- KidsEZR.Liability 1 0.51785 9.0204 -86.805
- PF.TimeFlex 1 0.84184 9.3444 -84.723
- PF.EasyStart 1 1.01543 9.5180 -83.637
Step: AIC=-89.23
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Liability + KidsEZR.Rltnshp
Df Sum of Sq RSS AIC
- KidsEZR.Rltnshp 1 0.13927 8.7957 -90.293
- PF.Brand 1 0.15463 8.8111 -90.190
- PF.Autonomy 1 0.16935 8.8258 -90.092
- KidsEZR.Car 1 0.19779 8.8543 -89.902
- KidsEZR.Toys 1 0.20498 8.8615 -89.854
<none> 8.6565 -89.235
- PF.GeoFlex 1 0.33529 8.9918 -88.992
- KidsEZR.CPR 1 0.44433 9.1008 -88.281
- DriveStat 1 0.46121 9.1177 -88.172
- KidsEZR.Liability 1 0.49073 9.1472 -87.981
- PF.TimeFlex 1 0.74345 9.3999 -86.373
- PF.EasyStart 1 0.92099 9.5775 -85.269
Step: AIC=-90.29
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.Car + KidsEZR.CPR + KidsEZR.Toys +
KidsEZR.Liability
Df Sum of Sq RSS AIC
- KidsEZR.Car 1 0.07340 8.8691 -91.803
- KidsEZR.Toys 1 0.07368 8.8694 -91.801
- PF.Autonomy 1 0.18993 8.9857 -91.032
- PF.Brand 1 0.23174 9.0275 -90.759
<none> 8.7957 -90.293
- KidsEZR.CPR 1 0.30506 9.1008 -90.281
- KidsEZR.Liability 1 0.35313 9.1489 -89.971
- PF.GeoFlex 1 0.38628 9.1820 -89.757
- DriveStat 1 0.47252 9.2683 -89.206
- PF.TimeFlex 1 0.84660 9.6423 -86.871
- PF.EasyStart 1 0.93634 9.7321 -86.324
Step: AIC=-91.8
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.CPR + KidsEZR.Toys + KidsEZR.Liability
Df Sum of Sq RSS AIC
- KidsEZR.Toys 1 0.03464 8.9038 -93.573
- PF.Autonomy 1 0.14176 9.0109 -92.867
- KidsEZR.CPR 1 0.23388 9.1030 -92.267
- PF.Brand 1 0.26128 9.1304 -92.090
- KidsEZR.Liability 1 0.27990 9.1490 -91.969
<none> 8.8691 -91.803
- PF.GeoFlex 1 0.32857 9.1977 -91.656
- DriveStat 1 0.47506 9.3442 -90.724
- PF.TimeFlex 1 0.79483 9.6640 -88.739
- PF.EasyStart 1 0.95018 9.8193 -87.798
Step: AIC=-93.57
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Autonomy +
PF.Brand + PF.EasyStart + KidsEZR.CPR + KidsEZR.Liability
Df Sum of Sq RSS AIC
- PF.Autonomy 1 0.14039 9.0442 -94.650
- KidsEZR.CPR 1 0.22987 9.1336 -94.069
- PF.Brand 1 0.23578 9.1396 -94.031
- KidsEZR.Liability 1 0.24674 9.1505 -93.960
<none> 8.9038 -93.573
- PF.GeoFlex 1 0.31512 9.2189 -93.521
- DriveStat 1 0.52327 9.4270 -92.203
- PF.TimeFlex 1 0.78100 9.6848 -90.612
- PF.EasyStart 1 0.91662 9.8204 -89.791
Step: AIC=-94.65
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.Brand + PF.EasyStart +
KidsEZR.CPR + KidsEZR.Liability
Df Sum of Sq RSS AIC
- PF.Brand 1 0.11188 9.1561 -95.924
- KidsEZR.CPR 1 0.17583 9.2200 -95.514
- PF.GeoFlex 1 0.17957 9.2237 -95.490
<none> 9.0442 -94.650
- KidsEZR.Liability 1 0.35555 9.3997 -94.375
- DriveStat 1 0.49303 9.5372 -93.518
- PF.TimeFlex 1 0.72742 9.7716 -92.085
- PF.EasyStart 1 0.82052 9.8647 -91.526
Step: AIC=-95.92
ZumYN ~ DriveStat + PF.TimeFlex + PF.GeoFlex + PF.EasyStart +
KidsEZR.CPR + KidsEZR.Liability
Df Sum of Sq RSS AIC
- PF.GeoFlex 1 0.08722 9.2433 -97.365
- KidsEZR.CPR 1 0.26166 9.4177 -96.262
<none> 9.1561 -95.924
- KidsEZR.Liability 1 0.38497 9.5410 -95.494
- DriveStat 1 0.38738 9.5434 -95.479
- PF.TimeFlex 1 0.61640 9.7725 -94.080
- PF.EasyStart 1 0.74896 9.9050 -93.285
Step: AIC=-97.36
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart + KidsEZR.CPR +
KidsEZR.Liability
Df Sum of Sq RSS AIC
- KidsEZR.CPR 1 0.24500 9.4883 -97.821
- KidsEZR.Liability 1 0.31546 9.5587 -97.385
<none> 9.2433 -97.365
- DriveStat 1 0.37496 9.6182 -97.019
- PF.TimeFlex 1 0.55108 9.7944 -95.948
- PF.EasyStart 1 0.67357 9.9168 -95.215
Step: AIC=-97.82
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart + KidsEZR.Liability
Df Sum of Sq RSS AIC
- KidsEZR.Liability 1 0.25894 9.7472 -98.233
<none> 9.4883 -97.821
- DriveStat 1 0.36943 9.8577 -97.568
- PF.TimeFlex 1 0.57150 10.0598 -96.371
- PF.EasyStart 1 0.65098 10.1393 -95.906
Step: AIC=-98.23
ZumYN ~ DriveStat + PF.TimeFlex + PF.EasyStart
Df Sum of Sq RSS AIC
- DriveStat 1 0.28910 10.0363 -98.508
<none> 9.7472 -98.233
- PF.TimeFlex 1 0.56957 10.3168 -96.882
- PF.EasyStart 1 0.57015 10.3174 -96.879
Step: AIC=-98.51
ZumYN ~ PF.TimeFlex + PF.EasyStart
Df Sum of Sq RSS AIC
<none> 10.036 -98.508
- PF.TimeFlex 1 0.51885 10.555 -97.534
- PF.EasyStart 1 0.67155 10.708 -96.687
summary(model3)
Call:
lm(formula = ZumYN ~ PF.TimeFlex + PF.EasyStart, data = training)
Residuals:
Min 1Q Median 3Q Max
-0.8887 -0.2015 0.1587 0.2684 0.5007
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.18338 0.18177 6.510 2.22e-08 ***
PF.TimeFlex -0.07742 0.04550 -1.701 0.0944 .
PF.EasyStart -0.06240 0.03224 -1.936 0.0580 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4233 on 56 degrees of freedom
Multiple R-squared: 0.1028, Adjusted R-squared: 0.07077
F-statistic: 3.209 on 2 and 56 DF, p-value: 0.04794
# Score model3 on the validation set and report the root-mean-squared error
# of its predictions against the observed ZumYN values.
prediction3 <- predict(model3, newdata = validation)
validationErrors3 <- validation$ZumYN - prediction3
validationRMSE3 <- sqrt(mean(validationErrors3^2))
cat('Validation RMSE = ', validationRMSE3, '\n')
Validation RMSE = 0.3006132
#--------------------------------------------------------------------------------------------------------#

#-------------------------------------------------------------------------------------------------------#
# Tree
# Fit a tree of ZumYN on every other column of the training set, draw it,
# then score it on the validation set.
model4 <- rpart(ZumYN ~ ., training)
prp(model4, type = 1, extra = 1, fallen.leaves = TRUE, box.col = rainbow(80),
    varlen = 0, faclen = 0, digits = 2)
prediction4 <- predict(model4, newdata = validation)
validationErrors4 <- validation$ZumYN - prediction4
validationRMSE4 <- sqrt(mean(validationErrors4^2))
cat('Validation RMSE = ', validationRMSE4, '\n')
Validation RMSE = 0.367987
#-------------------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------------------#
# Gradient Boosting
# Load gbm with library() rather than require(): library() stops with an error
# when the package is missing, whereas require() only returns FALSE and lets
# the script fail later at the gbm() call.
library(gbm)
# Fit a boosted model of ZumYN on every other column of the training set,
# using Bernoulli loss, 5 trees of depth 3, shrinkage 0.1, and 5-fold
# cross-validation.
model5 <- gbm(ZumYN ~ .,
              data = training,
              cv.folds = 5,
              distribution = "bernoulli",
              n.trees = 5,
              shrinkage = 0.1,
              interaction.depth = 3)
variable 2: Parent has no variation.
model5
gbm(formula = ZumYN ~ ., distribution = "bernoulli", data = training,
n.trees = 5, interaction.depth = 3, shrinkage = 0.1, cv.folds = 5)
A gradient boosted model with bernoulli loss function.
5 iterations were performed.
The best cross-validation iteration was 5.
There were 20 predictors of which 4 had non-zero influence.
summary(model5)

# Score the boosted model on the validation set.
# For a bernoulli gbm, predict() defaults to type = "link" and returns
# log-odds. Request type = "response" so the predictions are probabilities on
# the same 0-1 scale as ZumYN; otherwise the RMSE below is not comparable with
# the RMSEs of the other models.
prediction5 <- predict(model5, validation, n.trees = 5, type = "response")
validationErrors5 <- validation$ZumYN - prediction5
validationRMSE5 <- sqrt(mean(validationErrors5^2))
cat('Validation RMSE = ', validationRMSE5, '\n')
Validation RMSE = 0.4593657
#-------------------------------------------------------------------------------------------------------#
#-------------------------------------------------------------------------------------------------------#
# Random Forest
model6 = randomForest(ZumYN ~ ., training)
The response has five or fewer unique values. Are you sure you want to do regression?
summary(model6)
Length Class Mode
call 3 -none- call
type 1 -none- character
predicted 59 -none- numeric
mse 500 -none- numeric
rsq 500 -none- numeric
oob.times 59 -none- numeric
importance 20 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 59 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
# Score the random forest on the validation set and report its RMSE against
# the observed ZumYN values.
prediction6 <- predict(model6, newdata = validation)
validationErrors6 <- validation$ZumYN - prediction6
validationRMSE6 <- sqrt(mean(validationErrors6^2))
cat('Validation RMSE = ', validationRMSE6, '\n')
Validation RMSE = 0.2796911
# Random Forest, by principle, randomizes the variable selection during each tree split, so it's not as
# prone to overfitting as some of the other models.
#########################################################################################################
# STEP 10: TEST ON THE TEST DATASET
# Check out the different RMSEs.
validationRMSE1
[1] 0.2963254
validationRMSE2
[1] 0.3884708
validationRMSE3
[1] 0.3006132
validationRMSE4
[1] 0.367987
validationRMSE5
[1] 0.4593657
validationRMSE6
[1] 0.2796911
# We choose model 6 because it has consistently proven to be the better predictive model, given that it
# typically generates a lower RMSE than the other models while maintaining a low risk for overfitting.
# Score the chosen model (model6) on the held-out test set.
predictiontest <- predict(model6, newdata = test)
# Append the model predictions to the test set under their final column name.
# Naming the column through `var` avoids a separate rename() call, whose
# argument order differs between plyr (old = "new") and dplyr (new = old) and
# therefore depends on which package happens to mask the other.
test <- test %>% add_predictions(model6, var = "Predicted_Outcome")
test[,21:22] # View only the last two columns
# Test RMSE: root-mean-squared difference between observed and predicted ZumYN.
testErrors <- test$ZumYN - predictiontest
testRMSE <- sqrt(mean(testErrors^2))
cat('Test RMSE = ', testRMSE, '\n')
Test RMSE = 0.5347391
#########################################################################################################

#########################################################################################################
# STEP 11: CONDUCT DIMENSIONALITY REDUCTION WITH PCA
# Run PCA on the explanatory variables.
Ride_Share9 <- Ride_Share8[,1:20]
PCA1 = principal(Ride_Share9, rotate="none") # Use as much data as possible to come up with the PCs
# How many components (factors) should we focus on? Consider the eigenvalues to answer this question.
# Keep the components whose eigenvalues are greater than one. A component with an eigenvalue below one
# explains less variation than a single standardized original variable, so the cost of the additional
# complexity outweighs the value of the additional insight from adding that dimension.
# Let's plot the eigenvalues to make this decision easy. The second command below adds to the plot a
# dashed horizontal line at y = 1 (i.e., a line with intercept 1 and slope 0). If you don't see big
# eigenvalues in your plot, that tells you that the PCA is not really useful.
# Scree plot: one point per principal component, joined by lines, with a
# dashed horizontal reference line at an eigenvalue of 1.
plot(PCA1$values, type = "b", pch = 19, col = "blue",
     main = "Eigenvalues", xlab = "Number of Principal Components",
     ylab = "Eigenvalues")
abline(h = 1, lty = "dashed")
# Eigenvalue of specific PCAx / Total eigenvalues = Percent of variation in the data explained by PCAx
# As is apparent, the number of principal components is equivalent to the number of explanatory
# variables in the data frame.
# There are nine factors whose eigenvalues are greater than 1. Rerun PCA to keep those components.
PCA2 = principal(Ride_Share9, nfactors = 9, rotate="none")
PCA2$values # Display only the eigenvalues
[1] 2.73388824 1.96484372 1.66366342 1.48147389 1.43070799 1.31491913 1.25265100 1.09749895
[9] 1.04596955 0.91676106 0.87070943 0.81229907 0.71068267 0.66952589 0.58773444 0.48813952
[17] 0.46094309 0.41679483 0.05331187 0.02748224
#########################################################################################################
#########################################################################################################
# STEP 12: INTERPRET THE PRINCIPAL COMPONENTS
# What do the selected components stand for? We need to interpret them. Based on the loadings, we can
# appropriately name each column (a rather subjective process). The loadings give the weights of the
# variables in each of the principal components. We can see the loadings using the following code:
fa.sort(PCA2$loadings) # The loadings tell you how strong the relationship is between each variable
Loadings:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
DriveStat 0.624 0.171 0.186 0.215 -0.138 0.252
KidsEZR.Cln 0.617 0.157 -0.345 -0.146 -0.201 -0.246
KidsEZR.Liability -0.607 0.176 -0.171 -0.526 0.177
KidsEZR.Toys 0.601 0.447 0.154 0.103 -0.218
PF.Brand 0.584 0.156 -0.470 0.144 0.178 0.194
KidsEZR.Rltnshp -0.529 0.233 -0.122 -0.237 0.318 0.306 0.352 0.427
PF.Comp -0.396 -0.260 -0.123 0.256 0.321 0.373 -0.370
KidsEZR.Car 0.138 -0.730 0.122 -0.211 -0.349 0.190
PF.EasyStart -0.330 0.587 0.306 0.168 -0.282 0.165 -0.160
PF.GeoFlex 0.237 -0.557 0.329 0.123 -0.218 -0.309 0.267
AnnInc 0.277 0.502 -0.101 -0.229 0.110 -0.209 0.349
PF.TimeFlex 0.190 0.277 -0.576 0.237 -0.331 -0.186 -0.409
SocialMed 0.301 -0.175 0.573 -0.229 0.368 -0.161
EmployStat 0.504 0.308 0.147 -0.473
KidsEZR.CPR 0.131 -0.111 0.367 0.451 0.197 -0.238 0.383 -0.431
Age 0.426 -0.134 0.152 0.481 0.437 0.122
Parent 0.192 -0.171 0.417 0.147 -0.198 0.422 -0.205 0.227 0.132
PF.Autonomy 0.112 -0.316 -0.343 -0.418 -0.249 0.542 0.110
Race 0.341 0.343 -0.373 0.290 0.310 0.434
Sex -0.284 0.206 0.363 -0.238 -0.233 0.428
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
SS loadings 2.734 1.965 1.664 1.481 1.431 1.315 1.253 1.097 1.046
Proportion Var 0.137 0.098 0.083 0.074 0.072 0.066 0.063 0.055 0.052
Cumulative Var 0.137 0.235 0.318 0.392 0.464 0.529 0.592 0.647 0.699
# (constituting the component) with the component itself.
# If any of the original columns are not represented in the nine factors, we would see the communality
# for that column to be very low.
round(data.frame(PCA2$communality),digits=1)
#########################################################################################################
#########################################################################################################
# STEP 13: TRANSFORM THE DATA USING THE PRINCIPAL COMPONENTS
# After doing PCA and narrowing down to the key components, our data is transformed from 20 columns to
# 9 columns. Moreover, now that we have interpreted the new columns, we need to fill them up. For every
# observation, we need to know its value for PC1 through PC9.
# Extract the component scores (one row per observation, one column per
# retained component). The element is spelled out in full as `scores`;
# `PCA2$score` only worked through `$` partial matching on the list.
score1 <- data.frame(PCA2$scores)
# Take a look at the transformed data.
head(score1)
# Notice the lack of correlation among the different components.
allcorrelations2 <- cor(score1)
corrplot(allcorrelations2)

# Notice the mean and standard deviation for each component.
round(mean(score1[,1]),1) # Mean of principal component one.
[1] 0
round(sd(score1[,1]),1) # Standard deviation of principal component one.
[1] 1
round(mean(score1[,2]),1) # Mean of principal component two.
[1] 0
round(sd(score1[,2]),1) # Standard deviation of principal component two.
[1] 1
# More efficient way to find mean and sd of each principal component:
round(apply(score1, 2, FUN = mean),1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
0 0 0 0 0 0 0 0 0
round(apply(score1, 2, FUN = sd),1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
1 1 1 1 1 1 1 1 1
# Even more efficient way to find the mean of each principal component:
round(colMeans(score1), 1)
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
0 0 0 0 0 0 0 0 0
#########################################################################################################
# STEP 14: PERFORM REGRESSION USING PRINCIPAL COMPONENTS
# Regress the dependent variable on the components. Furthermore, we can interpret the coefficients
# without worrying about multicollinearity.
reg = lm(Ride_Share8$ZumYN ~ ., data = score1)
summary(reg)
Call:
lm(formula = Ride_Share8$ZumYN ~ ., data = score1)
Residuals:
Min 1Q Median 3Q Max
-0.91321 -0.00112 0.18567 0.27277 0.52012
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.765306 0.043609 17.549 <2e-16 ***
PC1 -0.022015 0.043833 -0.502 0.617
PC2 -0.053372 0.043833 -1.218 0.227
PC3 -0.032021 0.043833 -0.731 0.467
PC4 -0.010944 0.043833 -0.250 0.803
PC5 0.071028 0.043833 1.620 0.109
PC6 0.011533 0.043833 0.263 0.793
PC7 0.005747 0.043833 0.131 0.896
PC8 0.030861 0.043833 0.704 0.483
PC9 0.041794 0.043833 0.953 0.343
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4317 on 88 degrees of freedom
Multiple R-squared: 0.06827, Adjusted R-squared: -0.02702
F-statistic: 0.7164 on 9 and 88 DF, p-value: 0.6925
# In the lm command, a "." tells R to pick all the columns in the table specified after "data = ", in
# this case, the score table. So, in the above command, it regresses ZumYN on all nine components.
#########################################################################################################
# STEP 15: IMPROVE INTERPRETATION OF PRINCIPAL COMPONENTS
# We can take additional steps to better align our components with the original columns so they can be
# interpreted more easily.
# Rotating the Components
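# For this purpose, we use an option in PCA called the "varimax" rotation. This rotation maximizes the
# variance of the squared loadings within each component, so each variable tends to load strongly on
# only a few components. Applying this rotation will not change the total amount of variation explained
# by the retained components, although it redistributes that variation among them.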
# Rotated PCA
# Command to run a new PCA with rotation.
PCA3 = principal(Ride_Share9, nfactors = 9, rotate="varimax")
# Now, consider the loadings to interpret the components.
fa.sort(PCA3$loadings)
Loadings:
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9
Age 0.739 0.143 -0.171 0.182 0.140
KidsEZR.Liability -0.704 -0.233 0.258 0.137 -0.212 0.256
PF.Brand 0.586 -0.196 0.117 -0.273 -0.162 -0.265 0.319
DriveStat 0.539 0.272 0.317 -0.118 0.189 0.235 0.105
PF.GeoFlex 0.763 0.146 -0.216 0.143 0.161
Parent 0.200 0.606 0.201 0.295 -0.127 -0.136
KidsEZR.Car 0.579 -0.176 -0.132 -0.391 -0.331 -0.110 -0.336
PF.Comp -0.799 0.140 0.117 0.143
KidsEZR.Toys 0.284 0.576 -0.232 0.210 0.274 0.142 0.201
KidsEZR.Rltnshp -0.412 -0.179 0.727 -0.118 -0.328 -0.190
Race 0.111 0.281 0.660 -0.210 0.103 0.185 0.368
KidsEZR.Cln 0.312 -0.137 0.283 -0.640 -0.105 0.174
PF.Autonomy 0.121 -0.859 -0.139
PF.TimeFlex -0.111 -0.120 0.460 0.506 -0.458 -0.204 -0.255
SocialMed 0.122 0.800
AnnInc 0.136 -0.132 0.726
PF.EasyStart -0.249 -0.275 0.230 0.287 0.604 -0.257
KidsEZR.CPR 0.129 0.875
EmployStat 0.207 0.410 -0.392 0.478
Sex -0.132 0.734
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9
SS loadings 1.999 1.722 1.695 1.562 1.471 1.466 1.422 1.376 1.272
Proportion Var 0.100 0.086 0.085 0.078 0.074 0.073 0.071 0.069 0.064
Cumulative Var 0.100 0.186 0.271 0.349 0.422 0.496 0.567 0.636 0.699
# score2 will be our new scores, which is the transformed data based on the
# rotated components. The element is spelled out in full as `scores`;
# `PCA3$score` only worked through `$` partial matching on the list.
score2 <- data.frame(PCA3$scores)
head(score2)
# Efficient way to find mean and sd of each rotated component:
round(apply(score2, 2, FUN = mean),1)
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9
0 0 0 0 0 0 0 0 0
round(apply(score2, 2, FUN = sd),1)
RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9
1 1 1 1 1 1 1 1 1
# Regress the dependent variable on the rotated components.
reg = lm(Ride_Share8$ZumYN ~ ., data = score2)
summary(reg)
Call:
lm(formula = Ride_Share8$ZumYN ~ ., data = score2)
Residuals:
Min 1Q Median 3Q Max
-0.91321 -0.00112 0.18567 0.27277 0.52012
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.76531 0.04361 17.549 <2e-16 ***
RC1 0.02231 0.04383 0.509 0.612
RC2 0.01010 0.04383 0.230 0.818
RC6 -0.04343 0.04383 -0.991 0.324
RC5 0.05719 0.04383 1.305 0.195
RC7 -0.00693 0.04383 -0.158 0.875
RC4 -0.02763 0.04383 -0.630 0.530
RC3 -0.04746 0.04383 -1.083 0.282
RC8 -0.02230 0.04383 -0.509 0.612
RC9 0.05541 0.04383 1.264 0.210
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4317 on 88 degrees of freedom
Multiple R-squared: 0.06827, Adjusted R-squared: -0.02702
F-statistic: 0.7164 on 9 and 88 DF, p-value: 0.6925
#########################################################################################################
# STEP 16: RUN CLUSTER ANALYSIS ON ROTATED COMPONENTS
# Conduct a cluster analysis based on the factor scores. First create the distance.
# Pairwise Euclidean distances between observations in rotated-component space.
d1 <- dist(score2, method = "euclidean")
# Run the command to conduct hierarchical cluster analysis.
# "ward.D" is the current name of the algorithm formerly called "ward", so the
# clustering is unchanged and the rename message is no longer triggered.
# NOTE(review): hclust also offers "ward.D2"; switching to it would change the
# clusters, so it is not done here.
hc <- hclust(d1, method = "ward.D")
The "ward" method has been renamed to "ward.D"; note new "ward.D2"
# Check out the height at which the clusters were merged, given that height is the measure of
# dissimilarity within clusters. Naturally, larger clusters will merge at a larger height.
# hc, the output from hclust command, has the height stored in it. Reverse it and look at the first 10
# elements.
plot(rev(hc$height)[1:10], type = "b", col = "blue", xlab = "Number of Clusters")

# It looks like going from three clusters to two clusters took a big compromise, compared to other
# mergers of clusters. This tells us that the clusters in a three-cluster solution are very dissimilar.
# The more clusters you choose, the more the clusters seem similar to one another.
# Visualize how the algorithm progressed.
plot(cut(as.dendrogram(hc),5)$upper, main = "Dendrogram for Cluster Analysis (Zoomed-In)",
leaflab = "none")

# Attach the cluster identities to the original data frame.
Ride_Share9$clus = cutree(hc,3) # Cut tree at three clusters.
# How many observations fall into each of the clusters?
table(Ride_Share9$clus)
1 2 3
31 34 33
# Inspect cluster means.
# Mean of every rotated component within each cluster, rounded to two decimals.
# aggregate() places the cluster id in the first column (Group.1).
clusterMeans <- aggregate(score2, by = list(Ride_Share9$clus), FUN = mean)
clusterMeans <- round(clusterMeans, digits = 2)
clusterMeans
# Consider the difference in the means for each RC to determine for which RCs you should build a
# histogram. Focus on the RCs with the biggest difference.
# Each row of the result is the difference between consecutive clusters
# (2 minus 1, then 3 minus 2); the Group.1 column is included in the output.
apply(clusterMeans, 2, diff)
Group.1 RC1 RC2 RC6 RC5 RC7 RC4 RC3 RC8 RC9
[1,] 1 -0.18 -0.01 0.46 0.07 0.92 -0.67 -0.51 1.38 -0.21
[2,] 1 1.14 -0.03 -0.79 -0.62 -0.74 -0.15 0.71 -0.04 -0.05
# Given the differences, choose RC1, RC6, RC7, and RC8.

#########################################################################################################
# STEP 17: DISTRIBUTION OF THE ROTATED COMPONENTS ACROSS THE CLUSTERS
# Histograms with overlaid distributions
# Semi-transparent fill colors for clusters 1, 2 and 3 (red, blue, green).
clusterColors <- c(rgb(1, 0, 0, 0.5), rgb(0, 0, 1, 0.5), rgb(0, 1, 0, 0.5))

# Overlay the per-cluster histograms of one rotated component on shared axes.
#   component  Name of a column of score2 (e.g. "RC1"); also used for the
#              x-axis label and the plot title.
#   xlim, ylim Axis limits passed to the first hist() call.
# Reads score2 and Ride_Share9$clus from the global environment.
plotClusterHist <- function(component, xlim, ylim) {
  values <- score2[[component]]
  hist(values[Ride_Share9$clus == 1], xlab = component,
       ylab = "# of Observations", col = clusterColors[1],
       xlim = xlim, ylim = ylim, main = paste("Histogram", component))
  # add = TRUE draws onto the existing plot; TRUE is spelled out because the
  # shorthand T is an ordinary variable that can be reassigned.
  hist(values[Ride_Share9$clus == 2], col = clusterColors[2], add = TRUE)
  hist(values[Ride_Share9$clus == 3], col = clusterColors[3], add = TRUE)
  legend("topright", c("Cluster 1", "Cluster 2", "Cluster 3"),
         col = clusterColors, lwd = 10)
}

plotClusterHist("RC1", xlim = c(-3, 3), ylim = c(0, 15))

plotClusterHist("RC6", xlim = c(-5, 5), ylim = c(0, 10))

plotClusterHist("RC7", xlim = c(-3, 4), ylim = c(0, 20))

plotClusterHist("RC8", xlim = c(-3, 4), ylim = c(0, 15))

# Profile the clusters on selected original variables. Each call returns one
# row per cluster id (Group.1) with the mean of the variable in column x.
# Calculate the average Age across the various clusters.
aggregate(Ride_Share9$Age, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average AnnInc across the various clusters.
aggregate(Ride_Share9$AnnInc, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average PF.Comp across the various clusters.
aggregate(Ride_Share9$PF.Comp, by = list(Ride_Share9$clus), FUN = mean)
# Calculate the average EmployStat across the various clusters.
aggregate(Ride_Share9$EmployStat, by = list(Ride_Share9$clus), FUN = mean)
#########################################################################################################
# STEP 18: MEASURE TOTAL TIME
# The final phase is to report the time it took to run the code.
# Read the elapsed-time clock again (third element of proc.time()) and report
# the difference from startTime, which was recorded in STEP 1.
endTime <- proc.time()[3]
cat("This code took ", endTime - startTime, " seconds\n")
This code took 270.86 seconds
