Get Data

library(readr)
# Read the taxi sample CSV into `df`. No column types are supplied, so readr
# guesses them and reports the specification it used.
df <- read_csv(
  file = "~/Documents/School files/MS Program Spring 2016/Classes 2017/Fall 2017/CSE 891/Taxi Project/Data/TaxiData Offical/Yellow2016Sample.csv"
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   VendorID = col_integer(),
##   tpep_pickup_datetime = col_datetime(format = ""),
##   pickup_date = col_character(),
##   pickup_time = col_time(format = ""),
##   pickup_weekday = col_character(),
##   pickup_timeofday = col_character(),
##   tpep_dropoff_datetime = col_datetime(format = ""),
##   dropoff_date = col_character(),
##   dropoff_time = col_time(format = ""),
##   dropoff_weekday = col_character(),
##   dropoff_timeofday = col_character(),
##   passenger_count = col_integer(),
##   RatecodeID = col_integer(),
##   store_and_fwd_flag = col_character(),
##   payment_type = col_integer()
## )
## See spec(...) for full column specifications.

Column types

# List the column names of the loaded data.
names(df)
##  [1] "VendorID"              "tpep_pickup_datetime" 
##  [3] "pickup_date"           "pickup_time"          
##  [5] "pickup_weekday"        "pickup_timeofday"     
##  [7] "tpep_dropoff_datetime" "dropoff_date"         
##  [9] "dropoff_time"          "dropoff_weekday"      
## [11] "dropoff_timeofday"     "passenger_count"      
## [13] "trip_distance"         "pickup_longitude"     
## [15] "pickup_latitude"       "RatecodeID"           
## [17] "store_and_fwd_flag"    "dropoff_longitude"    
## [19] "dropoff_latitude"      "payment_type"         
## [21] "fare_amount"           "extra"                
## [23] "mta_tax"               "tip_amount"           
## [25] "tolls_amount"          "improvement_surcharge"
## [27] "total_amount"
# Show the class vector of every column. Some columns carry two classes
# (e.g. "POSIXct" "POSIXt"), so the result cannot be simplified to a vector;
# lapply() always returns a named list regardless of the column mix, whereas
# sapply() would silently switch to a character vector for other inputs.
lapply(df, class)
## $VendorID
## [1] "integer"
## 
## $tpep_pickup_datetime
## [1] "POSIXct" "POSIXt" 
## 
## $pickup_date
## [1] "character"
## 
## $pickup_time
## [1] "hms"      "difftime"
## 
## $pickup_weekday
## [1] "character"
## 
## $pickup_timeofday
## [1] "character"
## 
## $tpep_dropoff_datetime
## [1] "POSIXct" "POSIXt" 
## 
## $dropoff_date
## [1] "character"
## 
## $dropoff_time
## [1] "hms"      "difftime"
## 
## $dropoff_weekday
## [1] "character"
## 
## $dropoff_timeofday
## [1] "character"
## 
## $passenger_count
## [1] "integer"
## 
## $trip_distance
## [1] "numeric"
## 
## $pickup_longitude
## [1] "numeric"
## 
## $pickup_latitude
## [1] "numeric"
## 
## $RatecodeID
## [1] "integer"
## 
## $store_and_fwd_flag
## [1] "character"
## 
## $dropoff_longitude
## [1] "numeric"
## 
## $dropoff_latitude
## [1] "numeric"
## 
## $payment_type
## [1] "integer"
## 
## $fare_amount
## [1] "numeric"
## 
## $extra
## [1] "numeric"
## 
## $mta_tax
## [1] "numeric"
## 
## $tip_amount
## [1] "numeric"
## 
## $tolls_amount
## [1] "numeric"
## 
## $improvement_surcharge
## [1] "numeric"
## 
## $total_amount
## [1] "numeric"

Find correlated columns

# Keep only the numeric (integer or double) columns of `df`.
# vapply() with a logical(1) template guarantees a logical mask, even when
# `df` has no columns (sapply() would return an empty list in that case).
num_data <- df[, vapply(df, is.numeric, logical(1))]

Correlation matrix

# Correlation between every pair of numeric columns (cor()'s default method),
# computed on rows with no missing values and rounded to two decimals.
# NOTE(review): the output shows a -1.00 correlation between longitude and
# latitude, which may mean a few extreme coordinate values dominate - confirm.
cormat <- round(
  cor(x = num_data, use = "complete.obs"),
  digits = 2
)
# Preview the first rows of the matrix.
head(cormat)
##                  VendorID passenger_count trip_distance pickup_longitude
## VendorID             1.00            0.29          0.02            -0.05
## passenger_count      0.29            1.00          0.01            -0.02
## trip_distance        0.02            0.01          1.00            -0.01
## pickup_longitude    -0.05           -0.02         -0.01             1.00
## pickup_latitude      0.05            0.02          0.01            -1.00
## RatecodeID           0.00           -0.01          0.22             0.04
##                  pickup_latitude RatecodeID dropoff_longitude
## VendorID                    0.05       0.00             -0.05
## passenger_count             0.02      -0.01             -0.02
## trip_distance               0.01       0.22             -0.01
## pickup_longitude           -1.00       0.04              0.88
## pickup_latitude             1.00      -0.04             -0.88
## RatecodeID                 -0.04       1.00              0.05
##                  dropoff_latitude payment_type fare_amount extra mta_tax
## VendorID                     0.05        -0.01        0.01 -0.01    0.00
## passenger_count              0.02         0.01        0.01 -0.01    0.00
## trip_distance                0.01        -0.06        0.72 -0.05   -0.03
## pickup_longitude            -0.88         0.00        0.02  0.00    0.00
## pickup_latitude              0.88         0.00       -0.02  0.00    0.00
## RatecodeID                  -0.05         0.00        0.24 -0.06    0.05
##                  tip_amount tolls_amount improvement_surcharge
## VendorID               0.01         0.01                 -0.02
## passenger_count        0.00         0.01                  0.00
## trip_distance          0.53         0.45                  0.02
## pickup_longitude       0.01         0.00                 -0.02
## pickup_latitude       -0.01         0.00                  0.02
## RatecodeID             0.16         0.17                 -0.07
##                  total_amount
## VendorID                 0.01
## passenger_count          0.01
## trip_distance            0.77
## pickup_longitude         0.01
## pickup_latitude         -0.01
## RatecodeID               0.25

Melt the correlation matrix

library(reshape2)
# Reshape the square correlation matrix into long form: one row per
# (Var1, Var2) pair with its correlation in `value`.
MELTcormat <- reshape2::melt(data = cormat)

# Inspect the structure of the long-form result.
str(object = MELTcormat)
## 'data.frame':    256 obs. of  3 variables:
##  $ Var1 : Factor w/ 16 levels "VendorID","passenger_count",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Var2 : Factor w/ 16 levels "VendorID","passenger_count",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ value: num  1 0.29 0.02 -0.05 0.05 0 -0.05 0.05 -0.01 0.01 ...
# Drop the diagonal, i.e. rows pairing a variable with itself.
MELTcormat2 <- MELTcormat[with(MELTcormat, Var1 != Var2), ]
# Rank the remaining pairs from the most positive to the most negative
# correlation. The matrix is symmetric, so each pair is listed twice:
# once as (A, B) and once as (B, A).
corVarOrderedDESC <- MELTcormat2[order(-MELTcormat2[["value"]]), ]
print(corVarOrderedDESC)
##                      Var1                  Var2 value
## 160          total_amount           fare_amount  0.98
## 250           fare_amount          total_amount  0.98
## 55      dropoff_longitude      pickup_longitude  0.88
## 72       dropoff_latitude       pickup_latitude  0.88
## 100      pickup_longitude     dropoff_longitude  0.88
## 117       pickup_latitude      dropoff_latitude  0.88
## 48           total_amount         trip_distance  0.77
## 243         trip_distance          total_amount  0.77
## 42            fare_amount         trip_distance  0.72
## 147         trip_distance           fare_amount  0.72
## 208          total_amount            tip_amount  0.59
## 253            tip_amount          total_amount  0.59
## 45             tip_amount         trip_distance  0.53
## 195         trip_distance            tip_amount  0.53
## 224          total_amount          tolls_amount  0.47
## 254          tolls_amount          total_amount  0.47
## 46           tolls_amount         trip_distance  0.45
## 157            tip_amount           fare_amount  0.45
## 202           fare_amount            tip_amount  0.45
## 211         trip_distance          tolls_amount  0.45
## 158          tolls_amount           fare_amount  0.35
## 218           fare_amount          tolls_amount  0.35
## 206          tolls_amount            tip_amount  0.31
## 221            tip_amount          tolls_amount  0.31
## 2         passenger_count              VendorID  0.29
## 17               VendorID       passenger_count  0.29
## 96           total_amount            RatecodeID  0.25
## 246            RatecodeID          total_amount  0.25
## 90            fare_amount            RatecodeID  0.24
## 150            RatecodeID           fare_amount  0.24
## 38             RatecodeID         trip_distance  0.22
## 83          trip_distance            RatecodeID  0.22
## 156               mta_tax           fare_amount  0.19
## 186           fare_amount               mta_tax  0.19
## 94           tolls_amount            RatecodeID  0.17
## 214            RatecodeID          tolls_amount  0.17
## 93             tip_amount            RatecodeID  0.16
## 191 improvement_surcharge               mta_tax  0.16
## 192          total_amount               mta_tax  0.16
## 198            RatecodeID            tip_amount  0.16
## 236               mta_tax improvement_surcharge  0.16
## 252               mta_tax          total_amount  0.16
## 5         pickup_latitude              VendorID  0.05
## 8        dropoff_latitude              VendorID  0.05
## 65               VendorID       pickup_latitude  0.05
## 87      dropoff_longitude            RatecodeID  0.05
## 92                mta_tax            RatecodeID  0.05
## 102            RatecodeID     dropoff_longitude  0.05
## 113              VendorID      dropoff_latitude  0.05
## 182            RatecodeID               mta_tax  0.05
## 54             RatecodeID      pickup_longitude  0.04
## 84       pickup_longitude            RatecodeID  0.04
## 175 improvement_surcharge                 extra  0.04
## 235                 extra improvement_surcharge  0.04
## 3           trip_distance              VendorID  0.02
## 21        pickup_latitude       passenger_count  0.02
## 24       dropoff_latitude       passenger_count  0.02
## 33               VendorID         trip_distance  0.02
## 47  improvement_surcharge         trip_distance  0.02
## 58            fare_amount      pickup_longitude  0.02
## 66        passenger_count       pickup_latitude  0.02
## 79  improvement_surcharge       pickup_latitude  0.02
## 114       passenger_count      dropoff_latitude  0.02
## 127 improvement_surcharge      dropoff_latitude  0.02
## 148      pickup_longitude           fare_amount  0.02
## 159 improvement_surcharge           fare_amount  0.02
## 227         trip_distance improvement_surcharge  0.02
## 229       pickup_latitude improvement_surcharge  0.02
## 232      dropoff_latitude improvement_surcharge  0.02
## 234           fare_amount improvement_surcharge  0.02
## 240          total_amount improvement_surcharge  0.02
## 255 improvement_surcharge          total_amount  0.02
## 10            fare_amount              VendorID  0.01
## 13             tip_amount              VendorID  0.01
## 14           tolls_amount              VendorID  0.01
## 16           total_amount              VendorID  0.01
## 19          trip_distance       passenger_count  0.01
## 25           payment_type       passenger_count  0.01
## 26            fare_amount       passenger_count  0.01
## 30           tolls_amount       passenger_count  0.01
## 32           total_amount       passenger_count  0.01
## 34        passenger_count         trip_distance  0.01
## 37        pickup_latitude         trip_distance  0.01
## 40       dropoff_latitude         trip_distance  0.01
## 61             tip_amount      pickup_longitude  0.01
## 64           total_amount      pickup_longitude  0.01
## 67          trip_distance       pickup_latitude  0.01
## 105          payment_type     dropoff_longitude  0.01
## 106           fare_amount     dropoff_longitude  0.01
## 112          total_amount     dropoff_longitude  0.01
## 115         trip_distance      dropoff_latitude  0.01
## 130       passenger_count          payment_type  0.01
## 135     dropoff_longitude          payment_type  0.01
## 145              VendorID           fare_amount  0.01
## 146       passenger_count           fare_amount  0.01
## 151     dropoff_longitude           fare_amount  0.01
## 193              VendorID            tip_amount  0.01
## 196      pickup_longitude            tip_amount  0.01
## 207 improvement_surcharge            tip_amount  0.01
## 209              VendorID          tolls_amount  0.01
## 210       passenger_count          tolls_amount  0.01
## 237            tip_amount improvement_surcharge  0.01
## 241              VendorID          total_amount  0.01
## 242       passenger_count          total_amount  0.01
## 244      pickup_longitude          total_amount  0.01
## 247     dropoff_longitude          total_amount  0.01
## 6              RatecodeID              VendorID  0.00
## 12                mta_tax              VendorID  0.00
## 28                mta_tax       passenger_count  0.00
## 29             tip_amount       passenger_count  0.00
## 31  improvement_surcharge       passenger_count  0.00
## 57           payment_type      pickup_longitude  0.00
## 59                  extra      pickup_longitude  0.00
## 60                mta_tax      pickup_longitude  0.00
## 62           tolls_amount      pickup_longitude  0.00
## 73           payment_type       pickup_latitude  0.00
## 75                  extra       pickup_latitude  0.00
## 76                mta_tax       pickup_latitude  0.00
## 78           tolls_amount       pickup_latitude  0.00
## 81               VendorID            RatecodeID  0.00
## 89           payment_type            RatecodeID  0.00
## 107                 extra     dropoff_longitude  0.00
## 108               mta_tax     dropoff_longitude  0.00
## 109            tip_amount     dropoff_longitude  0.00
## 110          tolls_amount     dropoff_longitude  0.00
## 123                 extra      dropoff_latitude  0.00
## 124               mta_tax      dropoff_latitude  0.00
## 125            tip_amount      dropoff_latitude  0.00
## 126          tolls_amount      dropoff_latitude  0.00
## 132      pickup_longitude          payment_type  0.00
## 133       pickup_latitude          payment_type  0.00
## 134            RatecodeID          payment_type  0.00
## 164      pickup_longitude                 extra  0.00
## 165       pickup_latitude                 extra  0.00
## 167     dropoff_longitude                 extra  0.00
## 168      dropoff_latitude                 extra  0.00
## 177              VendorID               mta_tax  0.00
## 178       passenger_count               mta_tax  0.00
## 180      pickup_longitude               mta_tax  0.00
## 181       pickup_latitude               mta_tax  0.00
## 183     dropoff_longitude               mta_tax  0.00
## 184      dropoff_latitude               mta_tax  0.00
## 194       passenger_count            tip_amount  0.00
## 199     dropoff_longitude            tip_amount  0.00
## 200      dropoff_latitude            tip_amount  0.00
## 212      pickup_longitude          tolls_amount  0.00
## 213       pickup_latitude          tolls_amount  0.00
## 215     dropoff_longitude          tolls_amount  0.00
## 216      dropoff_latitude          tolls_amount  0.00
## 223 improvement_surcharge          tolls_amount  0.00
## 226       passenger_count improvement_surcharge  0.00
## 238          tolls_amount improvement_surcharge  0.00
## 9            payment_type              VendorID -0.01
## 11                  extra              VendorID -0.01
## 22             RatecodeID       passenger_count -0.01
## 27                  extra       passenger_count -0.01
## 36       pickup_longitude         trip_distance -0.01
## 39      dropoff_longitude         trip_distance -0.01
## 51          trip_distance      pickup_longitude -0.01
## 77             tip_amount       pickup_latitude -0.01
## 80           total_amount       pickup_latitude -0.01
## 82        passenger_count            RatecodeID -0.01
## 99          trip_distance     dropoff_longitude -0.01
## 121          payment_type      dropoff_latitude -0.01
## 122           fare_amount      dropoff_latitude -0.01
## 128          total_amount      dropoff_latitude -0.01
## 129              VendorID          payment_type -0.01
## 136      dropoff_latitude          payment_type -0.01
## 152      dropoff_latitude           fare_amount -0.01
## 161              VendorID                 extra -0.01
## 162       passenger_count                 extra -0.01
## 173            tip_amount                 extra -0.01
## 197       pickup_latitude            tip_amount -0.01
## 203                 extra            tip_amount -0.01
## 245       pickup_latitude          total_amount -0.01
## 248      dropoff_latitude          total_amount -0.01
## 15  improvement_surcharge              VendorID -0.02
## 20       pickup_longitude       passenger_count -0.02
## 23      dropoff_longitude       passenger_count -0.02
## 50        passenger_count      pickup_longitude -0.02
## 63  improvement_surcharge      pickup_longitude -0.02
## 74            fare_amount       pickup_latitude -0.02
## 98        passenger_count     dropoff_longitude -0.02
## 111 improvement_surcharge     dropoff_longitude -0.02
## 140               mta_tax          payment_type -0.02
## 149       pickup_latitude           fare_amount -0.02
## 185          payment_type               mta_tax -0.02
## 225              VendorID improvement_surcharge -0.02
## 228      pickup_longitude improvement_surcharge -0.02
## 231     dropoff_longitude improvement_surcharge -0.02
## 44                mta_tax         trip_distance -0.03
## 139                 extra          payment_type -0.03
## 142          tolls_amount          payment_type -0.03
## 169          payment_type                 extra -0.03
## 172               mta_tax                 extra -0.03
## 179         trip_distance               mta_tax -0.03
## 187                 extra               mta_tax -0.03
## 217          payment_type          tolls_amount -0.03
## 70             RatecodeID       pickup_latitude -0.04
## 85        pickup_latitude            RatecodeID -0.04
## 189            tip_amount               mta_tax -0.04
## 204               mta_tax            tip_amount -0.04
## 4        pickup_longitude              VendorID -0.05
## 7       dropoff_longitude              VendorID -0.05
## 43                  extra         trip_distance -0.05
## 49               VendorID      pickup_longitude -0.05
## 88       dropoff_latitude            RatecodeID -0.05
## 97               VendorID     dropoff_longitude -0.05
## 118            RatecodeID      dropoff_latitude -0.05
## 138           fare_amount          payment_type -0.05
## 153          payment_type           fare_amount -0.05
## 163         trip_distance                 extra -0.05
## 174          tolls_amount                 extra -0.05
## 219                 extra          tolls_amount -0.05
## 41           payment_type         trip_distance -0.06
## 91                  extra            RatecodeID -0.06
## 131         trip_distance          payment_type -0.06
## 166            RatecodeID                 extra -0.06
## 190          tolls_amount               mta_tax -0.06
## 220               mta_tax          tolls_amount -0.06
## 95  improvement_surcharge            RatecodeID -0.07
## 230            RatecodeID improvement_surcharge -0.07
## 143 improvement_surcharge          payment_type -0.08
## 233          payment_type improvement_surcharge -0.08
## 176          total_amount                 extra -0.09
## 251                 extra          total_amount -0.09
## 155                 extra           fare_amount -0.12
## 170           fare_amount                 extra -0.12
## 144          total_amount          payment_type -0.13
## 249          payment_type          total_amount -0.13
## 141            tip_amount          payment_type -0.49
## 201          payment_type            tip_amount -0.49
## 56       dropoff_latitude      pickup_longitude -0.88
## 71      dropoff_longitude       pickup_latitude -0.88
## 101       pickup_latitude     dropoff_longitude -0.88
## 116      pickup_longitude      dropoff_latitude -0.88
## 53        pickup_latitude      pickup_longitude -1.00
## 68       pickup_longitude       pickup_latitude -1.00
## 104      dropoff_latitude     dropoff_longitude -1.00
## 119     dropoff_longitude      dropoff_latitude -1.00

Map distributions

library(reshape2)
# Preview the long form of the numeric columns. No id variables are given,
# so every column becomes a (variable, value) measure.
head(reshape2::melt(data = num_data))
## No id variables; using all as measure variables
##   variable value
## 1 VendorID     1
## 2 VendorID     1
## 3 VendorID     1
## 4 VendorID     1
## 5 VendorID     2
## 6 VendorID     1
# Preview the long form of the full data set. melt() picks the id variables
# itself here; the chosen columns are listed in the message it prints.
head(reshape2::melt(data = df))
## Using pickup_date, pickup_weekday, pickup_timeofday, dropoff_date, dropoff_weekday, dropoff_timeofday, store_and_fwd_flag as id variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
##   pickup_date pickup_weekday pickup_timeofday dropoff_date dropoff_weekday
## 1    1/4/2016            Mon        Afternoon     1/4/2016             Mon
## 2   1/10/2016            Sun          Evening    1/10/2016             Sun
## 3    1/6/2016            Wed          Morning     1/6/2016             Wed
## 4    1/4/2016            Mon        Afternoon     1/4/2016             Mon
## 5   1/14/2016            Thu          Evening    1/14/2016             Thu
## 6    1/8/2016            Fri          Morning     1/8/2016             Fri
##   dropoff_timeofday store_and_fwd_flag variable value
## 1         Afternoon                  N VendorID     1
## 2           Evening                  N VendorID     1
## 3           Morning                  N VendorID     1
## 4         Afternoon                  N VendorID     1
## 5           Evening                  N VendorID     2
## 6           Morning                  N VendorID     1
library(ggplot2)
# One 10-bin histogram per numeric column, each panel with its own x scale.
ggplot(melt(num_data), aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~variable, scales = "free_x")
## No id variables; using all as measure variables
## Warning: Removed 998672 rows containing non-finite values (stat_bin).

# Same faceted 10-bin histograms, built from the melted full data set.
# NOTE(review): melt() warns that attributes are dropped across the measure
# variables, so date-time columns are presumably plotted as raw numbers -
# confirm this is intended.
ggplot(melt(df), aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~variable, scales = "free_x")
## Using pickup_date, pickup_weekday, pickup_timeofday, dropoff_date, dropoff_weekday, dropoff_timeofday, store_and_fwd_flag as id variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
## Warning: Removed 1248340 rows containing non-finite values (stat_bin).