# Get data ----
# Attach readr and load the yellow-taxi 2016 sample into `df`.
# NOTE: the path below is specific to the author's machine; readr reports the
# column types it guessed when the file is read.
library(readr)
df <- read_csv(
  file = "~/Documents/School files/MS Program Spring 2016/Classes 2017/Fall 2017/CSE 891/Taxi Project/Data/TaxiData Offical/Yellow2016Sample.csv"
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## VendorID = col_integer(),
## tpep_pickup_datetime = col_datetime(format = ""),
## pickup_date = col_character(),
## pickup_time = col_time(format = ""),
## pickup_weekday = col_character(),
## pickup_timeofday = col_character(),
## tpep_dropoff_datetime = col_datetime(format = ""),
## dropoff_date = col_character(),
## dropoff_time = col_time(format = ""),
## dropoff_weekday = col_character(),
## dropoff_timeofday = col_character(),
## passenger_count = col_integer(),
## RatecodeID = col_integer(),
## store_and_fwd_flag = col_character(),
## payment_type = col_integer()
## )
## See spec(...) for full column specifications.
# Column types ----
# List the column names of the loaded table.
names(df)
## [1] "VendorID" "tpep_pickup_datetime"
## [3] "pickup_date" "pickup_time"
## [5] "pickup_weekday" "pickup_timeofday"
## [7] "tpep_dropoff_datetime" "dropoff_date"
## [9] "dropoff_time" "dropoff_weekday"
## [11] "dropoff_timeofday" "passenger_count"
## [13] "trip_distance" "pickup_longitude"
## [15] "pickup_latitude" "RatecodeID"
## [17] "store_and_fwd_flag" "dropoff_longitude"
## [19] "dropoff_latitude" "payment_type"
## [21] "fare_amount" "extra"
## [23] "mta_tax" "tip_amount"
## [25] "tolls_amount" "improvement_surcharge"
## [27] "total_amount"
# Report the class vector of every column. lapply() always returns a named
# list, whereas sapply() only does so here because some columns carry two
# classes (e.g. "POSIXct" "POSIXt"); the printed result is unchanged.
lapply(df, class)
## $VendorID
## [1] "integer"
##
## $tpep_pickup_datetime
## [1] "POSIXct" "POSIXt"
##
## $pickup_date
## [1] "character"
##
## $pickup_time
## [1] "hms" "difftime"
##
## $pickup_weekday
## [1] "character"
##
## $pickup_timeofday
## [1] "character"
##
## $tpep_dropoff_datetime
## [1] "POSIXct" "POSIXt"
##
## $dropoff_date
## [1] "character"
##
## $dropoff_time
## [1] "hms" "difftime"
##
## $dropoff_weekday
## [1] "character"
##
## $dropoff_timeofday
## [1] "character"
##
## $passenger_count
## [1] "integer"
##
## $trip_distance
## [1] "numeric"
##
## $pickup_longitude
## [1] "numeric"
##
## $pickup_latitude
## [1] "numeric"
##
## $RatecodeID
## [1] "integer"
##
## $store_and_fwd_flag
## [1] "character"
##
## $dropoff_longitude
## [1] "numeric"
##
## $dropoff_latitude
## [1] "numeric"
##
## $payment_type
## [1] "integer"
##
## $fare_amount
## [1] "numeric"
##
## $extra
## [1] "numeric"
##
## $mta_tax
## [1] "numeric"
##
## $tip_amount
## [1] "numeric"
##
## $tolls_amount
## [1] "numeric"
##
## $improvement_surcharge
## [1] "numeric"
##
## $total_amount
## [1] "numeric"
# Correlation matrix ----
# Keep only the numeric (integer/double) columns: date-time, time-of-day and
# character columns cannot be correlated. `num_data` is used below but was
# never created in this file.
num_data <- df[vapply(df, is.numeric, logical(1))]

# Pairwise Pearson correlations over rows with no missing values, rounded to
# two decimals for display.
# NOTE(review): |r| = 1.00 between latitude and longitude is suspicious —
# possibly zero-coded coordinates acting as outliers; confirm before
# interpreting.
cormat <- round(cor(num_data, use = "complete.obs"), 2)
head(cormat)
## VendorID passenger_count trip_distance pickup_longitude
## VendorID 1.00 0.29 0.02 -0.05
## passenger_count 0.29 1.00 0.01 -0.02
## trip_distance 0.02 0.01 1.00 -0.01
## pickup_longitude -0.05 -0.02 -0.01 1.00
## pickup_latitude 0.05 0.02 0.01 -1.00
## RatecodeID 0.00 -0.01 0.22 0.04
## pickup_latitude RatecodeID dropoff_longitude
## VendorID 0.05 0.00 -0.05
## passenger_count 0.02 -0.01 -0.02
## trip_distance 0.01 0.22 -0.01
## pickup_longitude -1.00 0.04 0.88
## pickup_latitude 1.00 -0.04 -0.88
## RatecodeID -0.04 1.00 0.05
## dropoff_latitude payment_type fare_amount extra mta_tax
## VendorID 0.05 -0.01 0.01 -0.01 0.00
## passenger_count 0.02 0.01 0.01 -0.01 0.00
## trip_distance 0.01 -0.06 0.72 -0.05 -0.03
## pickup_longitude -0.88 0.00 0.02 0.00 0.00
## pickup_latitude 0.88 0.00 -0.02 0.00 0.00
## RatecodeID -0.05 0.00 0.24 -0.06 0.05
## tip_amount tolls_amount improvement_surcharge
## VendorID 0.01 0.01 -0.02
## passenger_count 0.00 0.01 0.00
## trip_distance 0.53 0.45 0.02
## pickup_longitude 0.01 0.00 -0.02
## pickup_latitude -0.01 0.00 0.02
## RatecodeID 0.16 0.17 -0.07
## total_amount
## VendorID 0.01
## passenger_count 0.01
## trip_distance 0.77
## pickup_longitude 0.01
## pickup_latitude -0.01
## RatecodeID 0.25
# Melt the correlation matrix ----
# Reshape the correlation matrix to long form: one row per (Var1, Var2) pair
# with the coefficient in `value`, then inspect the structure.
library(reshape2)
MELTcormat <- melt(data = cormat)
str(object = MELTcormat)
## 'data.frame': 256 obs. of 3 variables:
## $ Var1 : Factor w/ 16 levels "VendorID","passenger_count",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Var2 : Factor w/ 16 levels "VendorID","passenger_count",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ value: num 1 0.29 0.02 -0.05 0.05 0 -0.05 0.05 -0.01 0.01 ...
# Drop the diagonal (each variable against itself), then list the remaining
# pairs from the most positive to the most negative coefficient. The matrix is
# symmetric, so every pair shows up twice (once per orientation).
MELTcormat2 <- MELTcormat[with(MELTcormat, Var1 != Var2), ]
corVarOrderedDESC <- MELTcormat2[with(MELTcormat2, order(-value)), ]
corVarOrderedDESC
## Var1 Var2 value
## 160 total_amount fare_amount 0.98
## 250 fare_amount total_amount 0.98
## 55 dropoff_longitude pickup_longitude 0.88
## 72 dropoff_latitude pickup_latitude 0.88
## 100 pickup_longitude dropoff_longitude 0.88
## 117 pickup_latitude dropoff_latitude 0.88
## 48 total_amount trip_distance 0.77
## 243 trip_distance total_amount 0.77
## 42 fare_amount trip_distance 0.72
## 147 trip_distance fare_amount 0.72
## 208 total_amount tip_amount 0.59
## 253 tip_amount total_amount 0.59
## 45 tip_amount trip_distance 0.53
## 195 trip_distance tip_amount 0.53
## 224 total_amount tolls_amount 0.47
## 254 tolls_amount total_amount 0.47
## 46 tolls_amount trip_distance 0.45
## 157 tip_amount fare_amount 0.45
## 202 fare_amount tip_amount 0.45
## 211 trip_distance tolls_amount 0.45
## 158 tolls_amount fare_amount 0.35
## 218 fare_amount tolls_amount 0.35
## 206 tolls_amount tip_amount 0.31
## 221 tip_amount tolls_amount 0.31
## 2 passenger_count VendorID 0.29
## 17 VendorID passenger_count 0.29
## 96 total_amount RatecodeID 0.25
## 246 RatecodeID total_amount 0.25
## 90 fare_amount RatecodeID 0.24
## 150 RatecodeID fare_amount 0.24
## 38 RatecodeID trip_distance 0.22
## 83 trip_distance RatecodeID 0.22
## 156 mta_tax fare_amount 0.19
## 186 fare_amount mta_tax 0.19
## 94 tolls_amount RatecodeID 0.17
## 214 RatecodeID tolls_amount 0.17
## 93 tip_amount RatecodeID 0.16
## 191 improvement_surcharge mta_tax 0.16
## 192 total_amount mta_tax 0.16
## 198 RatecodeID tip_amount 0.16
## 236 mta_tax improvement_surcharge 0.16
## 252 mta_tax total_amount 0.16
## 5 pickup_latitude VendorID 0.05
## 8 dropoff_latitude VendorID 0.05
## 65 VendorID pickup_latitude 0.05
## 87 dropoff_longitude RatecodeID 0.05
## 92 mta_tax RatecodeID 0.05
## 102 RatecodeID dropoff_longitude 0.05
## 113 VendorID dropoff_latitude 0.05
## 182 RatecodeID mta_tax 0.05
## 54 RatecodeID pickup_longitude 0.04
## 84 pickup_longitude RatecodeID 0.04
## 175 improvement_surcharge extra 0.04
## 235 extra improvement_surcharge 0.04
## 3 trip_distance VendorID 0.02
## 21 pickup_latitude passenger_count 0.02
## 24 dropoff_latitude passenger_count 0.02
## 33 VendorID trip_distance 0.02
## 47 improvement_surcharge trip_distance 0.02
## 58 fare_amount pickup_longitude 0.02
## 66 passenger_count pickup_latitude 0.02
## 79 improvement_surcharge pickup_latitude 0.02
## 114 passenger_count dropoff_latitude 0.02
## 127 improvement_surcharge dropoff_latitude 0.02
## 148 pickup_longitude fare_amount 0.02
## 159 improvement_surcharge fare_amount 0.02
## 227 trip_distance improvement_surcharge 0.02
## 229 pickup_latitude improvement_surcharge 0.02
## 232 dropoff_latitude improvement_surcharge 0.02
## 234 fare_amount improvement_surcharge 0.02
## 240 total_amount improvement_surcharge 0.02
## 255 improvement_surcharge total_amount 0.02
## 10 fare_amount VendorID 0.01
## 13 tip_amount VendorID 0.01
## 14 tolls_amount VendorID 0.01
## 16 total_amount VendorID 0.01
## 19 trip_distance passenger_count 0.01
## 25 payment_type passenger_count 0.01
## 26 fare_amount passenger_count 0.01
## 30 tolls_amount passenger_count 0.01
## 32 total_amount passenger_count 0.01
## 34 passenger_count trip_distance 0.01
## 37 pickup_latitude trip_distance 0.01
## 40 dropoff_latitude trip_distance 0.01
## 61 tip_amount pickup_longitude 0.01
## 64 total_amount pickup_longitude 0.01
## 67 trip_distance pickup_latitude 0.01
## 105 payment_type dropoff_longitude 0.01
## 106 fare_amount dropoff_longitude 0.01
## 112 total_amount dropoff_longitude 0.01
## 115 trip_distance dropoff_latitude 0.01
## 130 passenger_count payment_type 0.01
## 135 dropoff_longitude payment_type 0.01
## 145 VendorID fare_amount 0.01
## 146 passenger_count fare_amount 0.01
## 151 dropoff_longitude fare_amount 0.01
## 193 VendorID tip_amount 0.01
## 196 pickup_longitude tip_amount 0.01
## 207 improvement_surcharge tip_amount 0.01
## 209 VendorID tolls_amount 0.01
## 210 passenger_count tolls_amount 0.01
## 237 tip_amount improvement_surcharge 0.01
## 241 VendorID total_amount 0.01
## 242 passenger_count total_amount 0.01
## 244 pickup_longitude total_amount 0.01
## 247 dropoff_longitude total_amount 0.01
## 6 RatecodeID VendorID 0.00
## 12 mta_tax VendorID 0.00
## 28 mta_tax passenger_count 0.00
## 29 tip_amount passenger_count 0.00
## 31 improvement_surcharge passenger_count 0.00
## 57 payment_type pickup_longitude 0.00
## 59 extra pickup_longitude 0.00
## 60 mta_tax pickup_longitude 0.00
## 62 tolls_amount pickup_longitude 0.00
## 73 payment_type pickup_latitude 0.00
## 75 extra pickup_latitude 0.00
## 76 mta_tax pickup_latitude 0.00
## 78 tolls_amount pickup_latitude 0.00
## 81 VendorID RatecodeID 0.00
## 89 payment_type RatecodeID 0.00
## 107 extra dropoff_longitude 0.00
## 108 mta_tax dropoff_longitude 0.00
## 109 tip_amount dropoff_longitude 0.00
## 110 tolls_amount dropoff_longitude 0.00
## 123 extra dropoff_latitude 0.00
## 124 mta_tax dropoff_latitude 0.00
## 125 tip_amount dropoff_latitude 0.00
## 126 tolls_amount dropoff_latitude 0.00
## 132 pickup_longitude payment_type 0.00
## 133 pickup_latitude payment_type 0.00
## 134 RatecodeID payment_type 0.00
## 164 pickup_longitude extra 0.00
## 165 pickup_latitude extra 0.00
## 167 dropoff_longitude extra 0.00
## 168 dropoff_latitude extra 0.00
## 177 VendorID mta_tax 0.00
## 178 passenger_count mta_tax 0.00
## 180 pickup_longitude mta_tax 0.00
## 181 pickup_latitude mta_tax 0.00
## 183 dropoff_longitude mta_tax 0.00
## 184 dropoff_latitude mta_tax 0.00
## 194 passenger_count tip_amount 0.00
## 199 dropoff_longitude tip_amount 0.00
## 200 dropoff_latitude tip_amount 0.00
## 212 pickup_longitude tolls_amount 0.00
## 213 pickup_latitude tolls_amount 0.00
## 215 dropoff_longitude tolls_amount 0.00
## 216 dropoff_latitude tolls_amount 0.00
## 223 improvement_surcharge tolls_amount 0.00
## 226 passenger_count improvement_surcharge 0.00
## 238 tolls_amount improvement_surcharge 0.00
## 9 payment_type VendorID -0.01
## 11 extra VendorID -0.01
## 22 RatecodeID passenger_count -0.01
## 27 extra passenger_count -0.01
## 36 pickup_longitude trip_distance -0.01
## 39 dropoff_longitude trip_distance -0.01
## 51 trip_distance pickup_longitude -0.01
## 77 tip_amount pickup_latitude -0.01
## 80 total_amount pickup_latitude -0.01
## 82 passenger_count RatecodeID -0.01
## 99 trip_distance dropoff_longitude -0.01
## 121 payment_type dropoff_latitude -0.01
## 122 fare_amount dropoff_latitude -0.01
## 128 total_amount dropoff_latitude -0.01
## 129 VendorID payment_type -0.01
## 136 dropoff_latitude payment_type -0.01
## 152 dropoff_latitude fare_amount -0.01
## 161 VendorID extra -0.01
## 162 passenger_count extra -0.01
## 173 tip_amount extra -0.01
## 197 pickup_latitude tip_amount -0.01
## 203 extra tip_amount -0.01
## 245 pickup_latitude total_amount -0.01
## 248 dropoff_latitude total_amount -0.01
## 15 improvement_surcharge VendorID -0.02
## 20 pickup_longitude passenger_count -0.02
## 23 dropoff_longitude passenger_count -0.02
## 50 passenger_count pickup_longitude -0.02
## 63 improvement_surcharge pickup_longitude -0.02
## 74 fare_amount pickup_latitude -0.02
## 98 passenger_count dropoff_longitude -0.02
## 111 improvement_surcharge dropoff_longitude -0.02
## 140 mta_tax payment_type -0.02
## 149 pickup_latitude fare_amount -0.02
## 185 payment_type mta_tax -0.02
## 225 VendorID improvement_surcharge -0.02
## 228 pickup_longitude improvement_surcharge -0.02
## 231 dropoff_longitude improvement_surcharge -0.02
## 44 mta_tax trip_distance -0.03
## 139 extra payment_type -0.03
## 142 tolls_amount payment_type -0.03
## 169 payment_type extra -0.03
## 172 mta_tax extra -0.03
## 179 trip_distance mta_tax -0.03
## 187 extra mta_tax -0.03
## 217 payment_type tolls_amount -0.03
## 70 RatecodeID pickup_latitude -0.04
## 85 pickup_latitude RatecodeID -0.04
## 189 tip_amount mta_tax -0.04
## 204 mta_tax tip_amount -0.04
## 4 pickup_longitude VendorID -0.05
## 7 dropoff_longitude VendorID -0.05
## 43 extra trip_distance -0.05
## 49 VendorID pickup_longitude -0.05
## 88 dropoff_latitude RatecodeID -0.05
## 97 VendorID dropoff_longitude -0.05
## 118 RatecodeID dropoff_latitude -0.05
## 138 fare_amount payment_type -0.05
## 153 payment_type fare_amount -0.05
## 163 trip_distance extra -0.05
## 174 tolls_amount extra -0.05
## 219 extra tolls_amount -0.05
## 41 payment_type trip_distance -0.06
## 91 extra RatecodeID -0.06
## 131 trip_distance payment_type -0.06
## 166 RatecodeID extra -0.06
## 190 tolls_amount mta_tax -0.06
## 220 mta_tax tolls_amount -0.06
## 95 improvement_surcharge RatecodeID -0.07
## 230 RatecodeID improvement_surcharge -0.07
## 143 improvement_surcharge payment_type -0.08
## 233 payment_type improvement_surcharge -0.08
## 176 total_amount extra -0.09
## 251 extra total_amount -0.09
## 155 extra fare_amount -0.12
## 170 fare_amount extra -0.12
## 144 total_amount payment_type -0.13
## 249 payment_type total_amount -0.13
## 141 tip_amount payment_type -0.49
## 201 payment_type tip_amount -0.49
## 56 dropoff_latitude pickup_longitude -0.88
## 71 dropoff_longitude pickup_latitude -0.88
## 101 pickup_latitude dropoff_longitude -0.88
## 116 pickup_longitude dropoff_latitude -0.88
## 53 pickup_latitude pickup_longitude -1.00
## 68 pickup_longitude pickup_latitude -1.00
## 104 dropoff_latitude dropoff_longitude -1.00
## 119 dropoff_longitude dropoff_latitude -1.00
# Map distributions ----
# Preview the long (variable, value) form of the numeric columns; with no id
# variables given, melt() treats every column as a measure variable.
library(reshape2)
head(melt(data = num_data), n = 6L)
## No id variables; using all as measure variables
## variable value
## 1 VendorID 1
## 2 VendorID 1
## 3 VendorID 1
## 4 VendorID 1
## 5 VendorID 2
## 6 VendorID 1
# Same preview for the full table: melt() uses the character columns as id
# variables and stacks everything else.
head(melt(data = df), n = 6L)
## Using pickup_date, pickup_weekday, pickup_timeofday, dropoff_date, dropoff_weekday, dropoff_timeofday, store_and_fwd_flag as id variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
## pickup_date pickup_weekday pickup_timeofday dropoff_date dropoff_weekday
## 1 1/4/2016 Mon Afternoon 1/4/2016 Mon
## 2 1/10/2016 Sun Evening 1/10/2016 Sun
## 3 1/6/2016 Wed Morning 1/6/2016 Wed
## 4 1/4/2016 Mon Afternoon 1/4/2016 Mon
## 5 1/14/2016 Thu Evening 1/14/2016 Thu
## 6 1/8/2016 Fri Morning 1/8/2016 Fri
## dropoff_timeofday store_and_fwd_flag variable value
## 1 Afternoon N VendorID 1
## 2 Evening N VendorID 1
## 3 Morning N VendorID 1
## 4 Afternoon N VendorID 1
## 5 Evening N VendorID 2
## 6 Morning N VendorID 1
# Histogram of every numeric column, one facet per variable; each facet gets
# its own x-axis range.
library(ggplot2)
ggplot(melt(num_data), aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~ variable, scales = "free_x")
## No id variables; using all as measure variables
## Warning: Removed 998672 rows containing non-finite values (stat_bin).

# Same faceted histograms, built from the full table.
ggplot(melt(df), aes(x = value)) +
  geom_histogram(bins = 10) +
  facet_wrap(~ variable, scales = "free_x")
## Using pickup_date, pickup_weekday, pickup_timeofday, dropoff_date, dropoff_weekday, dropoff_timeofday, store_and_fwd_flag as id variables
## Warning: attributes are not identical across measure variables; they will
## be dropped
## Warning: Removed 1248340 rows containing non-finite values (stat_bin).
