The dataset I used for this analysis was from Kaggle (https://www.kaggle.com/brllrb/uber-and-lyft-dataset-boston-ma), which was collected from 11-26-18 to 12-18-18 in Boston, MA. The data was gathered from various entities including Uber and Lyft.
Upon studying this dataset, I found that of the 637,976 rows of data, the split between Uber and Lyft rides was roughly 50/50. When coding, I searched for complete cases in order to avoid any NA values in the dataframe. I registered my own API with google in order to assist with making any desired maps.
lyft.uber.dataset <- read_csv("/Users/bethanyleach/Downloads/rideshare_kaggle.csv")
## Rows: 693071 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): id, timezone, source, destination, cab_type, product_id, name, sh...
## dbl (46): timestamp, hour, day, month, price, distance, surge_multiplier, l...
## dttm (1): datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lyft.uber.revised <- as_tibble(lyft.uber.dataset)
lyft.uber.remove.na <- lyft.uber.revised[complete.cases(lyft.uber.revised), ]
register_google("AIzaSyBZx2Za3bJfe2OY0QxgPgef-4GX9jd61Pg")
There are twelve neighborhoods included in this analysis. Below, I’ve attached a labeled map of the neighborhoods, a density diagram, and a graph examining the average cab price per neighborhood route.
#labeled neighborhood map
boston3 <- boston_stamen
lat <- c(42.364, 42.352, 42.3398, 42.3503, 42.3505, 42.3505, 42.3519, 42.3559, 42.3588, 42.3647, 42.3661, 42.3661)
lon <- c(-71.060, -71.065, -71.0892, -71.0810, -71.1054, -71.1054, -71.0551, -71.0550, -71.0707, -71.0542, -71.0631, -71.0631)
labels <- c("Haymarket Square", "Theatre District", "Northeastern University", "Back Bay", "Boston University",
"Fenway", "South Station", "Financial District", "Beacon Hill", "North End", "North Station", "West End")
combined_df <- data.frame(lat, lon, labels)
neighborhood_map_adj <- ggmap(boston3) + ggtitle("Labeled Map of Neighborhoods in Boston")+
geom_point(data=combined_df, aes(x=lon, y=lat),
size=0.5, alpha =0.5, fill="red") + labs(x = 'Longitude', y = 'Latitude') +
geom_label_repel(data=combined_df, aes(x = lon, y= lat,
label = labels), fill = "white",
box.padding = unit(.4, "lines"),
label.padding = unit(.15, "lines"),
segment.color = "red", segment.size = 1)
neighborhood_map_adj
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
stat_density_boston_map <- ggmap(boston3) + stat_density_2d(data=lyft.uber.remove.na, aes(x=longitude, y=latitude, fill = stat(level)), geom="polygon") +
ggtitle("Density Map of Cab Rides in Boston Neighborhoods")
stat_density_boston_map
## Warning: Removed 81570 rows containing non-finite values (stat_density2d).
I used both ggmap and get_stamenmap to create the labeled map featured above. I used latitude and longitude to clearly indicate where each neighborhood is located.
This densitymap outlines the density of total rides in each of the neighborhoods. The lighter blue indicates a larger number of rides.
#total rides by hour and day
my_months_name <- month.name[lyft.uber.remove.na$month]
lyft.uber.remove.na_months <- lyft.uber.remove.na %>%
mutate(month_name = my_months_name)
uber_lyft_rides_per_day_november <- lyft.uber.remove.na_months %>%
filter(month_name == "November")
uber_lyft_rides_per_day_december <- lyft.uber.remove.na_months %>%
filter(month_name == "December")
#separating datetime column in month being November
uber_lyft_rides_per_day_november$Date <- as.Date(uber_lyft_rides_per_day_november$datetime)
uber_lyft_rides_per_day_november$Time <- format(as.POSIXct(uber_lyft_rides_per_day_november$datetime),
format = "%H:%M:%S")
unique(uber_lyft_rides_per_day_november$day)
## [1] 27 28 30 29 26
begin_nov <- as.POSIXct("2018-11-26")
finish_nov <- as.POSIXct("2018-11-30")
dat3 <- data.frame(Date = seq.POSIXt(from = begin_nov, to = finish_nov, by ="DSTday"))
dat3$weekday1 <- as.numeric(format(dat3$Date, format = "%u"))
dat3$weekday2 <- format(dat3$Date, format = "%a")
dat3$weekday3 <- format(dat3$Date, format = "%A")
uber_lyft_november_merge_dates <- uber_lyft_rides_per_day_november %>%
full_join(dat3, by="Date")
uber_lyft_november_merge_dates$weekday3 <- ordered(uber_lyft_november_merge_dates$weekday3,
levels=c("Monday","Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"))
uber_lyft_november_merge_dates <- uber_lyft_november_merge_dates %>%
select(-datetime)
uber_lyft_november_merge_dates
## # A tibble: 263,771 × 62
## id timestamp hour day month timezone source destination cab_type
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 4bd23055-68… 1.54e9 2 27 11 America… Hayma… North Stat… Lyft
## 2 981a3613-77… 1.54e9 1 28 11 America… Hayma… North Stat… Lyft
## 3 c2d88af2-d2… 1.54e9 4 30 11 America… Hayma… North Stat… Lyft
## 4 e0126e1f-8c… 1.54e9 3 29 11 America… Hayma… North Stat… Lyft
## 5 462816a3-82… 1.54e9 5 26 11 America… Back … Northeaste… Lyft
## 6 8612d909-98… 1.54e9 10 27 11 America… Back … Northeaste… Lyft
## 7 9043bf77-1d… 1.54e9 16 30 11 America… Back … Northeaste… Lyft
## 8 d859ec69-b3… 1.54e9 19 28 11 America… Back … Northeaste… Lyft
## 9 009e9c53-07… 1.54e9 22 30 11 America… North… West End Uber
## 10 e219e545-a0… 1.54e9 19 29 11 America… North… West End Uber
## # … with 263,761 more rows, and 53 more variables: product_id <chr>,
## # name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## # latitude <dbl>, longitude <dbl>, temperature <dbl>,
## # apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## # precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## # windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## # temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
uber_lyft_november_merge_dates <- uber_lyft_november_merge_dates %>%
filter(source != "NA")
unique(uber_lyft_november_merge_dates)
## # A tibble: 263,771 × 62
## id timestamp hour day month timezone source destination cab_type
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 4bd23055-68… 1.54e9 2 27 11 America… Hayma… North Stat… Lyft
## 2 981a3613-77… 1.54e9 1 28 11 America… Hayma… North Stat… Lyft
## 3 c2d88af2-d2… 1.54e9 4 30 11 America… Hayma… North Stat… Lyft
## 4 e0126e1f-8c… 1.54e9 3 29 11 America… Hayma… North Stat… Lyft
## 5 462816a3-82… 1.54e9 5 26 11 America… Back … Northeaste… Lyft
## 6 8612d909-98… 1.54e9 10 27 11 America… Back … Northeaste… Lyft
## 7 9043bf77-1d… 1.54e9 16 30 11 America… Back … Northeaste… Lyft
## 8 d859ec69-b3… 1.54e9 19 28 11 America… Back … Northeaste… Lyft
## 9 009e9c53-07… 1.54e9 22 30 11 America… North… West End Uber
## 10 e219e545-a0… 1.54e9 19 29 11 America… North… West End Uber
## # … with 263,761 more rows, and 53 more variables: product_id <chr>,
## # name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## # latitude <dbl>, longitude <dbl>, temperature <dbl>,
## # apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## # precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## # windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## # temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
daily_number_rides_st_nov <- ggplot(uber_lyft_november_merge_dates, aes(source, fill=weekday3)) + geom_bar(position="dodge") +
scale_y_continuous(labels=comma) + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) +
ggtitle("Daily Number of Total Rides Per Station - November") + scale_fill_discrete(name = "Day of the Week") +
xlab("Station") + ylab("Total Rides")
uber_lyft_november_merge_dates_imp_info<- uber_lyft_november_merge_dates %>%
dplyr::group_by(Date, weekday1, weekday2, weekday3) %>%
dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'Date', 'weekday1', 'weekday2'. You can
## override using the `.groups` argument.
uber_lyft_november_merge_dates_imp_info
## # A tibble: 5 × 5
## # Groups: Date, weekday1, weekday2 [5]
## Date weekday1 weekday2 weekday3 total_rides
## <dttm> <dbl> <chr> <ord> <int>
## 1 2018-11-26 00:00:00 1 Mon Monday 29028
## 2 2018-11-27 00:00:00 2 Tue Tuesday 70135
## 3 2018-11-28 00:00:00 3 Wed Wednesday 67842
## 4 2018-11-29 00:00:00 4 Thu Thursday 55222
## 5 2018-11-30 00:00:00 5 Fri Friday 41544
#separating datetime column in month being December
uber_lyft_rides_per_day_december_altered_date <- uber_lyft_rides_per_day_december
uber_lyft_rides_per_day_december_altered_date$Date <- as.Date(uber_lyft_rides_per_day_december_altered_date$datetime)
uber_lyft_rides_per_day_december_altered_date$Time <- format(as.POSIXct(uber_lyft_rides_per_day_december_altered_date$datetime),
format = "%H:%M:%S")
begin <- as.POSIXct("2018-12-01")
finish <- as.POSIXct("2018-12-18")
dat2 <- data.frame(Date = seq.POSIXt(from = begin, to = finish, by ="DSTday"))
dat2$weekday1 <- as.numeric(format(dat2$Date, format = "%u"))
dat2$weekday2 <- format(dat2$Date, format = "%a")
dat2$weekday3 <- format(dat2$Date, format = "%A")
uber_lyft_december_merge_dates <- uber_lyft_rides_per_day_december_altered_date %>%
full_join(dat2, by="Date")
uber_lyft_december_merge_dates$weekday3 <- ordered(uber_lyft_december_merge_dates$weekday3,
levels=c("Monday","Tuesday", "Wednesday",
"Thursday", "Friday", "Saturday", "Sunday"))
uber_lyft_december_merge_dates <- uber_lyft_december_merge_dates %>%
select(-datetime)
uber_lyft_december_merge_dates
## # A tibble: 374,211 × 62
## id timestamp hour day month timezone source destination cab_type
## <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 424553bb-71… 1.54e9 9 16 12 America… Hayma… North Stat… Lyft
## 2 f6f6d7e4-3e… 1.55e9 18 17 12 America… Hayma… North Stat… Lyft
## 3 474d6376-bc… 1.54e9 19 2 12 America… Back … Northeaste… Lyft
## 4 4f9fee41-fd… 1.54e9 6 3 12 America… Back … Northeaste… Lyft
## 5 23f145da-f0… 1.54e9 10 13 12 America… North… West End Uber
## 6 357559cb-8c… 1.54e9 19 13 12 America… North… West End Uber
## 7 50ef1165-9d… 1.55e9 23 16 12 America… North… West End Uber
## 8 91c4861c-17… 1.54e9 0 14 12 America… North… West End Uber
## 9 18d580ac-c9… 1.54e9 6 16 12 America… North… Haymarket … Lyft
## 10 5ef44fdf-c5… 1.55e9 11 18 12 America… North… Haymarket … Lyft
## # … with 374,201 more rows, and 53 more variables: product_id <chr>,
## # name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## # latitude <dbl>, longitude <dbl>, temperature <dbl>,
## # apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## # precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## # windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## # temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
uber_lyft_december_merge_dates <- uber_lyft_december_merge_dates %>%
filter(source != "NA")
unique(uber_lyft_december_merge_dates$source)
## [1] "Haymarket Square" "Back Bay"
## [3] "North End" "North Station"
## [5] "Beacon Hill" "Boston University"
## [7] "Fenway" "South Station"
## [9] "Theatre District" "West End"
## [11] "Financial District" "Northeastern University"
daily_number_rides_st_dec <- ggplot(uber_lyft_december_merge_dates, aes(source, fill=weekday3)) + geom_bar(position="dodge") +
scale_y_continuous(labels=comma) + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) +
ggtitle("Daily Number of Total Rides Per Station - December") + xlab("Station") + ylab("Total Rides") +
scale_fill_discrete(name = "Day of the Week")
uber_lyft_december_merge_dates_imp_info<- uber_lyft_december_merge_dates %>%
dplyr::group_by(Date, weekday1, weekday2, weekday3) %>%
dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'Date', 'weekday1', 'weekday2'. You can
## override using the `.groups` argument.
uber_lyft_december_merge_dates_imp_info
## # A tibble: 12 × 5
## # Groups: Date, weekday1, weekday2 [12]
## Date weekday1 weekday2 weekday3 total_rides
## <dttm> <dbl> <chr> <ord> <int>
## 1 2018-12-01 00:00:00 6 Sat Saturday 41680
## 2 2018-12-02 00:00:00 7 Sun Sunday 41298
## 3 2018-12-03 00:00:00 1 Mon Monday 41323
## 4 2018-12-04 00:00:00 2 Tue Tuesday 11627
## 5 2018-12-09 00:00:00 7 Sun Sunday 1529
## 6 2018-12-10 00:00:00 1 Mon Monday 2534
## 7 2018-12-13 00:00:00 4 Thu Thursday 35496
## 8 2018-12-14 00:00:00 5 Fri Friday 41344
## 9 2018-12-15 00:00:00 6 Sat Saturday 41332
## 10 2018-12-16 00:00:00 7 Sun Sunday 41359
## 11 2018-12-17 00:00:00 1 Mon Monday 41354
## 12 2018-12-18 00:00:00 2 Tue Tuesday 33329
full_nov_dec_df <- rbind(uber_lyft_november_merge_dates, uber_lyft_december_merge_dates)
day_Hour <- full_nov_dec_df %>%
filter(hour != "NA") %>%
group_by(day, weekday3, hour) %>%
dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'day', 'weekday3'. You can override using
## the `.groups` argument.
day_hour_week_plto <- ggplot(day_Hour, aes(hour, total_rides, fill=weekday3)) + geom_bar(stat = "identity") +
ggtitle("Rides by Hour and Day of the Week") +
theme(plot.title = element_text(hjust=0.5)) +
scale_fill_discrete(name = "Day of the Week") + ylab("Total Rides") + xlab("Hour")
This section first examines the distribution of total number of cab rides per source and then per hour.
#daily number of rides per station
daily_rides_data <- full_nov_dec_df %>%
group_by(source, hour, weekday3) %>%
dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'source', 'hour'. You can override using
## the `.groups` argument.
daily_num_rides_station <- ggplot(daily_rides_data, aes(source, total_rides)) +
geom_bar(stat = "identity", position = "dodge", aes(fill = weekday3)) +
theme(axis.text.x = element_text(angle=90)) + xlab("Source") + ylab("Total Rides") +
ggtitle("Total Number of Rides pet Day of the Week by Source") +
scale_fill_discrete(name = "Day of the Week")
This graph clearly shows that regardless of source, the largest number of cab rides took place on Monday and Tuesday. Specifically, Tuesday tended to have even more cab rides than Monday.
#total number of rides by hour
hour_rides_data <- lyft.uber.remove.na %>%
dplyr::group_by(hour) %>%
dplyr::summarize(total_rides = n())
datatable(hour_rides_data)
hour_rides_data
## # A tibble: 24 × 2
## hour total_rides
## <dbl> <int>
## 1 0 29872
## 2 1 26310
## 3 2 26323
## 4 3 25530
## 5 4 26125
## 6 5 22995
## 7 6 25147
## 8 7 22930
## 9 8 22337
## 10 9 26673
## # … with 14 more rows
total_ride_per_hr_plot <- ggplot(hour_rides_data, aes(hour, total_rides)) +
geom_bar(stat="identity", fill="blue", color="orange") +
theme(legend.position="none") + ggtitle("Total Rides Per Hour") +
scale_y_continuous(label=comma) + ylab("Total Rides") +
theme(plot.title = element_text(hjust=0.5))
total_ride_per_hr_plot
This graph shows that the majority of rides occurred between 11PM and 12AM. The second highest number of rides occurred between 5PM and 6PM.
#total lyft rides by hour
hour_lyft_data <- lyft.uber.remove.na %>%
filter(cab_type == "Lyft") %>%
dplyr::group_by(hour) %>%
dplyr::summarize(total_rides = n())
hour_lyft_data_plot <- ggplot(hour_lyft_data, aes(hour, total_rides)) +
geom_bar(stat="identity", fill="blue", color="orange") +
theme(legend.position="none") + ggtitle("Lyft Rides Per Hour") +
scale_y_continuous(labels=comma) + theme(plot.title = element_text(hjust=0.5)) +
xlab("Hour") + ylab("Total Rides")
hour_lyft_data_plot
Similarly, this graph shows that the majority of rides occurred between 11PM and 12AM. The second highest number of rides occurred at 5PM.
#total uber rides by hour
hour_uber_data <- lyft.uber.remove.na %>%
filter(cab_type == "Uber") %>%
dplyr::group_by(hour) %>%
dplyr::summarize(total_rides = n())
hour_uber_data_plot <- ggplot(hour_uber_data, aes(hour, total_rides)) +
geom_bar(stat="identity", fill="blue", color="orange") +
theme(legend.position="none") + ggtitle("Uber Rides Per Hour") +
scale_y_continuous(labels=comma) + theme(plot.title = element_text(hjust=0.5)) +
xlab("Hour") + ylab("Total Rides")
hour_uber_data_plot
This graph shows the majority of rides occurred between 11PM and 12AM.
#Price vs hour - uber - distance/price to start
lyft.uber.price.vs.hour <- lyft.uber.remove.na %>%
group_by(price, distance) %>%
dplyr::select(-apparentTemperatureMaxTime, -apparentTemperatureMax, -apparentTemperatureMinTime, -apparentTemperatureMin,
-temperatureMaxTime, -temperatureMax, -temperatureMinTime, -temperatureMin, -uvIndexTime, -precipIntensityMax,
-moonPhase, -sunsetTime, -sunriseTime, -ozone, -visibility.1, -uvIndex, -cloudCover, -windBearing, -pressure,
-dewPoint, -icon, -apparentTemperatureLowTime, -apparentTemperatureLow, -apparentTemperatureHighTime,
-apparentTemperatureHigh, -temperatureLowTime, -temperatureLow, -temperatureHighTime, -temperatureHigh, -visibility,
-windGustTime, -windGust, -windSpeed, -humidity, -precipProbability, -precipIntensity, -long_summary, -short_summary,
-apparentTemperature) %>%
filter(surge_multiplier <= 1)
lyft.uber.avg.dist.price <- lyft.uber.price.vs.hour %>%
group_by(distance, cab_type) %>%
summarize_at(vars(price), list(name = mean))
lyft.uber.avg.dist.price_plot <- ggplot(lyft.uber.avg.dist.price, aes(distance, name, color=cab_type)) + geom_line() + ggtitle("Average Price by Distance for both Lyft and Uber")+
scale_fill_discrete("Type of Cab") + xlab("Distance") + ylab("Average Price")
lyft.uber.avg.dist.hour <- lyft.uber.price.vs.hour %>%
group_by(hour) %>%
summarize_at(vars(distance), list(name = mean))
lyft.uber.avg.dist.hour_plot <- ggplot(lyft.uber.avg.dist.hour, aes(hour, name)) + geom_line() + ggtitle("Average Ride Distance by Hour") + xlab("Hour of the Day") +
ylab("Average Distance (Miles)")
lyft.uber.avg.dist.hour_plot
This graph shows that on average, the longest ride distance took place at roughly 3PM and 9PM. The next longest ride distance intervals were at 7:30AM and 10AM. The points all make sense because the morning data correlates to the early commute and the evening data correlates to the latter commute.
#Lyft.Uber.TotalRides vs Temperature
temp_rides_data <- lyft.uber.remove.na %>%
dplyr::group_by(temperature, cab_type) %>%
dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'temperature'. You can override using the
## `.groups` argument.
total_rides_data_plot <- ggplot(temp_rides_data, aes(temperature, total_rides, color=cab_type)) +
geom_line() +
theme(legend.position="right") + ggtitle("Total Rides vs Temperature (F)") +
ylab("Total Rides") +
theme(plot.title = element_text(hjust=0.5)) + xlim(30,50) + ylim(0,2000)
total_rides_data_plot
## Warning: Removed 96 row(s) containing missing values (geom_path).
wind_gust_rides_data <- lyft.uber.remove.na %>%
dplyr::group_by(windGust, temperature, cab_type) %>%
dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'windGust', 'temperature'. You can override
## using the `.groups` argument.
I narrowed the temperature range of this graph in order to more accurately show the spikes in the number of cab rides. Regardless of cab type (Uber or Lyft), there were similar major surges and drops in the number of cab rides right around 35, 38, and 42 degrees. There were surges in the number of both Uber and Lyft rides ordered between 43 and 47 degrees Fahrenheit.
#Lyft.Uber_Wind_Gust
wind_gust_rides_data_plot <- ggplot(wind_gust_rides_data, aes(temperature, total_rides, color=windGust)) +
geom_line() + geom_point() +
theme(legend.position="right") + ggtitle("Total Rides vs Temperature (F) Colored by Wind Gust (mph") +
ylab("Total Rides") + xlab("Temperature") +
theme(plot.title = element_text(hjust=0.5)) + xlim(30,50) + ylim(0,2000)
wind_gust_rides_data_plot
## Warning: Removed 100 row(s) containing missing values (geom_path).
## Warning: Removed 131 rows containing missing values (geom_point).
cor(wind_gust_rides_data$windGust, wind_gust_rides_data$total_rides)
## [1] 0.2456817
To explore the temperature and total rides relationship further, I made another plot looking again at temperature vs rides, but colored by wind gust speed (mph). The higher speeds correlate to a lighter blue color and these surges in wind gust speeds fall between 42 and 45 degrees Fahrenheit, which was when there was an increase in both Uber and Lyft cab rides in the previous plot. In other words, this means that there is a positive correlation between an increase in rides and higher wind gusts although it is small (0.246).
lyft.uber.avg.dist.price_plot
This graph shows that regardless of distance traveled, riders prefer to use Uber over Lyft.
Before diving into the analysis, I wanted to determine if there was an obvious correlation between distance and price for both types of cabs. I noticed that the surge multiplier only applied to Lyfts, so I made two graphs. The first (Uber) was a correlation plot between distance and price and the second (Lyft) was a correlation plot between distance, price, and surge multiplier.
lyft.uber.remove.na_LYFT <- lyft.uber.remove.na %>%
filter(cab_type == "Lyft")
mydata_LYFT <- lyft.uber.remove.na_LYFT[, c(13,14,15)]
corrplot(cor(mydata_LYFT), tl.col = "brown", tl.srt = 30, bg = "White",
title = "\n\n Lyft: Correlation Between Price, Distance, and Surge Multiplier",
type = "full")
cor(lyft.uber.remove.na_LYFT[, c(13,14,15)])
## price distance surge_multiplier
## price 1.0000000 0.36156662 0.30823712
## distance 0.3615666 1.00000000 0.04007585
## surge_multiplier 0.3082371 0.04007585 1.00000000
lyft.uber.remove.na_LYFT <- lyft.uber.remove.na %>%
filter(cab_type == "Lyft")
mydata_LYFT <- lyft.uber.remove.na_LYFT[, c(13,14,15)]
corrplot(cor(mydata_LYFT), tl.col = "brown", tl.srt = 30, bg = "White",
title = "\n\n Lyft: Correlation Between Price, Distance, and Surge Multiplier",
type = "full")
cor(lyft.uber.remove.na_LYFT[, c(13,14,15)])
## price distance surge_multiplier
## price 1.0000000 0.36156662 0.30823712
## distance 0.3615666 1.00000000 0.04007585
## surge_multiplier 0.3082371 0.04007585 1.00000000
In addition to creating these correlation plots, I also found the direct correlation data for each cab type. Looking at the Uber graph, price and distance are weakly correlated with an exact correlation value of 0.3362. The Lyft graph shows that price and distance too aren’t strongly correlated (correlation value of 0.3616) and similarly surge_multiplier and price are weakly correlated (correlation value of 0.3082) and surge_multiplier and distance are even more weakly correlated (correlation value of 0.04008).
I was interested in seeing if there was a correlation between a cab route and the total number of rides. This section includes all available ride data for both Uber and Lyft. With respect to Lyft, this means that I’ve included both surge and non-surge data.
#avg price by source & neighborhood map, Both cab types, Both Surge and no Surge
avg_price_simulator <- full_nov_dec_df
avg_price_simulator <- avg_price_simulator[-c(1:2, 4:6, 10:11, 15:50)]
avg_price_simulator_reduced <- avg_price_simulator[-c(8:18)]
avg_price_simulator_reduced_2 <- avg_price_simulator_reduced
source_destination_lyft_surge_2 <- avg_price_simulator_reduced_2 %>%
unite('Route', source:destination, remove = FALSE)
source_destination_lyft_surge_2_adj<- source_destination_lyft_surge_2 %>%
group_by(Route) %>%
dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_2_adj
## # A tibble: 72 × 3
## Route total_rides mean_price
## <chr> <int> <dbl>
## 1 Back Bay_Boston University 8682 14.0
## 2 Back Bay_Fenway 8718 13.7
## 3 Back Bay_Haymarket Square 8838 18.0
## 4 Back Bay_North End 9414 19.5
## 5 Back Bay_Northeastern University 8825 13.2
## 6 Back Bay_South Station 8724 17.7
## 7 Beacon Hill_Boston University 8490 16.4
## 8 Beacon Hill_Fenway 8688 16.2
## 9 Beacon Hill_Haymarket Square 8802 13.8
## 10 Beacon Hill_North End 9240 15.3
## # … with 62 more rows
price_rides_route <- source_destination_lyft_surge_2_adj %>%
arrange(desc(mean_price))
price_rides_route_desc <- price_rides_route[1:5,]
price_rides_route_asc <- price_rides_route[68:72,]
price_rides_route_top_lowest <- rbind(price_rides_route_desc, price_rides_route_asc)
price_rides_route_desc_plot <- ggplot(data = price_rides_route_desc, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90)) + ggtitle("Mean Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
gprice_rides_route_top_lowest_plot <- ggplot(data = price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Five Priciest and Five Least Priciest Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
gprice_rides_route_top_lowest_plot
Above is a graph of the top ten mean cab rides in the aforementioned Boston neighborhoods. This plot shows that the Financial District to Boston University route has the highest average cab ride price. Inversely, the Haymarket Square to North Station route has the lowest average cab ride price. This is reasonable as the distance between the Financial District and Boston University is 4 miles whereas the distance between Haymarket Square and North Station is 0.5 miles. However, this translates to the most expensive ride equating to $6.38 per mile and the least expensive ride being a whopping $24.60 per mile.
#hour of day shortest ride and longest ride
#previous graph shows most expensive avg ride being financial district to boston university
#and least expensive being both Cab Types
source_destination_lyft_surge_2_adj_hour_shortest_ride <- source_destination_lyft_surge_2 %>%
filter(Route=="Haymarket Square_North Station") %>%
group_by(hour) %>%
dplyr::summarize(total_rides = n())
source_destination_lyft_surge_2_adj_hour_longest_ride <- source_destination_lyft_surge_2 %>%
filter(Route=="Financial District_Boston University") %>%
group_by(hour) %>%
dplyr::summarize(total_rides = n())
longest_shortest_combined <- rbind(source_destination_lyft_surge_2_adj_hour_shortest_ride,
source_destination_lyft_surge_2_adj_hour_longest_ride)
source_destination_lyft_surge_2_adj_hour_shortest_ride_plot <- ggplot(data = source_destination_lyft_surge_2_adj_hour_shortest_ride, aes(hour, total_rides, color="green")) +
geom_bar(stat = "identity", position = "dodge", fill="purple") + xlab("Hour") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") +
ggtitle("Haymarket Sqare to North Station - Number of Rides Per Hour")
source_destination_lyft_surge_2_adj_hour_shortest_ride_plot
source_destination_lyft_surge_2_adj_hour_longest_ride_plot <- ggplot(data = source_destination_lyft_surge_2_adj_hour_longest_ride, aes(hour, total_rides, color="green")) +
geom_bar(stat = "identity", position = "dodge", fill="purple") + xlab("Hour") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90)) +
ggtitle("Financial District to Boston University - Number of Rides Per Hour") +
theme(legend.position="none")
source_destination_lyft_surge_2_adj_hour_longest_ride_plot
The previous plot (which included the five most expensive and least expensive routes on average) showed that the priciest route was from the Financial District to Boston University. Here, I filtered for this route looking at the total number of rides per hour. The plot shows that the highest number of rides (464) occurred during the 2 o’clock hour (PM).
The aforementioned price plot (which included the five most expensive and least expensive routes on average) showed that the least expensive route was from the Haymarket Square to North Shore. Here, I filtered for this route looking at the total number of rides per hour. The plot shows that the highest number of rides (453) occurred during the 1 o’clock hour (PM). Interestingly enough, the highest number of rides took place during the same time of the day for both the most expensive and least expensive route.
In this section, I separated the data in order to look at the different cab types (Uber, Lyft) individually. Specifically, I focused on all Uber data and the Lyft data where no surge (surge_multiplier = 1.00) was present.
#Uber Avg Price vs Route Info
avg_price_simulator_reduced_3 <- avg_price_simulator_reduced
source_destination_lyft_surge_3 <- avg_price_simulator_reduced_3 %>%
unite('Route', source:destination, remove = FALSE)
source_destination_lyft_surge_3_adj<- source_destination_lyft_surge_3 %>%
filter(cab_type == "Uber") %>%
group_by(Route) %>%
dplyr::summarize(total_rides = n(), mean_price = mean(price))
price_rides_route_SURGE <- source_destination_lyft_surge_3_adj %>%
arrange(desc(mean_price))
price_rides_route_desc_SURGE <- price_rides_route_SURGE[1:5,]
price_rides_route_asc_SURGE <- price_rides_route_SURGE[68:72,]
price_rides_route_top_lowest_SURGE <- rbind(price_rides_route_desc_SURGE, price_rides_route_asc_SURGE)
price_rides_route_top_lowest_SURGE_plot <- ggplot(data = price_rides_route_top_lowest_SURGE, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") +
ggtitle("Uber: Five Priciest and Five Least Priciest Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
price_rides_route_top_lowest_SURGE_plot
This graph shows that the most expensive Uber average cab route was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $6.08 per mile and the least expensive route being $20.17 per mile.
#Lyft Avg Price vs Route (no surge)
avg_price_simulator_reduced_4 <- avg_price_simulator_reduced
source_destination_lyft_surge_4 <- avg_price_simulator_reduced_4 %>%
unite('Route', source:destination, remove = FALSE)
source_destination_lyft_surge_4_adj<- source_destination_lyft_surge_4 %>%
filter(cab_type == "Lyft", surge_multiplier == 1) %>%
group_by(Route) %>%
dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_4_adj
## # A tibble: 72 × 3
## Route total_rides mean_price
## <chr> <int> <dbl>
## 1 Back Bay_Boston University 3634 13.7
## 2 Back Bay_Fenway 3682 13.3
## 3 Back Bay_Haymarket Square 3791 17.4
## 4 Back Bay_North End 4053 19.0
## 5 Back Bay_Northeastern University 3811 12.8
## 6 Back Bay_South Station 3814 15.4
## 7 Beacon Hill_Boston University 3758 16.4
## 8 Beacon Hill_Fenway 3855 16.2
## 9 Beacon Hill_Haymarket Square 3948 13.4
## 10 Beacon Hill_North End 4026 15.0
## # … with 62 more rows
lyft_no_surge_price_rides_route <- source_destination_lyft_surge_4_adj %>%
arrange(desc(mean_price))
lyft_no_surge_price_rides_route_asc <- lyft_no_surge_price_rides_route[1:5,]
lyft_no_surge_price_rides_route_desc <- lyft_no_surge_price_rides_route[68:72,]
lyft_no_surge_price_rides_route_top_lowest <- rbind(lyft_no_surge_price_rides_route_asc, lyft_no_surge_price_rides_route_desc)
lyft_no_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_no_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") +
ggtitle("Lyft (No Surge): Five Priciest and Five Least Priciest Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
lyft_no_surge_price_rides_route_top_lowest_plot
This graph shows that the most expensive Lyft average cab route was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $6.45 per mile and the least expensive route being $20.50 per mile.
The surge multiplier only applied to Lyft rides, so Uber data wasn’t considered in this analysis. In this section, I separated the surge multiplier into two groups (1.25 - 1.75) and (2.00 - 3.00) in order to examine the most expensive cab route on average with respect to the two surge groups.
#Lyft Avg Price vs Route (SURGE = 1.25-1.75)
avg_price_simulator_reduced_1.25.75 <- avg_price_simulator_reduced
source_destination_lyft_surge_1.25.75 <- avg_price_simulator_reduced_1.25.75 %>%
unite('Route', source:destination, remove = FALSE)
source_destination_lyft_surge_1.25.75_adj<- source_destination_lyft_surge_1.25.75 %>%
filter(cab_type == "Lyft", surge_multiplier >= 1.25, surge_multiplier <= 1.75) %>%
group_by(Route) %>%
dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_1.25.75_adj
## # A tibble: 72 × 3
## Route total_rides mean_price
## <chr> <int> <dbl>
## 1 Back Bay_Boston University 415 21.3
## 2 Back Bay_Fenway 435 21.2
## 3 Back Bay_Haymarket Square 350 28.7
## 4 Back Bay_North End 395 30.0
## 5 Back Bay_Northeastern University 370 20.2
## 6 Back Bay_South Station 325 24.9
## 7 Beacon Hill_Boston University 320 24.8
## 8 Beacon Hill_Fenway 325 25.4
## 9 Beacon Hill_Haymarket Square 240 20.7
## 10 Beacon Hill_North End 375 23.2
## # … with 62 more rows
lyft_1.25.75_surge_price_rides_route <- source_destination_lyft_surge_1.25.75_adj %>%
arrange(desc(mean_price))
lyft_1.25.75_surge_price_rides_route_asc <- lyft_1.25.75_surge_price_rides_route[1:5,]
lyft_1.25.75_surge_price_rides_route_desc <- lyft_1.25.75_surge_price_rides_route[68:72,]
lyft_1.25.75_surge_price_rides_route_top_lowest <- rbind(lyft_1.25.75_surge_price_rides_route_asc, lyft_1.25.75_surge_price_rides_route_desc)
lyft_1.25.75_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_1.25.75_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") +
ggtitle("1.25-1.75 Surge Data: Five Priciest and Five Least Priciest Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
lyft_1.25.75_surge_price_rides_route_top_lowest_plot
This graph shows that the most expensive route on average was from Fenway Park to the Financial District (4.4 miles) and the least expensive route was from Haymarket Square to North Station (0.5 miles). This translates to the most expensive route being $9.00 per hour and the least expensive being $35.40 per hour. The average surge multiplier in this group is 1.5, which makes sense as the most expensive route is roughly 1.5 times the ride price from the no surge data. However, the least expensive route is actually 1.75 times the ride price from the no surge data.
#Lyft Avg Price vs Route (SURGE = 2.00 - 3.00)
avg_price_simulator_reduced_2.3 <- avg_price_simulator_reduced
source_destination_lyft_surge_2.3 <- avg_price_simulator_reduced_2.3 %>%
unite('Route', source:destination, remove = FALSE)
source_destination_lyft_surge_2.3_adj<- source_destination_lyft_surge_2.3 %>%
filter(cab_type == "Lyft", surge_multiplier >= 2.00, surge_multiplier <= 3.00) %>%
group_by(Route) %>%
dplyr::summarize(total_rides = n(), mean_price = mean(price))
lyft_2.3_surge_price_rides_route <- source_destination_lyft_surge_2.3_adj %>%
arrange(desc(mean_price))
lyft_2.3_surge_price_rides_route_asc <- lyft_2.3_surge_price_rides_route[1:5,]
lyft_2.3_surge_price_rides_route_desc <- lyft_2.3_surge_price_rides_route[50:54,]
lyft_2.3_surge_price_rides_route_top_lowest <- rbind(lyft_2.3_surge_price_rides_route_asc, lyft_2.3_surge_price_rides_route_desc)
lyft_2.3_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_2.3_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") +
ggtitle("2.00-3.00 Surge: Five Priciest and Five Least Priciest Ride Price vs Cab Route") +
geom_text(aes(label=round(mean_price, 1)))
lyft_2.3_surge_price_rides_route_top_lowest_plot
This graph shows that the most expensive route on average was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $14.20 per hour and the least expensive being $42.50 per hour. The average surge multiplier in this group is 2.5, which means that the most expensive route is actually almost $2.00 cheaper than anticipated. Similarly, the least expensive route is $10 cheaper than anticipated. This can likely be explained by the difference in number of cab_rides. The first surge multiplier group contained 18,570 samples whereas this group only contained $2,405 samples. This difference well explains the variance in cab route price data.
#Surge-Day of Week-Total Rides Lyft
day_Hour_lyft_surge <- full_nov_dec_df %>%
filter(weekday3 != "NA", surge_multiplier > 1.00) %>%
group_by(day, weekday3, surge_multiplier) %>%
dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'day', 'weekday3'. You can override using
## the `.groups` argument.
day_Hour_lyft_surge_plot <- ggplot(data = day_Hour_lyft_surge, aes(x = weekday3, y = total_rides, fill = factor(surge_multiplier))) +
geom_bar(stat = "identity", position = "dodge") +
xlab("Day of the Week") + ylab("Total Rides") + ggtitle("Total Rides vs Day of the Week Per Surge Multiplier") +
labs(fill="Surge Multiplier")
day_Hour_lyft_surge_plot
This plot examined the total number of Lyft rides per day of the week and their respective surge multiplier. Regardless of surge multiplier, the highest number of rides occurred on Tuesday and Wednesday.
#Lyft - Surge Per Hour
lyft.uber.remove.na_surge <- lyft.uber.remove.na %>%
filter(cab_type == "Lyft", surge_multiplier > 1.00) %>%
dplyr::group_by(hour, surge_multiplier) %>%
dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
lyft.uber.remove.na_surge$surge_multiplier <- as.factor(lyft.uber.remove.na_surge$surge_multiplier)
lyft.uber.remove.na_surge_plot <- ggplot(lyft.uber.remove.na_surge, aes(hour, total_rides, color = surge_multiplier)) +
geom_point(alpha=0.8, size=1, aes(color = surge_multiplier)) +
geom_line(aes(color = surge_multiplier)) + ggtitle("Lyft: Total Rides vs Hour of the Day Per Surge Multiplier") +
facet_wrap(~surge_multiplier, ncol=1, scales="free") + xlab("Hour") + ylab("Total Rides") +
guides(color=guide_legend(ncol=1)) + theme(legend.position="none",
panel.border = element_blank(),
panel.spacing.x = unit(0,"line"))
lyft.uber.remove.na_surge_plot
This plot examined the total number of Lyft rides per hour of the day and their respective surge multiplier. Regardless of surge multiplier, the highest number of rides took place between 12 and 1 o’clock in the afternoon.
#Surge Rides Source - Lyft
source_destination_lyft_surge <- full_nov_dec_df %>%
filter(source != "NA", destination != "NA", surge_multiplier > 1.00) %>%
group_by(source, destination, surge_multiplier, weekday3) %>%
dplyr::summarize(total_rides = n()) %>%
arrange(desc(total_rides))
## `summarise()` has grouped output by 'source', 'destination',
## 'surge_multiplier'. You can override using the `.groups` argument.
source_destination_lyft_surge_combine <- source_destination_lyft_surge %>%
unite('Route', source:destination, remove = FALSE)
data_source_destination_route <- source_destination_lyft_surge_combine[with(source_destination_lyft_surge_combine,order(-total_rides)),]
data_source_destination_route_2 <- data_source_destination_route
total_rides_route <- ddply(data_source_destination_route_2,"Route",numcolwise(sum))
total_rides_route_desc <- total_rides_route %>%
arrange(desc(total_rides))
data_source_destination_route <- total_rides_route_desc[1:10,]
data_source_destination_route_lowest <- total_rides_route_desc[63:72,]
data_source_destination_route_up_plot <- ggplot(data = data_source_destination_route, aes(forcats::fct_reorder(Route, desc(total_rides)), total_rides)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Surge: Relationship Between Lyft Rides and Routes - Upper Bound") +
scale_y_continuous(breaks = c(100, 200, 300, 400, 500, 600))
data_source_destination_route_up_plot
This plot shows that the routes from Back Bay to Boston University and Fenway more often correlated with a surge in Lyft prices. In addition, rides originating in Back Bay constituted forty percent of this graph.
data_source_destination_route_lowest_lb_plot <- ggplot(data = data_source_destination_route_lowest, aes(forcats::fct_reorder(Route, desc(total_rides)), total_rides)) +
geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Surge: Relationship Between Lyft Rides and Routes - Lower Bound") +
scale_y_continuous(breaks = c(20, 40, 60, 80))
data_source_destination_route_lowest_lb_plot
This plot shows that routes from Haymarket Square to Beacon Hill less likely experienced a surge in Lyft prices. Furthermore, routes originating in Haymarket Square and North End constituted this entire graph with respect to less common surge prices.