Introduction

The dataset I used for this analysis was from Kaggle (https://www.kaggle.com/brllrb/uber-and-lyft-dataset-boston-ma), which was collected from 11-26-18 to 12-18-18 in Boston, MA. The data was gathered from various entities including Uber and Lyft.

Data Dive

Upon studying this dataset, I found that of the 637,976 rows of data, the split between Uber and Lyft rides was roughly 50/50. When coding, I searched for complete cases in order to avoid any NA values in the dataframe. I registered my own API with google in order to assist with making any desired maps.

lyft.uber.dataset <- read_csv("/Users/bethanyleach/Downloads/rideshare_kaggle.csv")
## Rows: 693071 Columns: 57
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (10): id, timezone, source, destination, cab_type, product_id, name, sh...
## dbl  (46): timestamp, hour, day, month, price, distance, surge_multiplier, l...
## dttm  (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
lyft.uber.revised <- as_tibble(lyft.uber.dataset)

lyft.uber.remove.na <- lyft.uber.revised[complete.cases(lyft.uber.revised), ]

register_google("AIzaSyBZx2Za3bJfe2OY0QxgPgef-4GX9jd61Pg")

Overview of Boston Neighborhoods:

There are twelve neighborhoods included in this analysis. Below, I’ve attached a labeled map of the neighborhoods, a density diagram, and a graph examining the average cab price per neighborhood route.

#labeled neighborhood map
boston3 <- boston_stamen

 lat <- c(42.364, 42.352, 42.3398, 42.3503, 42.3505, 42.3505, 42.3519, 42.3559, 42.3588, 42.3647, 42.3661, 42.3661)
 lon <- c(-71.060, -71.065, -71.0892, -71.0810, -71.1054, -71.1054, -71.0551, -71.0550, -71.0707, -71.0542, -71.0631, -71.0631)
 labels <- c("Haymarket Square", "Theatre District", "Northeastern University", "Back Bay", "Boston University",
             "Fenway", "South Station", "Financial District", "Beacon Hill", "North End", "North Station", "West End")

 combined_df <- data.frame(lat, lon, labels)
 neighborhood_map_adj <- ggmap(boston3) + ggtitle("Labeled Map of Neighborhoods in Boston")+
   geom_point(data=combined_df, aes(x=lon, y=lat), 
                                                     size=0.5, alpha =0.5, fill="red") + labs(x = 'Longitude', y = 'Latitude') +
                                                     geom_label_repel(data=combined_df, aes(x = lon, y= lat,
                                                                                            label = labels), fill = "white",
                                                                                            box.padding = unit(.4, "lines"),
                                                                                            label.padding = unit(.15, "lines"),
                                                                                            segment.color = "red", segment.size = 1)

neighborhood_map_adj
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf
## Warning in min(x): no non-missing arguments to min; returning Inf
## Warning in max(x): no non-missing arguments to max; returning -Inf

stat_density_boston_map <- ggmap(boston3) + stat_density_2d(data=lyft.uber.remove.na, aes(x=longitude, y=latitude, fill = stat(level)), geom="polygon") +
        ggtitle("Density Map of Cab Rides in Boston Neighborhoods")
stat_density_boston_map
## Warning: Removed 81570 rows containing non-finite values (stat_density2d).

Map Explanation

I used both ggmap and get_stamenmap to create the labeled map featured above. I used latitude and longitude to clearly indicate where each neighborhood is located.

This densitymap outlines the density of total rides in each of the neighborhoods. The lighter blue indicates a larger number of rides.

#total rides by hour and day
my_months_name <- month.name[lyft.uber.remove.na$month] 

lyft.uber.remove.na_months <- lyft.uber.remove.na %>%
        mutate(month_name = my_months_name)

uber_lyft_rides_per_day_november <- lyft.uber.remove.na_months %>%
        filter(month_name == "November")

uber_lyft_rides_per_day_december <- lyft.uber.remove.na_months %>%
        filter(month_name == "December")
#separating datetime column in month being November
uber_lyft_rides_per_day_november$Date <- as.Date(uber_lyft_rides_per_day_november$datetime)
uber_lyft_rides_per_day_november$Time <- format(as.POSIXct(uber_lyft_rides_per_day_november$datetime), 
                                                format = "%H:%M:%S") 
unique(uber_lyft_rides_per_day_november$day)
## [1] 27 28 30 29 26
begin_nov <- as.POSIXct("2018-11-26")
finish_nov <- as.POSIXct("2018-11-30")

dat3 <- data.frame(Date = seq.POSIXt(from = begin_nov, to = finish_nov, by ="DSTday"))

dat3$weekday1 <- as.numeric(format(dat3$Date, format = "%u"))
dat3$weekday2 <- format(dat3$Date, format = "%a")
dat3$weekday3 <- format(dat3$Date, format = "%A")

uber_lyft_november_merge_dates <- uber_lyft_rides_per_day_november %>%
        full_join(dat3, by="Date")

uber_lyft_november_merge_dates$weekday3 <- ordered(uber_lyft_november_merge_dates$weekday3, 
                                                   levels=c("Monday","Tuesday", "Wednesday", 
                                                            "Thursday", "Friday", "Saturday", "Sunday"))

uber_lyft_november_merge_dates <- uber_lyft_november_merge_dates %>%
        select(-datetime)
uber_lyft_november_merge_dates
## # A tibble: 263,771 × 62
##    id           timestamp  hour   day month timezone source destination cab_type
##    <chr>            <dbl> <dbl> <dbl> <dbl> <chr>    <chr>  <chr>       <chr>   
##  1 4bd23055-68…    1.54e9     2    27    11 America… Hayma… North Stat… Lyft    
##  2 981a3613-77…    1.54e9     1    28    11 America… Hayma… North Stat… Lyft    
##  3 c2d88af2-d2…    1.54e9     4    30    11 America… Hayma… North Stat… Lyft    
##  4 e0126e1f-8c…    1.54e9     3    29    11 America… Hayma… North Stat… Lyft    
##  5 462816a3-82…    1.54e9     5    26    11 America… Back … Northeaste… Lyft    
##  6 8612d909-98…    1.54e9    10    27    11 America… Back … Northeaste… Lyft    
##  7 9043bf77-1d…    1.54e9    16    30    11 America… Back … Northeaste… Lyft    
##  8 d859ec69-b3…    1.54e9    19    28    11 America… Back … Northeaste… Lyft    
##  9 009e9c53-07…    1.54e9    22    30    11 America… North… West End    Uber    
## 10 e219e545-a0…    1.54e9    19    29    11 America… North… West End    Uber    
## # … with 263,761 more rows, and 53 more variables: product_id <chr>,
## #   name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## #   latitude <dbl>, longitude <dbl>, temperature <dbl>,
## #   apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## #   precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## #   windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## #   temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
uber_lyft_november_merge_dates <- uber_lyft_november_merge_dates %>%
        filter(source != "NA")
unique(uber_lyft_november_merge_dates)
## # A tibble: 263,771 × 62
##    id           timestamp  hour   day month timezone source destination cab_type
##    <chr>            <dbl> <dbl> <dbl> <dbl> <chr>    <chr>  <chr>       <chr>   
##  1 4bd23055-68…    1.54e9     2    27    11 America… Hayma… North Stat… Lyft    
##  2 981a3613-77…    1.54e9     1    28    11 America… Hayma… North Stat… Lyft    
##  3 c2d88af2-d2…    1.54e9     4    30    11 America… Hayma… North Stat… Lyft    
##  4 e0126e1f-8c…    1.54e9     3    29    11 America… Hayma… North Stat… Lyft    
##  5 462816a3-82…    1.54e9     5    26    11 America… Back … Northeaste… Lyft    
##  6 8612d909-98…    1.54e9    10    27    11 America… Back … Northeaste… Lyft    
##  7 9043bf77-1d…    1.54e9    16    30    11 America… Back … Northeaste… Lyft    
##  8 d859ec69-b3…    1.54e9    19    28    11 America… Back … Northeaste… Lyft    
##  9 009e9c53-07…    1.54e9    22    30    11 America… North… West End    Uber    
## 10 e219e545-a0…    1.54e9    19    29    11 America… North… West End    Uber    
## # … with 263,761 more rows, and 53 more variables: product_id <chr>,
## #   name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## #   latitude <dbl>, longitude <dbl>, temperature <dbl>,
## #   apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## #   precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## #   windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## #   temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
daily_number_rides_st_nov <- ggplot(uber_lyft_november_merge_dates, aes(source, fill=weekday3)) + geom_bar(position="dodge") + 
        scale_y_continuous(labels=comma) + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) + 
        ggtitle("Daily Number of Total Rides Per Station - November") + scale_fill_discrete(name = "Day of the Week") +
        xlab("Station") + ylab("Total Rides")

uber_lyft_november_merge_dates_imp_info<- uber_lyft_november_merge_dates %>%
        dplyr::group_by(Date, weekday1, weekday2, weekday3) %>%
        dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'Date', 'weekday1', 'weekday2'. You can
## override using the `.groups` argument.
uber_lyft_november_merge_dates_imp_info
## # A tibble: 5 × 5
## # Groups:   Date, weekday1, weekday2 [5]
##   Date                weekday1 weekday2 weekday3  total_rides
##   <dttm>                 <dbl> <chr>    <ord>           <int>
## 1 2018-11-26 00:00:00        1 Mon      Monday          29028
## 2 2018-11-27 00:00:00        2 Tue      Tuesday         70135
## 3 2018-11-28 00:00:00        3 Wed      Wednesday       67842
## 4 2018-11-29 00:00:00        4 Thu      Thursday        55222
## 5 2018-11-30 00:00:00        5 Fri      Friday          41544
#separating datetime column in month being December
uber_lyft_rides_per_day_december_altered_date <- uber_lyft_rides_per_day_december 

uber_lyft_rides_per_day_december_altered_date$Date <- as.Date(uber_lyft_rides_per_day_december_altered_date$datetime)
uber_lyft_rides_per_day_december_altered_date$Time <- format(as.POSIXct(uber_lyft_rides_per_day_december_altered_date$datetime), 
                                                             format = "%H:%M:%S")

begin <- as.POSIXct("2018-12-01")
finish <- as.POSIXct("2018-12-18")
dat2 <- data.frame(Date = seq.POSIXt(from = begin, to = finish, by ="DSTday"))

dat2$weekday1 <- as.numeric(format(dat2$Date, format = "%u"))
dat2$weekday2 <- format(dat2$Date, format = "%a")
dat2$weekday3 <- format(dat2$Date, format = "%A")

uber_lyft_december_merge_dates <- uber_lyft_rides_per_day_december_altered_date %>%
        full_join(dat2, by="Date")

uber_lyft_december_merge_dates$weekday3 <- ordered(uber_lyft_december_merge_dates$weekday3, 
                                                   levels=c("Monday","Tuesday", "Wednesday", 
                                                            "Thursday", "Friday", "Saturday", "Sunday"))

uber_lyft_december_merge_dates <- uber_lyft_december_merge_dates %>%
        select(-datetime)
uber_lyft_december_merge_dates
## # A tibble: 374,211 × 62
##    id           timestamp  hour   day month timezone source destination cab_type
##    <chr>            <dbl> <dbl> <dbl> <dbl> <chr>    <chr>  <chr>       <chr>   
##  1 424553bb-71…    1.54e9     9    16    12 America… Hayma… North Stat… Lyft    
##  2 f6f6d7e4-3e…    1.55e9    18    17    12 America… Hayma… North Stat… Lyft    
##  3 474d6376-bc…    1.54e9    19     2    12 America… Back … Northeaste… Lyft    
##  4 4f9fee41-fd…    1.54e9     6     3    12 America… Back … Northeaste… Lyft    
##  5 23f145da-f0…    1.54e9    10    13    12 America… North… West End    Uber    
##  6 357559cb-8c…    1.54e9    19    13    12 America… North… West End    Uber    
##  7 50ef1165-9d…    1.55e9    23    16    12 America… North… West End    Uber    
##  8 91c4861c-17…    1.54e9     0    14    12 America… North… West End    Uber    
##  9 18d580ac-c9…    1.54e9     6    16    12 America… North… Haymarket … Lyft    
## 10 5ef44fdf-c5…    1.55e9    11    18    12 America… North… Haymarket … Lyft    
## # … with 374,201 more rows, and 53 more variables: product_id <chr>,
## #   name <chr>, price <dbl>, distance <dbl>, surge_multiplier <dbl>,
## #   latitude <dbl>, longitude <dbl>, temperature <dbl>,
## #   apparentTemperature <dbl>, short_summary <chr>, long_summary <chr>,
## #   precipIntensity <dbl>, precipProbability <dbl>, humidity <dbl>,
## #   windSpeed <dbl>, windGust <dbl>, windGustTime <dbl>, visibility <dbl>,
## #   temperatureHigh <dbl>, temperatureHighTime <dbl>, temperatureLow <dbl>, …
uber_lyft_december_merge_dates <- uber_lyft_december_merge_dates %>%
        filter(source != "NA")
unique(uber_lyft_december_merge_dates$source)
##  [1] "Haymarket Square"        "Back Bay"               
##  [3] "North End"               "North Station"          
##  [5] "Beacon Hill"             "Boston University"      
##  [7] "Fenway"                  "South Station"          
##  [9] "Theatre District"        "West End"               
## [11] "Financial District"      "Northeastern University"
daily_number_rides_st_dec <- ggplot(uber_lyft_december_merge_dates, aes(source, fill=weekday3)) + geom_bar(position="dodge") + 
        scale_y_continuous(labels=comma) + theme(axis.text.x = element_text(angle=90, vjust=.5, hjust=1)) + 
        ggtitle("Daily Number of Total Rides Per Station - December") + xlab("Station") + ylab("Total Rides") +
        scale_fill_discrete(name = "Day of the Week")

uber_lyft_december_merge_dates_imp_info<- uber_lyft_december_merge_dates %>%
        dplyr::group_by(Date, weekday1, weekday2, weekday3) %>%
        dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'Date', 'weekday1', 'weekday2'. You can
## override using the `.groups` argument.
uber_lyft_december_merge_dates_imp_info
## # A tibble: 12 × 5
## # Groups:   Date, weekday1, weekday2 [12]
##    Date                weekday1 weekday2 weekday3 total_rides
##    <dttm>                 <dbl> <chr>    <ord>          <int>
##  1 2018-12-01 00:00:00        6 Sat      Saturday       41680
##  2 2018-12-02 00:00:00        7 Sun      Sunday         41298
##  3 2018-12-03 00:00:00        1 Mon      Monday         41323
##  4 2018-12-04 00:00:00        2 Tue      Tuesday        11627
##  5 2018-12-09 00:00:00        7 Sun      Sunday          1529
##  6 2018-12-10 00:00:00        1 Mon      Monday          2534
##  7 2018-12-13 00:00:00        4 Thu      Thursday       35496
##  8 2018-12-14 00:00:00        5 Fri      Friday         41344
##  9 2018-12-15 00:00:00        6 Sat      Saturday       41332
## 10 2018-12-16 00:00:00        7 Sun      Sunday         41359
## 11 2018-12-17 00:00:00        1 Mon      Monday         41354
## 12 2018-12-18 00:00:00        2 Tue      Tuesday        33329
full_nov_dec_df <- rbind(uber_lyft_november_merge_dates, uber_lyft_december_merge_dates)

day_Hour <- full_nov_dec_df %>%
        filter(hour != "NA") %>%
        group_by(day, weekday3, hour) %>%
        dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'day', 'weekday3'. You can override using
## the `.groups` argument.
day_hour_week_plto <- ggplot(day_Hour, aes(hour, total_rides, fill=weekday3)) + geom_bar(stat = "identity") + 
        ggtitle("Rides by Hour and Day of the Week") + 
        theme(plot.title = element_text(hjust=0.5)) +
        scale_fill_discrete(name = "Day of the Week") + ylab("Total Rides") + xlab("Hour")

II. Rides per Day of the Week and per Hour:

This section first examines the distribution of total number of cab rides per source and then per hour.

#daily number of rides per station 
daily_rides_data <- full_nov_dec_df %>%
        group_by(source, hour, weekday3) %>%
        dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'source', 'hour'. You can override using
## the `.groups` argument.
daily_num_rides_station <- ggplot(daily_rides_data, aes(source, total_rides)) + 
        geom_bar(stat = "identity", position = "dodge", aes(fill = weekday3)) + 
        theme(axis.text.x = element_text(angle=90)) + xlab("Source") + ylab("Total Rides") +
        ggtitle("Total Number of Rides pet Day of the Week by Source") +
        scale_fill_discrete(name = "Day of the Week")

Explanation

This graph clearly shows that regardless of source, the largest number of cab rides took place on Monday and Tuesday. Specifically, Tuesday tended to have even more cab rides than Monday.

#total number of rides by hour 
hour_rides_data <- lyft.uber.remove.na %>%
        dplyr::group_by(hour) %>%
        dplyr::summarize(total_rides = n())
datatable(hour_rides_data)
hour_rides_data
## # A tibble: 24 × 2
##     hour total_rides
##    <dbl>       <int>
##  1     0       29872
##  2     1       26310
##  3     2       26323
##  4     3       25530
##  5     4       26125
##  6     5       22995
##  7     6       25147
##  8     7       22930
##  9     8       22337
## 10     9       26673
## # … with 14 more rows
total_ride_per_hr_plot <- ggplot(hour_rides_data, aes(hour, total_rides)) + 
        geom_bar(stat="identity", fill="blue", color="orange") + 
        theme(legend.position="none") + ggtitle("Total Rides Per Hour") + 
        scale_y_continuous(label=comma) + ylab("Total Rides") + 
        theme(plot.title = element_text(hjust=0.5))

total_ride_per_hr_plot

Explanation

This graph shows that the majority of rides occurred between 11PM and 12AM. The second highest number of rides occurred between 5PM and 6PM.

#total lyft rides by hour
hour_lyft_data <- lyft.uber.remove.na %>%
        filter(cab_type == "Lyft") %>%
        dplyr::group_by(hour) %>%
        dplyr::summarize(total_rides = n())

hour_lyft_data_plot <- ggplot(hour_lyft_data, aes(hour, total_rides)) + 
        geom_bar(stat="identity", fill="blue", color="orange") + 
        theme(legend.position="none") + ggtitle("Lyft Rides Per Hour") + 
        scale_y_continuous(labels=comma) + theme(plot.title = element_text(hjust=0.5)) + 
        xlab("Hour") + ylab("Total Rides")

hour_lyft_data_plot

Explanation

Similarly, this graph shows that the majority of rides occurred between 11PM and 12AM. The second highest number of rides occurred at 5PM.

#total uber rides by hour
hour_uber_data <- lyft.uber.remove.na %>%
        filter(cab_type == "Uber") %>%
        dplyr::group_by(hour) %>%
        dplyr::summarize(total_rides = n())

hour_uber_data_plot <- ggplot(hour_uber_data, aes(hour, total_rides)) + 
        geom_bar(stat="identity", fill="blue", color="orange") + 
        theme(legend.position="none") + ggtitle("Uber Rides Per Hour") + 
        scale_y_continuous(labels=comma) + theme(plot.title = element_text(hjust=0.5)) +
        xlab("Hour") + ylab("Total Rides")

hour_uber_data_plot

Explanation

This graph shows the majority of rides occurred between 11PM and 12AM.

#Price vs hour - uber - distance/price to start
lyft.uber.price.vs.hour <- lyft.uber.remove.na %>%
        group_by(price, distance) %>%
        dplyr::select(-apparentTemperatureMaxTime, -apparentTemperatureMax, -apparentTemperatureMinTime, -apparentTemperatureMin,
                      -temperatureMaxTime, -temperatureMax, -temperatureMinTime, -temperatureMin, -uvIndexTime, -precipIntensityMax,
                      -moonPhase, -sunsetTime, -sunriseTime, -ozone, -visibility.1, -uvIndex, -cloudCover, -windBearing, -pressure, 
                      -dewPoint, -icon, -apparentTemperatureLowTime, -apparentTemperatureLow, -apparentTemperatureHighTime, 
                      -apparentTemperatureHigh, -temperatureLowTime, -temperatureLow, -temperatureHighTime, -temperatureHigh, -visibility,
                      -windGustTime, -windGust, -windSpeed, -humidity, -precipProbability, -precipIntensity, -long_summary, -short_summary,
                      -apparentTemperature) %>%
        filter(surge_multiplier <= 1) 

lyft.uber.avg.dist.price <- lyft.uber.price.vs.hour %>%
        group_by(distance, cab_type) %>%
        summarize_at(vars(price), list(name = mean))

lyft.uber.avg.dist.price_plot <- ggplot(lyft.uber.avg.dist.price, aes(distance, name, color=cab_type)) + geom_line() + ggtitle("Average Price by Distance for both Lyft and Uber")+
        scale_fill_discrete("Type of Cab") + xlab("Distance") + ylab("Average Price")

lyft.uber.avg.dist.hour <- lyft.uber.price.vs.hour %>%
        group_by(hour) %>%
        summarize_at(vars(distance), list(name = mean))

lyft.uber.avg.dist.hour_plot <- ggplot(lyft.uber.avg.dist.hour, aes(hour, name)) + geom_line() + ggtitle("Average Ride Distance by Hour") + xlab("Hour of the Day") + 
        ylab("Average Distance (Miles)")

lyft.uber.avg.dist.hour_plot

Explanation

This graph shows that on average, the longest ride distance took place at roughly 3PM and 9PM. The next longest ride distance intervals were at 7:30AM and 10AM. The points all make sense because the morning data correlates to the early commute and the evening data correlates to the latter commute.

#Lyft.Uber.TotalRides vs Temperature 
temp_rides_data <- lyft.uber.remove.na %>%
        dplyr::group_by(temperature, cab_type) %>%
        dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'temperature'. You can override using the
## `.groups` argument.
total_rides_data_plot <- ggplot(temp_rides_data, aes(temperature, total_rides, color=cab_type)) + 
        geom_line() +
        theme(legend.position="right") + ggtitle("Total Rides vs Temperature (F)") + 
        ylab("Total Rides") + 
        theme(plot.title = element_text(hjust=0.5)) + xlim(30,50) + ylim(0,2000)

total_rides_data_plot
## Warning: Removed 96 row(s) containing missing values (geom_path).

wind_gust_rides_data <- lyft.uber.remove.na %>%
        dplyr::group_by(windGust, temperature, cab_type) %>%
        dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'windGust', 'temperature'. You can override
## using the `.groups` argument.

Explanation

I narrowed the temperature range of this graph in order to more accurately show the spikes in the number of cab rides. Regardless of cab type (Uber or Lyft), there were similar major surges and drops in the number of cab rides right around 35, 38, and 42 degrees. There were surges in the number of both Uber and Lyft rides ordered between 43 and 47 degrees Fahrenheit.

#Lyft.Uber_Wind_Gust
wind_gust_rides_data_plot <- ggplot(wind_gust_rides_data, aes(temperature, total_rides, color=windGust)) + 
        geom_line() + geom_point() +
        theme(legend.position="right") + ggtitle("Total Rides vs Temperature (F) Colored by Wind Gust (mph") + 
        ylab("Total Rides") + xlab("Temperature") +
        theme(plot.title = element_text(hjust=0.5)) + xlim(30,50) + ylim(0,2000)

wind_gust_rides_data_plot
## Warning: Removed 100 row(s) containing missing values (geom_path).
## Warning: Removed 131 rows containing missing values (geom_point).

cor(wind_gust_rides_data$windGust, wind_gust_rides_data$total_rides)
## [1] 0.2456817

Explanation

To explore the temperature and total rides relationship further, I made another plot looking again at temperature vs rides, but colored by wind gust speed (mph). The higher speeds correlate to a lighter blue color and these surges in wind gust speeds fall between 42 and 45 degrees Fahrenheit, which was when there was an increase in both Uber and Lyft cab rides in the previous plot. In other words, this means that there is a positive correlation between an increase in rides and higher wind gusts although it is small (0.246).

lyft.uber.avg.dist.price_plot

Explanation

This graph shows that regardless of distance traveled, riders prefer to use Uber over Lyft.

III. Lyft/Uber Prices (Correlation):

Before diving into the analysis, I wanted to determine if there was an obvious correlation between distance and price for both types of cabs. I noticed that the surge multiplier only applied to Lyfts, so I made two graphs. The first (Uber) was a correlation plot between distance and price and the second (Lyft) was a correlation plot between distance, price, and surge multiplier.

lyft.uber.remove.na_LYFT <- lyft.uber.remove.na %>%
        filter(cab_type == "Lyft")
mydata_LYFT <- lyft.uber.remove.na_LYFT[, c(13,14,15)]
corrplot(cor(mydata_LYFT), tl.col = "brown", tl.srt = 30, bg = "White",
         title = "\n\n Lyft: Correlation Between Price, Distance, and Surge Multiplier",
         type = "full")
cor(lyft.uber.remove.na_LYFT[, c(13,14,15)])
##                      price   distance surge_multiplier
## price            1.0000000 0.36156662       0.30823712
## distance         0.3615666 1.00000000       0.04007585
## surge_multiplier 0.3082371 0.04007585       1.00000000
lyft.uber.remove.na_LYFT <- lyft.uber.remove.na %>%
        filter(cab_type == "Lyft")
mydata_LYFT <- lyft.uber.remove.na_LYFT[, c(13,14,15)]
corrplot(cor(mydata_LYFT), tl.col = "brown", tl.srt = 30, bg = "White",
         title = "\n\n Lyft: Correlation Between Price, Distance, and Surge Multiplier",
         type = "full")

cor(lyft.uber.remove.na_LYFT[, c(13,14,15)])
##                      price   distance surge_multiplier
## price            1.0000000 0.36156662       0.30823712
## distance         0.3615666 1.00000000       0.04007585
## surge_multiplier 0.3082371 0.04007585       1.00000000

Explanation

In addition to creating these correlation plots, I also found the direct correlation data for each cab type. Looking at the Uber graph, price and distance are weakly correlated with an exact correlation value of 0.3362. The Lyft graph shows that price and distance too aren’t strongly correlated (correlation value of 0.3616) and similarly surge_multiplier and price are weakly correlated (correlation value of 0.3082) and surge_multiplier and distance are even more weakly correlated (correlation value of 0.04008).

IV. Relationship Between Route and Number of Cab Rides:

I was interested in seeing if there was a correlation between a cab route and the total number of rides. This section includes all available ride data for both Uber and Lyft. With respect to Lyft, this means that I’ve included both surge and non-surge data.

#avg price by source & neighborhood map, Both cab types, Both Surge and no Surge
avg_price_simulator <- full_nov_dec_df
avg_price_simulator <- avg_price_simulator[-c(1:2, 4:6, 10:11, 15:50)]
avg_price_simulator_reduced <- avg_price_simulator[-c(8:18)]

avg_price_simulator_reduced_2 <- avg_price_simulator_reduced 

source_destination_lyft_surge_2 <- avg_price_simulator_reduced_2 %>%
        unite('Route', source:destination, remove = FALSE)

source_destination_lyft_surge_2_adj<- source_destination_lyft_surge_2 %>%
        group_by(Route) %>%
        dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_2_adj
## # A tibble: 72 × 3
##    Route                            total_rides mean_price
##    <chr>                                  <int>      <dbl>
##  1 Back Bay_Boston University              8682       14.0
##  2 Back Bay_Fenway                         8718       13.7
##  3 Back Bay_Haymarket Square               8838       18.0
##  4 Back Bay_North End                      9414       19.5
##  5 Back Bay_Northeastern University        8825       13.2
##  6 Back Bay_South Station                  8724       17.7
##  7 Beacon Hill_Boston University           8490       16.4
##  8 Beacon Hill_Fenway                      8688       16.2
##  9 Beacon Hill_Haymarket Square            8802       13.8
## 10 Beacon Hill_North End                   9240       15.3
## # … with 62 more rows
price_rides_route <- source_destination_lyft_surge_2_adj %>%
        arrange(desc(mean_price))
price_rides_route_desc <- price_rides_route[1:5,]
price_rides_route_asc <- price_rides_route[68:72,]
price_rides_route_top_lowest <- rbind(price_rides_route_desc, price_rides_route_asc)

price_rides_route_desc_plot <- ggplot(data = price_rides_route_desc, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90)) + ggtitle("Mean Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

gprice_rides_route_top_lowest_plot <- ggplot(data = price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Five Priciest and Five Least Priciest Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

gprice_rides_route_top_lowest_plot

Explanation

Above is a graph of the top ten mean cab rides in the aforementioned Boston neighborhoods. This plot shows that the Financial District to Boston University route has the highest average cab ride price. Inversely, the Haymarket Square to North Station route has the lowest average cab ride price. This is reasonable as the distance between the Financial District and Boston University is 4 miles whereas the distance between Haymarket Square and North Station is 0.5 miles. However, this translates to the most expensive ride equating to $6.38 per mile and the least expensive ride being a whopping $24.60 per mile.

#hour of day shortest ride and longest ride
#previous graph shows most expensive avg ride being financial district to boston university
#and least expensive being both Cab Types
source_destination_lyft_surge_2_adj_hour_shortest_ride <- source_destination_lyft_surge_2 %>%
        filter(Route=="Haymarket Square_North Station") %>%
        group_by(hour) %>%
        dplyr::summarize(total_rides = n())

source_destination_lyft_surge_2_adj_hour_longest_ride <- source_destination_lyft_surge_2 %>%
        filter(Route=="Financial District_Boston University") %>%
        group_by(hour) %>%
        dplyr::summarize(total_rides = n())

longest_shortest_combined <- rbind(source_destination_lyft_surge_2_adj_hour_shortest_ride,
                                   source_destination_lyft_surge_2_adj_hour_longest_ride)

source_destination_lyft_surge_2_adj_hour_shortest_ride_plot <- ggplot(data = source_destination_lyft_surge_2_adj_hour_shortest_ride, aes(hour, total_rides, color="green")) +
        geom_bar(stat = "identity", position = "dodge", fill="purple") + xlab("Hour") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + 
        ggtitle("Haymarket Sqare to North Station - Number of Rides Per Hour")

source_destination_lyft_surge_2_adj_hour_shortest_ride_plot

source_destination_lyft_surge_2_adj_hour_longest_ride_plot <- ggplot(data = source_destination_lyft_surge_2_adj_hour_longest_ride, aes(hour, total_rides, color="green")) +
        geom_bar(stat = "identity", position = "dodge", fill="purple") + xlab("Hour") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90)) + 
        ggtitle("Financial District to Boston University - Number of Rides Per Hour") +
        theme(legend.position="none")

source_destination_lyft_surge_2_adj_hour_longest_ride_plot

Explanation

The previous plot (which included the five most expensive and least expensive routes on average) showed that the priciest route was from the Financial District to Boston University. Here, I filtered for this route looking at the total number of rides per hour. The plot shows that the highest number of rides (464) occurred during the 2 o’clock hour (PM).

The aforementioned price plot (which included the five most expensive and least expensive routes on average) showed that the least expensive route was from the Haymarket Square to North Shore. Here, I filtered for this route looking at the total number of rides per hour. The plot shows that the highest number of rides (453) occurred during the 1 o’clock hour (PM). Interestingly enough, the highest number of rides took place during the same time of the day for both the most expensive and least expensive route.

V. Relationship Between Route and Number of Cab Rides:

In this section, I separated the data in order to look at the different cab types (Uber, Lyft) individually. Specifically, I focused on all Uber data and the Lyft data where no surge (surge_multiplier = 1.00) was present.

#Uber Avg Price vs Route Info
avg_price_simulator_reduced_3 <- avg_price_simulator_reduced
source_destination_lyft_surge_3 <- avg_price_simulator_reduced_3 %>%
        unite('Route', source:destination, remove = FALSE)

source_destination_lyft_surge_3_adj<- source_destination_lyft_surge_3 %>%
        filter(cab_type == "Uber") %>%
        group_by(Route) %>%
        dplyr::summarize(total_rides = n(), mean_price = mean(price))

price_rides_route_SURGE <- source_destination_lyft_surge_3_adj %>%
        arrange(desc(mean_price))

price_rides_route_desc_SURGE <- price_rides_route_SURGE[1:5,]
price_rides_route_asc_SURGE <- price_rides_route_SURGE[68:72,]
price_rides_route_top_lowest_SURGE <- rbind(price_rides_route_desc_SURGE, price_rides_route_asc_SURGE)


price_rides_route_top_lowest_SURGE_plot <- ggplot(data = price_rides_route_top_lowest_SURGE, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + 
        ggtitle("Uber: Five Priciest and Five Least Priciest Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

price_rides_route_top_lowest_SURGE_plot

Explanation

This graph shows that the most expensive Uber average cab route was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $6.08 per mile and the least expensive route being $20.17 per mile.

#Lyft Avg Price vs Route (no surge)
avg_price_simulator_reduced_4 <- avg_price_simulator_reduced
source_destination_lyft_surge_4 <- avg_price_simulator_reduced_4 %>%
        unite('Route', source:destination, remove = FALSE)

source_destination_lyft_surge_4_adj<- source_destination_lyft_surge_4 %>%
        filter(cab_type == "Lyft", surge_multiplier == 1) %>%
        group_by(Route) %>%
        dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_4_adj
## # A tibble: 72 × 3
##    Route                            total_rides mean_price
##    <chr>                                  <int>      <dbl>
##  1 Back Bay_Boston University              3634       13.7
##  2 Back Bay_Fenway                         3682       13.3
##  3 Back Bay_Haymarket Square               3791       17.4
##  4 Back Bay_North End                      4053       19.0
##  5 Back Bay_Northeastern University        3811       12.8
##  6 Back Bay_South Station                  3814       15.4
##  7 Beacon Hill_Boston University           3758       16.4
##  8 Beacon Hill_Fenway                      3855       16.2
##  9 Beacon Hill_Haymarket Square            3948       13.4
## 10 Beacon Hill_North End                   4026       15.0
## # … with 62 more rows
lyft_no_surge_price_rides_route <- source_destination_lyft_surge_4_adj %>%
        arrange(desc(mean_price))

lyft_no_surge_price_rides_route_asc <- lyft_no_surge_price_rides_route[1:5,]
lyft_no_surge_price_rides_route_desc <- lyft_no_surge_price_rides_route[68:72,]
lyft_no_surge_price_rides_route_top_lowest <- rbind(lyft_no_surge_price_rides_route_asc, lyft_no_surge_price_rides_route_desc)

lyft_no_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_no_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + 
        ggtitle("Lyft (No Surge): Five Priciest and Five Least Priciest Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

lyft_no_surge_price_rides_route_top_lowest_plot

Explanation

This graph shows that the most expensive Lyft average cab route was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $6.45 per mile and the least expensive route being $20.50 per mile.

VI. Lyft: Relationship Between Surge Multiplier, Price, and Total Rides

The surge multiplier only applied to Lyft rides, so Uber data wasn’t considered in this analysis. In this section, I separated the surge multiplier into two groups (1.25 - 1.75) and (2.00 - 3.00) in order to examine the most expensive cab route on average with respect to the two surge groups.

#Lyft Avg Price vs Route (SURGE = 1.25-1.75)
avg_price_simulator_reduced_1.25.75 <- avg_price_simulator_reduced
source_destination_lyft_surge_1.25.75 <- avg_price_simulator_reduced_1.25.75 %>%
        unite('Route', source:destination, remove = FALSE)

source_destination_lyft_surge_1.25.75_adj<- source_destination_lyft_surge_1.25.75 %>%
        filter(cab_type == "Lyft", surge_multiplier >= 1.25, surge_multiplier <= 1.75) %>%
        group_by(Route) %>%
        dplyr::summarize(total_rides = n(), mean_price = mean(price))
source_destination_lyft_surge_1.25.75_adj
## # A tibble: 72 × 3
##    Route                            total_rides mean_price
##    <chr>                                  <int>      <dbl>
##  1 Back Bay_Boston University               415       21.3
##  2 Back Bay_Fenway                          435       21.2
##  3 Back Bay_Haymarket Square                350       28.7
##  4 Back Bay_North End                       395       30.0
##  5 Back Bay_Northeastern University         370       20.2
##  6 Back Bay_South Station                   325       24.9
##  7 Beacon Hill_Boston University            320       24.8
##  8 Beacon Hill_Fenway                       325       25.4
##  9 Beacon Hill_Haymarket Square             240       20.7
## 10 Beacon Hill_North End                    375       23.2
## # … with 62 more rows
lyft_1.25.75_surge_price_rides_route <- source_destination_lyft_surge_1.25.75_adj %>%
        arrange(desc(mean_price))

lyft_1.25.75_surge_price_rides_route_asc <- lyft_1.25.75_surge_price_rides_route[1:5,]
lyft_1.25.75_surge_price_rides_route_desc <- lyft_1.25.75_surge_price_rides_route[68:72,]
lyft_1.25.75_surge_price_rides_route_top_lowest <- rbind(lyft_1.25.75_surge_price_rides_route_asc, lyft_1.25.75_surge_price_rides_route_desc)


lyft_1.25.75_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_1.25.75_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + 
        ggtitle("1.25-1.75 Surge Data: Five Priciest and Five Least Priciest Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

lyft_1.25.75_surge_price_rides_route_top_lowest_plot

Explanation

This graph shows that the most expensive route on average was from Fenway Park to the Financial District (4.4 miles) and the least expensive route was from Haymarket Square to North Station (0.5 miles). This translates to the most expensive route being $9.00 per hour and the least expensive being $35.40 per hour. The average surge multiplier in this group is 1.5, which makes sense as the most expensive route is roughly 1.5 times the ride price from the no surge data. However, the least expensive route is actually 1.75 times the ride price from the no surge data.

#Lyft Avg Price vs Route (SURGE = 2.00 - 3.00)
avg_price_simulator_reduced_2.3 <- avg_price_simulator_reduced
source_destination_lyft_surge_2.3 <- avg_price_simulator_reduced_2.3 %>%
        unite('Route', source:destination, remove = FALSE)

source_destination_lyft_surge_2.3_adj<- source_destination_lyft_surge_2.3 %>%
        filter(cab_type == "Lyft", surge_multiplier >= 2.00, surge_multiplier <= 3.00) %>%
        group_by(Route) %>%
        dplyr::summarize(total_rides = n(), mean_price = mean(price))

lyft_2.3_surge_price_rides_route <- source_destination_lyft_surge_2.3_adj %>%
        arrange(desc(mean_price))

lyft_2.3_surge_price_rides_route_asc <- lyft_2.3_surge_price_rides_route[1:5,]
lyft_2.3_surge_price_rides_route_desc <- lyft_2.3_surge_price_rides_route[50:54,]
lyft_2.3_surge_price_rides_route_top_lowest <- rbind(lyft_2.3_surge_price_rides_route_asc, lyft_2.3_surge_price_rides_route_desc)


lyft_2.3_surge_price_rides_route_top_lowest_plot <- ggplot(data = lyft_2.3_surge_price_rides_route_top_lowest, aes(forcats::fct_reorder(Route, desc(mean_price)), mean_price)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + 
        ggtitle("2.00-3.00 Surge: Five Priciest and Five Least Priciest Ride Price vs Cab Route") + 
        geom_text(aes(label=round(mean_price, 1)))

lyft_2.3_surge_price_rides_route_top_lowest_plot

Explanation

This graph shows that the most expensive route on average was from the Financial District to Boston University (4 miles) and the least expensive route was from the Financial District to South Station (0.6 miles). This translates to the most expensive route being $14.20 per hour and the least expensive being $42.50 per hour. The average surge multiplier in this group is 2.5, which means that the most expensive route is actually almost $2.00 cheaper than anticipated. Similarly, the least expensive route is $10 cheaper than anticipated. This can likely be explained by the difference in number of cab_rides. The first surge multiplier group contained 18,570 samples whereas this group only contained $2,405 samples. This difference well explains the variance in cab route price data.

#Surge-Day of Week-Total Rides Lyft
day_Hour_lyft_surge <- full_nov_dec_df %>%
        filter(weekday3 != "NA", surge_multiplier > 1.00) %>%
        group_by(day, weekday3, surge_multiplier) %>%
        dplyr::summarize(total_rides=n())
## `summarise()` has grouped output by 'day', 'weekday3'. You can override using
## the `.groups` argument.
day_Hour_lyft_surge_plot <- ggplot(data = day_Hour_lyft_surge, aes(x = weekday3, y = total_rides, fill = factor(surge_multiplier))) + 
        geom_bar(stat = "identity", position = "dodge") + 
        xlab("Day of the Week") + ylab("Total Rides") + ggtitle("Total Rides vs Day of the Week Per Surge Multiplier") + 
        labs(fill="Surge Multiplier")

day_Hour_lyft_surge_plot

Explanation

This plot examined the total number of Lyft rides per day of the week and their respective surge multiplier. Regardless of surge multiplier, the highest number of rides occurred on Tuesday and Wednesday.

#Lyft - Surge Per Hour
lyft.uber.remove.na_surge <- lyft.uber.remove.na %>%
        filter(cab_type == "Lyft", surge_multiplier > 1.00) %>%
        dplyr::group_by(hour, surge_multiplier) %>%
        dplyr::summarize(total_rides = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
lyft.uber.remove.na_surge$surge_multiplier <- as.factor(lyft.uber.remove.na_surge$surge_multiplier)

lyft.uber.remove.na_surge_plot <- ggplot(lyft.uber.remove.na_surge, aes(hour, total_rides, color = surge_multiplier)) + 
        geom_point(alpha=0.8, size=1, aes(color = surge_multiplier)) + 
        geom_line(aes(color = surge_multiplier)) + ggtitle("Lyft: Total Rides vs Hour of the Day Per Surge Multiplier") +
        facet_wrap(~surge_multiplier, ncol=1, scales="free") + xlab("Hour") + ylab("Total Rides") +
        guides(color=guide_legend(ncol=1)) + theme(legend.position="none",
                                                   panel.border = element_blank(), 
                                                   panel.spacing.x = unit(0,"line"))

lyft.uber.remove.na_surge_plot

Explanation

This plot examined the total number of Lyft rides per hour of the day and their respective surge multiplier. Regardless of surge multiplier, the highest number of rides took place between 12 and 1 o’clock in the afternoon.

#Surge Rides Source - Lyft
source_destination_lyft_surge <- full_nov_dec_df %>%
        filter(source != "NA", destination != "NA", surge_multiplier > 1.00) %>%
        group_by(source, destination, surge_multiplier, weekday3) %>%
        dplyr::summarize(total_rides = n()) %>%
        arrange(desc(total_rides))
## `summarise()` has grouped output by 'source', 'destination',
## 'surge_multiplier'. You can override using the `.groups` argument.
source_destination_lyft_surge_combine <- source_destination_lyft_surge %>%
        unite('Route', source:destination, remove = FALSE)

data_source_destination_route <- source_destination_lyft_surge_combine[with(source_destination_lyft_surge_combine,order(-total_rides)),]

data_source_destination_route_2 <- data_source_destination_route

total_rides_route <- ddply(data_source_destination_route_2,"Route",numcolwise(sum))
total_rides_route_desc <- total_rides_route %>%
        arrange(desc(total_rides))
data_source_destination_route <- total_rides_route_desc[1:10,]
data_source_destination_route_lowest <- total_rides_route_desc[63:72,]


data_source_destination_route_up_plot <- ggplot(data = data_source_destination_route, aes(forcats::fct_reorder(Route, desc(total_rides)), total_rides)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Surge: Relationship Between Lyft Rides and Routes - Upper Bound") +
        scale_y_continuous(breaks = c(100, 200, 300, 400, 500, 600))

data_source_destination_route_up_plot

Explanation

This plot shows that the routes from Back Bay to Boston University and Fenway more often correlated with a surge in Lyft prices. In addition, rides originating in Back Bay constituted forty percent of this graph.

data_source_destination_route_lowest_lb_plot <- ggplot(data = data_source_destination_route_lowest, aes(forcats::fct_reorder(Route, desc(total_rides)), total_rides)) +
        geom_bar(stat = "identity", position = "dodge", aes(fill=Route)) + xlab("Route") + ylab("Total Rides") +
        theme(axis.text.x = element_text(angle=90), legend.position = "none") + ggtitle("Surge: Relationship Between Lyft Rides and Routes - Lower Bound") +
        scale_y_continuous(breaks = c(20, 40, 60, 80))

data_source_destination_route_lowest_lb_plot

Explanation

This plot shows that routes from Haymarket Square to Beacon Hill less likely experienced a surge in Lyft prices. Furthermore, routes originating in Haymarket Square and North End constituted this entire graph with respect to less common surge prices.