Google-Capstone-Project.R

#1.ASK
#1.0 Business task
#(a).How do annual members and casual riders use Cyclistic bikes differently?
#(b).Why would casual riders buy Cyclistic annual memberships?
#(c).How can Cyclistic use digital media to influence casual riders to become members?
#1.1 Stakeholders
#stakeholders include the following
#director of marketing
#cyclistic marketing analytics team
#cyclistic exectuive team
#2.PREPARE
#The data is public and has been made available by motivate international inc
#The link.https://divvy-tripdata.s3.amazonaws.com/index.html
#.Data is organized in csv files
#.Credibility of data not in question
#.This data has been stripped of all identifying information ensuring its privacy

#3.PROCESS
#.For this project I choose RStudio Desktop in order to prepare, process, clean, analyze and create the visualizations.
#Data review involved the following:
#.Checking column names across all the 12 original files.
#.Checking for missing values.
#.Checking of white spaces.
#.Checking of duplicate records
#3.1 COLLECT & DATA WRANGLING
#load readr for reading rectangular data
#load lubridate for data wrangling
#load ggplot2 for data visualization
#install packages
#tidyverse for data wrangling
#lubridate for dealing with dates
#ggplot2 for data visualisation

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(ggplot2)
library(dplyr)
#3.1.1.COLLECT  AND READ DATA
#Upload data here
b_Trips_2019_Q1<- read_csv("C:/Users/user/Desktop/DATA SETS/BICYCLE DATA/Divvy_Trips_2019_Q1.csv")

## Rows: 365069 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

b_Trips_2019_Q2<- read_csv("C:/Users/user/Desktop/DATA SETS/BICYCLE DATA/Divvy_Trips_2019_Q2.csv")

## Rows: 1108163 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): 03 - Rental Start Station Name, 02 - Rental End Station Name, User...
## dbl  (5): 01 - Rental Details Rental ID, 01 - Rental Details Bike ID, 03 - R...
## dttm (2): 01 - Rental Details Local Start Time, 01 - Rental Details Local En...

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

b_Trips_2019_Q3<- read_csv("C:/Users/user/Desktop/DATA SETS/BICYCLE DATA/Divvy_Trips_2019_Q3.csv")

## Rows: 1640718 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

b_Trips_2019_Q4<- read_csv("C:/Users/user/Desktop/DATA SETS/BICYCLE DATA/Divvy_Trips_2019_Q4.csv")

## Rows: 704054 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): from_station_name, to_station_name, usertype, gender
## dbl  (5): trip_id, bikeid, from_station_id, to_station_id, birthyear
## dttm (2): start_time, end_time

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

head(b_Trips_2019_Q1)

## # A tibble: 6 x 12
##    trip_id start_time          end_time            bikeid tripduration
##      <dbl> <dttm>              <dttm>               <dbl>        <dbl>
## 1 21742443 2019-01-01 00:04:37 2019-01-01 00:11:07   2167          390
## 2 21742444 2019-01-01 00:08:13 2019-01-01 00:15:34   4386          441
## 3 21742445 2019-01-01 00:13:23 2019-01-01 00:27:12   1524          829
## 4 21742446 2019-01-01 00:13:45 2019-01-01 00:43:28    252         1783
## 5 21742447 2019-01-01 00:14:52 2019-01-01 00:20:56   1170          364
## 6 21742448 2019-01-01 00:15:33 2019-01-01 00:19:09   2437          216
## # ... with 7 more variables: from_station_id <dbl>, from_station_name <chr>,
## #   to_station_id <dbl>, to_station_name <chr>, usertype <chr>, gender <chr>,
## #   birthyear <dbl>

#3.2.2.WRANGLE DATA AND COMBINE INTO A SINGLE FILE
#Compare column names
colnames(b_Trips_2019_Q1)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(b_Trips_2019_Q2)

##  [1] "01 - Rental Details Rental ID"                   
##  [2] "01 - Rental Details Local Start Time"            
##  [3] "01 - Rental Details Local End Time"              
##  [4] "01 - Rental Details Bike ID"                     
##  [5] "01 - Rental Details Duration In Seconds Uncapped"
##  [6] "03 - Rental Start Station ID"                    
##  [7] "03 - Rental Start Station Name"                  
##  [8] "02 - Rental End Station ID"                      
##  [9] "02 - Rental End Station Name"                    
## [10] "User Type"                                       
## [11] "Member Gender"                                   
## [12] "05 - Member Details Member Birthday Year"

colnames(b_Trips_2019_Q3)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

colnames(b_Trips_2019_Q4)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

#colnames for dataset Divvy_Trips_2019_Q1,Divvy_Trips_2019_Q3 & Divvy_Trips_2019_Q4 match
# While the names don't have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file
#Rename columns  to make them consisent
b_Trips_2019_Q2 <- b_Trips_2019_Q2%>%rename(trip_id= "01 - Rental Details Rental ID"
         ,bikeid = "01 - Rental Details Bike ID"
         ,start_time ="01 - Rental Details Local Start Time" 
         ,end_time = "01 - Rental Details Local End Time"
         ,from_station_name ="03 - Rental Start Station Name"
         ,from_station_id = "03 - Rental Start Station ID"
         ,to_station_name = "02 - Rental End Station Name"
         ,to_station_id = "02 - Rental End Station ID"
         ,usertype="User Type"
         ,gender="Member Gender"
         ,birthyear="05 - Member Details Member Birthday Year"
         ,tripduration="01 - Rental Details Duration In Seconds Uncapped")

#confirm the colnames have changed
colnames(b_Trips_2019_Q2)

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

table(b_Trips_2019_Q2$Rental.Details.Rental.ID)

## Warning: Unknown or uninitialised column: `Rental.Details.Rental.ID`.

## < table of extent 0 >

# Inspect the dataframes and look for incongruencies
 
# Convert ride_id and rideable_type to character so that they can stack correctly
b_Trips_2019_Q1<- b_Trips_2019_Q1%>%
                  mutate(trip_id = as.character(trip_id)
                  ,bikeid = as.character(bikeid)) 

b_Trips_2019_Q2 <- b_Trips_2019_Q2%>%
                   mutate(trip_id = as.character(trip_id)
                  ,bikeid = as.character(bikeid)) 

b_Trips_2019_Q3 <-  b_Trips_2019_Q3%>%
                    mutate(trip_id = as.character(trip_id)
                   ,bikeid = as.character(bikeid)) 

b_Trips_2019_Q4<- b_Trips_2019_Q4%>%
                  mutate(trip_id = as.character(trip_id)
                  ,bikeid = as.character(bikeid))

#rbind combines columns that match.
#use bind_rows instead
all_b_trips<-bind_rows(b_Trips_2019_Q1,b_Trips_2019_Q2,b_Trips_2019_Q3,b_Trips_2019_Q4)
#3.2 Data Validation
# Inspect the new table that has been created
colnames(all_b_trips)  #List of column names

##  [1] "trip_id"           "start_time"        "end_time"         
##  [4] "bikeid"            "tripduration"      "from_station_id"  
##  [7] "from_station_name" "to_station_id"     "to_station_name"  
## [10] "usertype"          "gender"            "birthyear"

nrow(all_b_trips)  #How many rows are in data frame?

## [1] 3818004

dim(all_b_trips)  #Dimensions of the data frame?

## [1] 3818004      12

head(all_b_trips)  #See the first 6 rows of data frame.  Also tail(all_trips)

## # A tibble: 6 x 12
##   trip_id  start_time          end_time            bikeid tripduration
##   <chr>    <dttm>              <dttm>              <chr>         <dbl>
## 1 21742443 2019-01-01 00:04:37 2019-01-01 00:11:07 2167            390
## 2 21742444 2019-01-01 00:08:13 2019-01-01 00:15:34 4386            441
## 3 21742445 2019-01-01 00:13:23 2019-01-01 00:27:12 1524            829
## 4 21742446 2019-01-01 00:13:45 2019-01-01 00:43:28 252            1783
## 5 21742447 2019-01-01 00:14:52 2019-01-01 00:20:56 1170            364
## 6 21742448 2019-01-01 00:15:33 2019-01-01 00:19:09 2437            216
## # ... with 7 more variables: from_station_id <dbl>, from_station_name <chr>,
## #   to_station_id <dbl>, to_station_name <chr>, usertype <chr>, gender <chr>,
## #   birthyear <dbl>

str(all_b_trips)  #See list of columns and data types (numeric, character, etc)

## spec_tbl_df [3,818,004 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:3818004] "21742443" "21742444" "21742445" "21742446" ...
##  $ start_time       : POSIXct[1:3818004], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ end_time         : POSIXct[1:3818004], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ bikeid           : chr [1:3818004] "2167" "4386" "1524" "252" ...
##  $ tripduration     : num [1:3818004] 390 441 829 1783 364 ...
##  $ from_station_id  : num [1:3818004] 199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr [1:3818004] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : num [1:3818004] 84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr [1:3818004] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr [1:3818004] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr [1:3818004] "Male" "Female" "Female" "Male" ...
##  $ birthyear        : num [1:3818004] 1989 1990 1994 1993 1994 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

summary(all_b_trips)  #Statistical summary of data. Mainly for numerics

##    trip_id            start_time                     end_time                  
##  Length:3818004     Min.   :2019-01-01 00:04:37   Min.   :2019-01-01 00:11:07  
##  Class :character   1st Qu.:2019-05-29 15:49:26   1st Qu.:2019-05-29 16:09:28  
##  Mode  :character   Median :2019-07-25 17:50:54   Median :2019-07-25 18:12:23  
##                     Mean   :2019-07-19 21:47:37   Mean   :2019-07-19 22:11:47  
##                     3rd Qu.:2019-09-15 06:48:05   3rd Qu.:2019-09-15 08:30:13  
##                     Max.   :2019-12-31 23:57:17   Max.   :2020-01-21 13:54:35  
##                                                                                
##     bikeid           tripduration      from_station_id from_station_name 
##  Length:3818004     Min.   :      61   Min.   :  1.0   Length:3818004    
##  Class :character   1st Qu.:     411   1st Qu.: 77.0   Class :character  
##  Mode  :character   Median :     709   Median :174.0   Mode  :character  
##                     Mean   :    1450   Mean   :201.7                     
##                     3rd Qu.:    1283   3rd Qu.:289.0                     
##                     Max.   :10628400   Max.   :673.0                     
##                                                                          
##  to_station_id   to_station_name      usertype            gender         
##  Min.   :  1.0   Length:3818004     Length:3818004     Length:3818004    
##  1st Qu.: 77.0   Class :character   Class :character   Class :character  
##  Median :174.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :202.6                                                           
##  3rd Qu.:291.0                                                           
##  Max.   :673.0                                                           
##                                                                          
##    birthyear     
##  Min.   :1759    
##  1st Qu.:1979    
##  Median :1987    
##  Mean   :1984    
##  3rd Qu.:1992    
##  Max.   :2014    
##  NA's   :538751

# STEP 4: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS
# Inspect the new table that has been created

# There are a few problems we will need to fix:
# (1) In the "member_casual" column, there are two names for members ("member" and "Subscriber") and two names for casual riders ("Customer" and "casual"). We will need to consolidate that from four to two labels.
# (2) The data can only be aggregated at the ride-level, which is too granular. We will want to add some additional columns of data -- such as day, month, year -- that provide additional opportunities to aggregate the data.
# (3) We will want to add a calculated field for length of ride since the 2020Q1 data did not have the "tripduration" column. We will add "ride_length" to the entire dataframe for consistency.
# (4) There are some rides where tripduration shows up as negative, including several hundred rides where Divvy took bikes out of circulation for Quality Control reasons. We will want to delete these rides.

#Converting the start_time and end_time columns to
all_b_trips<- all_b_trips%>%
              mutate(start_time=as.POSIXlt(start_time))
all_b_trips<- all_b_trips%>%
              mutate(end_time=as.POSIXlt(end_time))

# Add columns that list the date, month, day, and year of each ride
# This will allow us to aggregate ride data for each month, day, or year ... before completing these operations we could only aggregate at the ride level
# more on date formats in R found at that link
all_b_trips$date <- as.Date(all_b_trips$start_time) #The default format is yyyy-mm-dd
all_b_trips$month <- format(as.Date(all_b_trips$date), "%m")
all_b_trips$day <- format(as.Date(all_b_trips$date), "%d")
all_b_trips$year <- format(as.Date(all_b_trips$date), "%Y")
all_b_trips$day_of_week <- format(as.Date(all_b_trips$date), "%A")
# Add a "ride_length" calculation to all_trips (in seconds)
# https://stat.ethz.ch/R-manual/R-devel/library/base/html/difftime.html
all_b_trips$ride_length <- difftime(all_b_trips$end_time,all_b_trips$start_time)

# Remove "bad" data
# The dataframe includes a few hundred entries when bikes were taken out of docks and checked for quality by Divvy or ride_length was negative
#Check for missing values
colSums(is.na(all_b_trips))

##           trip_id        start_time          end_time            bikeid 
##                 0                 0                 0                 0 
##      tripduration   from_station_id from_station_name     to_station_id 
##                 0                 0                 0                 0 
##   to_station_name          usertype            gender         birthyear 
##                 0                 0            559206            538751 
##              date             month               day              year 
##                 0                 0                 0                 0 
##       day_of_week       ride_length 
##                 0                 0

#check for duplicates
distinct(all_b_trips)

## # A tibble: 3,818,004 x 18
##    trip_id  start_time          end_time            bikeid tripduration
##    <chr>    <dttm>              <dttm>              <chr>         <dbl>
##  1 21742443 2019-01-01 00:04:37 2019-01-01 00:11:07 2167            390
##  2 21742444 2019-01-01 00:08:13 2019-01-01 00:15:34 4386            441
##  3 21742445 2019-01-01 00:13:23 2019-01-01 00:27:12 1524            829
##  4 21742446 2019-01-01 00:13:45 2019-01-01 00:43:28 252            1783
##  5 21742447 2019-01-01 00:14:52 2019-01-01 00:20:56 1170            364
##  6 21742448 2019-01-01 00:15:33 2019-01-01 00:19:09 2437            216
##  7 21742449 2019-01-01 00:16:06 2019-01-01 00:19:03 2708            177
##  8 21742450 2019-01-01 00:18:41 2019-01-01 00:20:21 2796            100
##  9 21742451 2019-01-01 00:18:43 2019-01-01 00:47:30 6205           1727
## 10 21742452 2019-01-01 00:19:18 2019-01-01 00:24:54 3939            336
## # ... with 3,817,994 more rows, and 13 more variables: from_station_id <dbl>,
## #   from_station_name <chr>, to_station_id <dbl>, to_station_name <chr>,
## #   usertype <chr>, gender <chr>, birthyear <dbl>, date <date>, month <chr>,
## #   day <chr>, year <chr>, day_of_week <chr>, ride_length <drtn>

#View column data types
str(all_b_trips)

## spec_tbl_df [3,818,004 x 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ trip_id          : chr [1:3818004] "21742443" "21742444" "21742445" "21742446" ...
##  $ start_time       : POSIXlt[1:3818004], format: "2019-01-01 00:04:37" "2019-01-01 00:08:13" ...
##  $ end_time         : POSIXlt[1:3818004], format: "2019-01-01 00:11:07" "2019-01-01 00:15:34" ...
##  $ bikeid           : chr [1:3818004] "2167" "4386" "1524" "252" ...
##  $ tripduration     : num [1:3818004] 390 441 829 1783 364 ...
##  $ from_station_id  : num [1:3818004] 199 44 15 123 173 98 98 211 150 268 ...
##  $ from_station_name: chr [1:3818004] "Wabash Ave & Grand Ave" "State St & Randolph St" "Racine Ave & 18th St" "California Ave & Milwaukee Ave" ...
##  $ to_station_id    : num [1:3818004] 84 624 644 176 35 49 49 142 148 141 ...
##  $ to_station_name  : chr [1:3818004] "Milwaukee Ave & Grand Ave" "Dearborn St & Van Buren St (*)" "Western Ave & Fillmore St (*)" "Clark St & Elm St" ...
##  $ usertype         : chr [1:3818004] "Subscriber" "Subscriber" "Subscriber" "Subscriber" ...
##  $ gender           : chr [1:3818004] "Male" "Female" "Female" "Male" ...
##  $ birthyear        : num [1:3818004] 1989 1990 1994 1993 1994 ...
##  $ date             : Date[1:3818004], format: "2019-01-01" "2019-01-01" ...
##  $ month            : chr [1:3818004] "01" "01" "01" "01" ...
##  $ day              : chr [1:3818004] "01" "01" "01" "01" ...
##  $ year             : chr [1:3818004] "2019" "2019" "2019" "2019" ...
##  $ day_of_week      : chr [1:3818004] "Tuesday" "Tuesday" "Tuesday" "Tuesday" ...
##  $ ride_length      : 'difftime' num [1:3818004] 6.5 7.35 13.8166666666667 29.7166666666667 ...
##   ..- attr(*, "units")= chr "mins"
##  - attr(*, "spec")=
##   .. cols(
##   ..   trip_id = col_double(),
##   ..   start_time = col_datetime(format = ""),
##   ..   end_time = col_datetime(format = ""),
##   ..   bikeid = col_double(),
##   ..   tripduration = col_number(),
##   ..   from_station_id = col_double(),
##   ..   from_station_name = col_character(),
##   ..   to_station_id = col_double(),
##   ..   to_station_name = col_character(),
##   ..   usertype = col_character(),
##   ..   gender = col_character(),
##   ..   birthyear = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

# 4: CONDUCT DESCRIPTIVE ANALYSIS
#=====================================
# Descriptive analysis on ride_length (all figures in seconds)
mean(all_b_trips$ride_length) #straight average (total ride length / rides)

## Time difference of 24.17419 mins

median(all_b_trips$ride_length) #midpoint number in the ascending array of ride lengths

## Time difference of 11.81667 mins

max(all_b_trips$ride_length) #longest ride

## Time difference of 177200.4 mins

min(all_b_trips$ride_length) #shortest ride

## Time difference of -56.36667 mins

# You can condense the four lines above to one line using summary() on the specific attribute
summary(all_b_trips$ride_length)

##   Length    Class     Mode 
##  3818004 difftime  numeric

# Compare members and casual users
aggregate(all_b_trips$ride_length ~ all_b_trips$usertype, FUN = mean)

##   all_b_trips$usertype all_b_trips$ride_length
## 1             Customer           57.01734 mins
## 2           Subscriber           14.32765 mins

aggregate(all_b_trips$ride_length ~ all_b_trips$usertype, FUN = median)

##   all_b_trips$usertype all_b_trips$ride_length
## 1             Customer           25.83333 mins
## 2           Subscriber            9.80000 mins

aggregate(all_b_trips$ride_length ~ all_b_trips$usertype, FUN = max)

##   all_b_trips$usertype all_b_trips$ride_length
## 1             Customer           177200.4 mins
## 2           Subscriber           150943.9 mins

aggregate(all_b_trips$ride_length ~ all_b_trips$usertype, FUN = min)

##   all_b_trips$usertype all_b_trips$ride_length
## 1             Customer          -48.28333 mins
## 2           Subscriber          -56.36667 mins

# See the average ride time by each day for members vs casual users
aggregate(all_b_trips$ride_length ~ all_b_trips$usertype + all_b_trips$day_of_week, FUN = mean)

##    all_b_trips$usertype all_b_trips$day_of_week all_b_trips$ride_length
## 1              Customer                  Friday           60.17561 mins
## 2            Subscriber                  Friday           13.89748 mins
## 3              Customer                  Monday           54.49989 mins
## 4            Subscriber                  Monday           14.24928 mins
## 5              Customer                Saturday           54.06111 mins
## 6            Subscriber                Saturday           16.30271 mins
## 7              Customer                  Sunday           56.18169 mins
## 8            Subscriber                  Sunday           15.40119 mins
## 9              Customer                Thursday           59.95112 mins
## 10           Subscriber                Thursday           13.77979 mins
## 11             Customer                 Tuesday           57.41328 mins
## 12           Subscriber                 Tuesday           14.15259 mins
## 13             Customer               Wednesday           60.33407 mins
## 14           Subscriber               Wednesday           13.80984 mins

# Notice that the days of the week are out of order. Let's fix that.
all_b_trips$day_of_week <- ordered(all_b_trips$day_of_week, levels=c("Sunday","Monday","Tuesday","Wednesday","Thursday","Friday","Saturday"))

# Now, let's run the average ride time by each day for members vs casual users
aggregate(all_b_trips$ride_length ~ all_b_trips$usertype + all_b_trips$day_of_week, FUN = mean)

##    all_b_trips$usertype all_b_trips$day_of_week all_b_trips$ride_length
## 1              Customer                  Sunday           56.18169 mins
## 2            Subscriber                  Sunday           15.40119 mins
## 3              Customer                  Monday           54.49989 mins
## 4            Subscriber                  Monday           14.24928 mins
## 5              Customer                 Tuesday           57.41328 mins
## 6            Subscriber                 Tuesday           14.15259 mins
## 7              Customer               Wednesday           60.33407 mins
## 8            Subscriber               Wednesday           13.80984 mins
## 9              Customer                Thursday           59.95112 mins
## 10           Subscriber                Thursday           13.77979 mins
## 11             Customer                  Friday           60.17561 mins
## 12           Subscriber                  Friday           13.89748 mins
## 13             Customer                Saturday           54.06111 mins
## 14           Subscriber                Saturday           16.30271 mins

# analyze ridership data by type and weekday
all_b_trips %>% 
  mutate(weekday = wday(start_time, label = TRUE)) %>%  #creates weekday field using wday()
  group_by(usertype, weekday) %>%  #groups by usertype and weekday
  summarise(number_of_rides = n(),                          #calculates the number of rides and average duration 
  average_duration = mean(ride_length)) %>%         # calculates the average duration
  arrange(usertype, weekday)                                # sorts

## `summarise()` has grouped output by 'usertype'. You can override using the `.groups` argument.

## # A tibble: 14 x 4
## # Groups:   usertype [2]
##    usertype   weekday number_of_rides average_duration
##    <chr>      <ord>             <int> <drtn>          
##  1 Customer   Sun              170179 56.18169 mins   
##  2 Customer   Mon              101489 54.49989 mins   
##  3 Customer   Tue               88655 57.41328 mins   
##  4 Customer   Wed               89745 60.33407 mins   
##  5 Customer   Thu              101372 59.95112 mins   
##  6 Customer   Fri              121141 60.17561 mins   
##  7 Customer   Sat              208056 54.06111 mins   
##  8 Subscriber Sun              256241 15.40119 mins   
##  9 Subscriber Mon              458780 14.24928 mins   
## 10 Subscriber Tue              497025 14.15259 mins   
## 11 Subscriber Wed              494277 13.80984 mins   
## 12 Subscriber Thu              486915 13.77979 mins   
## 13 Subscriber Fri              456966 13.89748 mins   
## 14 Subscriber Sat              287163 16.30271 mins

# Let's visualize the number of rides by rider type
all_b_trips %>% 
  mutate(weekday = wday(start_time, label = TRUE))%>%
  group_by(usertype, weekday) %>% 
  summarise(number_of_rides = n()
 ,average_duration = mean(ride_length))%>%
  arrange(usertype, weekday)  %>% 
  ggplot(aes(x = weekday, y = number_of_rides, fill = usertype)) +
  geom_col(position = "dodge")

## `summarise()` has grouped output by 'usertype'. You can override using the `.groups` argument.

# Let's create a visualization for average duration
all_b_trips %>% 
  mutate(weekday = wday(start_time, label = TRUE)) %>% 
  group_by(usertype, weekday) %>% 
  summarise(number_of_rides = n()
  ,average_duration = mean(ride_length)) %>% 
  arrange(usertype, weekday)  %>% 
  ggplot(aes(x = weekday, y = average_duration, fill = usertype)) +
  geom_col(position = "dodge")

## `summarise()` has grouped output by 'usertype'. You can override using the `.groups` argument.

## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

#=================================================
# 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS
#=================================================
# Create a csv file that we will visualize in Excel, Tableau, or my presentation software
# N.B.: This file location is for a Mac. If you are working on a PC, change the file location accordingly (most likely "C:\Users\YOUR_USERNAME\Desktop\...") to export the data. You can read more here: https://datatofish.com/export-dataframe-to-csv-in-r/
counts <- aggregate(all_b_trips$ride_length ~ all_b_trips$usertype + all_b_trips$day_of_week, FUN = mean)
write.csv(counts,'C:\\Users\\user\\Desktop\\RSTUDIO\\avg_ride_length.csv')

Google-Capstone-Project.R

user

2022-10-29