SETTING UP ENVIRONMENT.

LOAD PACKAGES.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(hms)
## 
## Attaching package: 'hms'
## 
## The following object is masked from 'package:lubridate':
## 
##     hms
library(here)
## here() starts at C:/Users/SWill/Documents/JAN TO DEC 2021 CYCLISTIC BIKES/JAN TO DEC 2021
library(skimr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(conflicted)
library(gtsummary)
## #BlackLivesMatter
library(scales)
library(RColorBrewer)
library(ggthemes)
SCIENTIFIC NOTATION RUINING YOUR GGPLOT CHARTS? TRY THE LINE OF CODE BELOW
options(scipen = 999)
USE ‘getwd()’ FUNCTION TO DISPLAY WORKING DIRECTORY.
getwd()
## [1] "C:/Users/SWill/Documents/JAN TO DEC 2021 CYCLISTIC BIKES/JAN TO DEC 2021"
USE ‘setwd()’ FUNCTION TO SET WORKING DIRECTORY TO SIMPLIFY CALLS TO DATA.
# Point the session's working directory at the project folder.
# NOTE(review): this is the same directory getwd() reported just above, and the
# data files in this report are read through absolute paths, so this call looks
# redundant. A hard-coded setwd() also ties the script to one machine; confirm
# whether it can be dropped (the `here` package is already loaded).
setwd("C:/Users/SWill/Documents/JAN TO DEC 2021 CYCLISTIC BIKES/JAN TO DEC 2021")
USE ‘spec_csv()’ FUNCTION TO CHECK THE DATA TYPES BEFORE READING THE DATA.
NOTICE ‘started_at’ AND ‘ended_at’ COLUMNS ARE ‘datetime’ DATA TYPE.
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202101-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202102-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202103-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202104-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202105-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202106-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202107-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202108-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202109-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202110-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202111-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
spec_csv("C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12/202112-divvy-tripdata.csv")
## cols(
##   ride_id = col_character(),
##   rideable_type = col_character(),
##   started_at = col_datetime(format = ""),
##   ended_at = col_datetime(format = ""),
##   start_station_name = col_character(),
##   start_station_id = col_character(),
##   end_station_name = col_character(),
##   end_station_id = col_character(),
##   start_lat = col_double(),
##   start_lng = col_double(),
##   end_lat = col_double(),
##   end_lng = col_double(),
##   member_casual = col_character()
## )
LOAD THE TWELVE MONTHLY ‘divvy-tripdata.csv’ FILES (JAN TO DEC 2021) INTO DATA FRAMES.
# Folder that holds the twelve monthly Divvy trip extracts for 2021. Keeping it
# in one place means the analysis can be moved to another machine by editing a
# single line instead of twelve.
trip_data_dir <- "C:/Users/SWill/Desktop/CYCLISTIC BIKES/divvy-trip-data 01-12"

# Read each month into its own data frame (df_01 = January ... df_12 =
# December). read.csv() leaves `started_at` / `ended_at` as character strings;
# they are converted to date-times further down.
df_01 <- read.csv(file.path(trip_data_dir, "202101-divvy-tripdata.csv"))
df_02 <- read.csv(file.path(trip_data_dir, "202102-divvy-tripdata.csv"))
df_03 <- read.csv(file.path(trip_data_dir, "202103-divvy-tripdata.csv"))
df_04 <- read.csv(file.path(trip_data_dir, "202104-divvy-tripdata.csv"))
df_05 <- read.csv(file.path(trip_data_dir, "202105-divvy-tripdata.csv"))
df_06 <- read.csv(file.path(trip_data_dir, "202106-divvy-tripdata.csv"))
df_07 <- read.csv(file.path(trip_data_dir, "202107-divvy-tripdata.csv"))
df_08 <- read.csv(file.path(trip_data_dir, "202108-divvy-tripdata.csv"))
df_09 <- read.csv(file.path(trip_data_dir, "202109-divvy-tripdata.csv"))
df_10 <- read.csv(file.path(trip_data_dir, "202110-divvy-tripdata.csv"))
df_11 <- read.csv(file.path(trip_data_dir, "202111-divvy-tripdata.csv"))
df_12 <- read.csv(file.path(trip_data_dir, "202112-divvy-tripdata.csv"))
USE ‘bind_rows()’ FUNCTION TO STACK DATA FRAMES INTO ONE BIG DATA FRAME.
# Stack the twelve monthly data frames, in calendar order, into one data frame
# covering the whole of 2021.
all_trips <- bind_rows(
  list(
    df_01, df_02, df_03, df_04, df_05, df_06,
    df_07, df_08, df_09, df_10, df_11, df_12
  )
)
CHECK COLUMNS.
colnames(all_trips)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"
USE ‘glimpse()’ FUNCTION TO GET A BETTER UNDERSTANDING OF THE DATA.
Rows: 5,595,063 Columns: 13
COLUMNS ‘started_at’ AND ‘ended_at’ ARE NOW ‘character’ DATA TYPE.
COLUMNS ‘end_station_name’ AND ‘end_station_id’ HAVE BLANK ROWS THAT NEED TO BE REMOVED.
glimpse(all_trips)
## Rows: 5,595,063
## Columns: 13
## $ ride_id            <chr> "E19E6F1B8D4C42ED", "DC88F20C2C55F27F", "EC45C94683…
## $ rideable_type      <chr> "electric_bike", "electric_bike", "electric_bike", …
## $ started_at         <chr> "2021-01-23 16:14:19", "2021-01-27 18:43:08", "2021…
## $ ended_at           <chr> "2021-01-23 16:24:44", "2021-01-27 18:47:12", "2021…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> "", "", "", "", "", "", "", "", "", "Wood St & Augu…
## $ end_station_id     <chr> "", "", "", "", "", "", "", "", "", "657", "13258",…
## $ start_lat          <dbl> 41.90034, 41.90033, 41.90031, 41.90040, 41.90033, 4…
## $ start_lng          <dbl> -87.69674, -87.69671, -87.69664, -87.69666, -87.696…
## $ end_lat            <dbl> 41.89000, 41.90000, 41.90000, 41.92000, 41.90000, 4…
## $ end_lng            <dbl> -87.72000, -87.69000, -87.70000, -87.69000, -87.700…
## $ member_casual      <chr> "member", "member", "member", "member", "casual", "…
USE ‘str()’ FUNCTION TO SEE LIST OF COLUMNS AND DATA TYPES NUMERIC, CHARACTER, DATETIME ETC.
‘data.frame’: 5595063 obs. of 13 variables:
str(all_trips)
## 'data.frame':    5595063 obs. of  13 variables:
##  $ ride_id           : chr  "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
##  $ ended_at          : chr  "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...
USE TIDYR TO SPLIT THE “started_at” COLUMN INTO TWO NEW COLUMNS, “start_date” AND “start_time”.
USE TIDYR TO SPLIT THE “ended_at” COLUMN INTO TWO NEW COLUMNS, “end_date” AND “end_time”.
# Split each "YYYY-MM-DD HH:MM:SS" string at the space into a date part and a
# time part. remove = FALSE keeps the original started_at / ended_at columns
# alongside the new ones.
all_trips <- all_trips %>%
  tidyr::separate(
    col = started_at,
    into = c("start_date", "start_time"),
    sep = " ",
    remove = FALSE
  ) %>%
  tidyr::separate(
    col = ended_at,
    into = c("end_date", "end_time"),
    sep = " ",
    remove = FALSE
  )
USE ‘colnames()’ FUNCTION TO INSPECT THE NEW COLUMNS THAT HAVE BEEN CREATED.
colnames(all_trips)
##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "start_date"         "start_time"         "ended_at"          
##  [7] "end_date"           "end_time"           "start_station_name"
## [10] "start_station_id"   "end_station_name"   "end_station_id"    
## [13] "start_lat"          "start_lng"          "end_lat"           
## [16] "end_lng"            "member_casual"
‘data.frame’: 5595063 obs. of 17 variables:
str(all_trips)
## 'data.frame':    5595063 obs. of  17 variables:
##  $ ride_id           : chr  "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ rideable_type     : chr  "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
##  $ started_at        : chr  "2021-01-23 16:14:19" "2021-01-27 18:43:08" "2021-01-21 22:35:54" "2021-01-07 13:31:13" ...
##  $ start_date        : chr  "2021-01-23" "2021-01-27" "2021-01-21" "2021-01-07" ...
##  $ start_time        : chr  "16:14:19" "18:43:08" "22:35:54" "13:31:13" ...
##  $ ended_at          : chr  "2021-01-23 16:24:44" "2021-01-27 18:47:12" "2021-01-21 22:37:14" "2021-01-07 13:42:55" ...
##  $ end_date          : chr  "2021-01-23" "2021-01-27" "2021-01-21" "2021-01-07" ...
##  $ end_time          : chr  "16:24:44" "18:47:12" "22:37:14" "13:42:55" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

EXPLORE AND MANIPULATE DATA FRAME JAN TO DEC 2021.

COLUMN RIDEABLE TYPE.

EXPLORE CHARACTER VARIABLE TYPE IN “rideable_type” COLUMN.
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$rideable_type)
## [1] "character"
USE ‘unique ()’ FUNCTION TO FIND INDIVIDUAL VALUES IN COLUMN.
unique(all_trips$rideable_type)
## [1] "electric_bike" "classic_bike"  "docked_bike"
HOW MANY OBSERVATIONS FALL UNDER EACH BIKE TYPE?
table(all_trips$rideable_type)
## 
##  classic_bike   docked_bike electric_bike 
##       3251028        312343       2031692
sort(table(all_trips$rideable_type), decreasing = TRUE)
## 
##  classic_bike electric_bike   docked_bike 
##       3251028       2031692        312343
BAR PLOT OF DATA DISTRIBUTION OF ‘rideable_type’ COLUMN.
barplot(sort(table(all_trips$rideable_type), decreasing = TRUE))

CHANGE VARIABLE FROM CHARACTER TO FACTOR.
# Store the bike type as a factor; the levels are the distinct bike types in
# alphabetical order.
all_trips$rideable_type <- factor(all_trips$rideable_type)
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$rideable_type)
## [1] "factor"
USE ‘levels’ FUNCTION TO CHECK FACTOR.
levels(all_trips$rideable_type)
## [1] "classic_bike"  "docked_bike"   "electric_bike"
NOTE RIDEABLE TYPE IS NOW A FACTOR.
glimpse(all_trips)
## Rows: 5,595,063
## Columns: 17
## $ ride_id            <chr> "E19E6F1B8D4C42ED", "DC88F20C2C55F27F", "EC45C94683…
## $ rideable_type      <fct> electric_bike, electric_bike, electric_bike, electr…
## $ started_at         <chr> "2021-01-23 16:14:19", "2021-01-27 18:43:08", "2021…
## $ start_date         <chr> "2021-01-23", "2021-01-27", "2021-01-21", "2021-01-…
## $ start_time         <chr> "16:14:19", "18:43:08", "22:35:54", "13:31:13", "02…
## $ ended_at           <chr> "2021-01-23 16:24:44", "2021-01-27 18:47:12", "2021…
## $ end_date           <chr> "2021-01-23", "2021-01-27", "2021-01-21", "2021-01-…
## $ end_time           <chr> "16:24:44", "18:47:12", "22:37:14", "13:42:55", "02…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> "", "", "", "", "", "", "", "", "", "Wood St & Augu…
## $ end_station_id     <chr> "", "", "", "", "", "", "", "", "", "657", "13258",…
## $ start_lat          <dbl> 41.90034, 41.90033, 41.90031, 41.90040, 41.90033, 4…
## $ start_lng          <dbl> -87.69674, -87.69671, -87.69664, -87.69666, -87.696…
## $ end_lat            <dbl> 41.89000, 41.90000, 41.90000, 41.92000, 41.90000, 4…
## $ end_lng            <dbl> -87.72000, -87.69000, -87.70000, -87.69000, -87.700…
## $ member_casual      <chr> "member", "member", "member", "member", "casual", "…

COLUMN STARTED_AT AND ENDED_AT.

EXPLORE…CHARACTER VARIABLE TYPE IN “started_at” AND “ended_at” COLUMNS.
DATA TYPE IN COLUMNS “started_at” AND “ended_at” WAS DATETIME BEFORE LOADING (SEE THE ‘spec_csv()’ OUTPUT ABOVE); ‘read.csv()’ READ THEM AS CHARACTER.
CONVERT “started_at” AND “ended_at” COLUMN FROM CHARACTER TO DATETIME.
# Parse the "YYYY-MM-DD HH:MM:SS" strings into date-times, interpreted as UTC.
# strptime() returns POSIXlt, the class shown by the str() output further down.
# NOTE(review): POSIXlt stores each value as a list of components and is
# generally discouraged for data-frame columns; POSIXct (as.POSIXct) is the
# usual choice. Left as POSIXlt here in case later steps rely on the class.
all_trips$started_at <- strptime(
  all_trips$started_at,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
all_trips$ended_at <- strptime(
  all_trips$ended_at,
  format = "%Y-%m-%d %H:%M:%S",
  tz = "UTC"
)
CONVERT “start_date” COLUMN FROM CHARACTER TO DATE-TIME (POSIXlt) FORMAT.
# Convert the "YYYY-MM-DD" start date string to POSIXlt (midnight of that day).
# tz = "UTC" matches the time zone used for started_at / ended_at; without it
# the date would be interpreted in the session's local time zone, so the result
# could differ between machines.
all_trips$start_date <- as.POSIXlt(all_trips$start_date, tz = "UTC")
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$start_date) 
## [1] "POSIXlt" "POSIXt"
CONVERT “end_date” COLUMN FROM CHARACTER TO DATE-TIME (POSIXlt) FORMAT.
# Convert the "YYYY-MM-DD" end date string to POSIXlt (midnight of that day).
# tz = "UTC" matches the time zone used for started_at / ended_at; without it
# the date would be interpreted in the session's local time zone, so the result
# could differ between machines.
all_trips$end_date <- as.POSIXlt(all_trips$end_date, tz = "UTC")
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$end_date) 
## [1] "POSIXlt" "POSIXt"
USE ‘str()’ FUNCTION TO SEE LIST OF COLUMNS AND DATA TYPES NUMERIC, CHARACTER, DATETIME ETC.
‘started_at’ AND ‘ended_at’ HAVE CHANGED FROM CHARACTER DATA TYPE TO POSIXlt.
‘data.frame’: 5595063 obs. of 17 variables:
str(all_trips)
## 'data.frame':    5595063 obs. of  17 variables:
##  $ ride_id           : chr  "E19E6F1B8D4C42ED" "DC88F20C2C55F27F" "EC45C94683FE3F27" "4FA453A75AE377DB" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 3 3 3 3 3 3 3 3 3 1 ...
##  $ started_at        : POSIXlt, format: "2021-01-23 16:14:19" "2021-01-27 18:43:08" ...
##  $ start_date        : POSIXlt, format: "2021-01-23" "2021-01-27" ...
##  $ start_time        : chr  "16:14:19" "18:43:08" "22:35:54" "13:31:13" ...
##  $ ended_at          : POSIXlt, format: "2021-01-23 16:24:44" "2021-01-27 18:47:12" ...
##  $ end_date          : POSIXlt, format: "2021-01-23" "2021-01-27" ...
##  $ end_time          : chr  "16:24:44" "18:47:12" "22:37:14" "13:42:55" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "" "" "" "" ...
##  $ end_station_id    : chr  "" "" "" "" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "member" "member" ...

COLUMN START_STATION_NAME START_STATION_ID END_STATION_NAME AND END_STATION_ID.

EXPLORE…CHARACTER VARIABLE TYPE IN “start_station_name” AND “end_station_name”.
REPLACE ALL BLANK VALUES IN “start_station_name” COLUMN WITH NA VALUES.
# Replace blank ("") values with NA in the four station columns so that missing
# station information is recognised as missing. Values that are already NA stay
# NA; every other value is left untouched.
all_trips$start_station_name <- dplyr::na_if(all_trips$start_station_name, "")
all_trips$start_station_id <- dplyr::na_if(all_trips$start_station_id, "")
all_trips$end_station_name <- dplyr::na_if(all_trips$end_station_name, "")
all_trips$end_station_id <- dplyr::na_if(all_trips$end_station_id, "")
glimpse(all_trips)
## Rows: 5,595,063
## Columns: 17
## $ ride_id            <chr> "E19E6F1B8D4C42ED", "DC88F20C2C55F27F", "EC45C94683…
## $ rideable_type      <fct> electric_bike, electric_bike, electric_bike, electr…
## $ started_at         <dttm> 2021-01-23 16:14:19, 2021-01-27 18:43:08, 2021-01-…
## $ start_date         <dttm> 2021-01-23, 2021-01-27, 2021-01-21, 2021-01-07, 20…
## $ start_time         <chr> "16:14:19", "18:43:08", "22:35:54", "13:31:13", "02…
## $ ended_at           <dttm> 2021-01-23 16:24:44, 2021-01-27 18:47:12, 2021-01-…
## $ end_date           <dttm> 2021-01-23, 2021-01-27, 2021-01-21, 2021-01-07, 20…
## $ end_time           <chr> "16:24:44", "18:47:12", "22:37:14", "13:42:55", "02…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "Wood St & Augu…
## $ end_station_id     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, "657", "13258",…
## $ start_lat          <dbl> 41.90034, 41.90033, 41.90031, 41.90040, 41.90033, 4…
## $ start_lng          <dbl> -87.69674, -87.69671, -87.69664, -87.69666, -87.696…
## $ end_lat            <dbl> 41.89000, 41.90000, 41.90000, 41.92000, 41.90000, 4…
## $ end_lng            <dbl> -87.72000, -87.69000, -87.70000, -87.69000, -87.700…
## $ member_casual      <chr> "member", "member", "member", "member", "casual", "…
REMOVE EVERY ROW THAT HAS AN NA VALUE IN ANY COLUMN.
# Keep only complete rows: with no columns named, drop_na() removes a row if
# any of its columns is NA.
all_trips <- tidyr::drop_na(all_trips)
‘data.frame’: 4588302 obs. of 17 variables:
str(all_trips)
## 'data.frame':    4588302 obs. of  17 variables:
##  $ ride_id           : chr  "B9F73448DFBE0D45" "457C7F4B5D3DA135" "57C750326F9FDABE" "4D518C65E338D070" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 1 3 3 3 1 3 1 1 3 3 ...
##  $ started_at        : POSIXlt, format: "2021-01-24 19:15:38" "2021-01-23 12:57:38" ...
##  $ start_date        : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ start_time        : chr  "19:15:38" "12:57:38" "15:28:04" "15:28:57" ...
##  $ ended_at          : POSIXlt, format: "2021-01-24 19:22:51" "2021-01-23 13:02:10" ...
##  $ end_date          : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ end_time          : chr  "19:22:51" "13:02:10" "15:37:51" "15:37:54" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "Wood St & Augusta Blvd" "California Ave & North Ave" "Wood St & Augusta Blvd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "657" "13258" "657" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : chr  "member" "member" "casual" "casual" ...

COLUMN MEMBER_CASUAL.

EXPLORE…CHARACTER VARIABLE TYPE IN “member_casual” COLUMN.
USE ‘unique ()’ FUNCTION TO FIND INDIVIDUAL VALUES IN COLUMN.
unique(all_trips$member_casual)
## [1] "member" "casual"
HOW MANY OBSERVATIONS FALL UNDER EACH USER TYPE?
table(all_trips$member_casual)
## 
##  casual  member 
## 2048379 2539923
sort(table(all_trips$member_casual), decreasing = TRUE)
## 
##  member  casual 
## 2539923 2048379
BAR PLOT OF DATA DISTRIBUTION OF ‘member_casual’ COLUMN.
barplot(sort(table(all_trips$member_casual), decreasing = TRUE))

CHANGE VARIABLE FROM CHARACTER TO FACTOR.
# Store the rider type as a factor; the levels are the distinct rider types in
# alphabetical order.
all_trips$member_casual <- factor(all_trips$member_casual)
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$member_casual)
## [1] "factor"
USE ‘levels’ FUNCTION TO CHECK FACTOR.
levels(all_trips$member_casual)
## [1] "casual" "member"
NOTE MEMBER CASUAL IS NOW A FACTOR.
glimpse(all_trips)
## Rows: 4,588,302
## Columns: 17
## $ ride_id            <chr> "B9F73448DFBE0D45", "457C7F4B5D3DA135", "57C750326F…
## $ rideable_type      <fct> classic_bike, electric_bike, electric_bike, electri…
## $ started_at         <dttm> 2021-01-24 19:15:38, 2021-01-23 12:57:38, 2021-01-…
## $ start_date         <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ start_time         <chr> "19:15:38", "12:57:38", "15:28:04", "15:28:57", "15…
## $ ended_at           <dttm> 2021-01-24 19:22:51, 2021-01-23 13:02:10, 2021-01-…
## $ end_date           <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ end_time           <chr> "19:22:51", "13:02:10", "15:37:51", "15:37:54", "16…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> "Wood St & Augusta Blvd", "California Ave & North A…
## $ end_station_id     <chr> "657", "13258", "657", "657", "657", "KA1504000135"…
## $ start_lat          <dbl> 41.90036, 41.90041, 41.90037, 41.90038, 41.90036, 4…
## $ start_lng          <dbl> -87.69670, -87.69673, -87.69669, -87.69672, -87.696…
## $ end_lat            <dbl> 41.89918, 41.91044, 41.89918, 41.89915, 41.89918, 4…
## $ end_lng            <dbl> -87.67220, -87.69689, -87.67218, -87.67218, -87.672…
## $ member_casual      <fct> member, member, casual, casual, casual, member, mem…

ADD A CALCULATED FIELD FOR NEW COLUMN “ride_length_secs”.

# Ride duration in seconds (end time minus start time). units = "secs" is given
# explicitly because difftime()'s default ("auto") chooses secs, mins, hours or
# days depending on the data, so the result would not be guaranteed to match
# the column name.
all_trips$ride_length_secs <- difftime(
  all_trips$ended_at,
  all_trips$started_at,
  units = "secs"
)
CHECK DATA TYPE.
is.numeric(all_trips$ride_length_secs)
## [1] FALSE
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$ride_length_secs)
## [1] "difftime"
CONVERT “ride_length_secs” FROM DIFFTIME TO NUMERIC TO RUN CALCULATIONS ON THE DATA.
# Drop the difftime class so calculations run on plain numbers. Converting with
# units = "secs" always yields seconds whatever unit the difftime currently
# holds, and avoids a round trip through character strings.
all_trips$ride_length_secs <- as.numeric(
  all_trips$ride_length_secs,
  units = "secs"
)
CHECK DATA TYPE.
is.numeric(all_trips$ride_length_secs)
## [1] TRUE

CREATE NEW COLUMN “ride_length_total” (RIDE LENGTH IN MINUTES = “ride_length_secs” / 60) USING MUTATE FUNCTION.

# Ride length in minutes: the duration in seconds divided by 60.
all_trips <- all_trips %>%
  mutate(ride_length_total = ride_length_secs / 60)
CHECK DATA TYPE.
is.numeric(all_trips$ride_length_total)
## [1] TRUE

ADD COLUMN FOR DAY OF WEEK.

NUMERIC VALUE DAY OF WEEK SUNDAY = 1 MONDAY = 2 TUESDAY = 3 ETC, ETC…
# Numeric day of week of the ride's start date (Sunday = 1 ... Saturday = 7).
all_trips$weekday <- lubridate::wday(all_trips$start_date)
# Abbreviated day-of-week label (Sun, Mon, Tue, ...). With label = TRUE, wday()
# already returns an ordered factor, so no separate as.factor() conversion is
# needed afterwards.
all_trips$weekday. <- lubridate::wday(all_trips$start_date, label = TRUE)
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$weekday.)
## [1] "ordered" "factor"
USE ‘levels’ FUNCTION TO CHECK FACTOR.
levels(all_trips$weekday.)
## [1] "Sun" "Mon" "Tue" "Wed" "Thu" "Fri" "Sat"
NOTE WEEKDAY. IS AN ORDERED FACTOR.
glimpse(all_trips)
## Rows: 4,588,302
## Columns: 21
## $ ride_id            <chr> "B9F73448DFBE0D45", "457C7F4B5D3DA135", "57C750326F…
## $ rideable_type      <fct> classic_bike, electric_bike, electric_bike, electri…
## $ started_at         <dttm> 2021-01-24 19:15:38, 2021-01-23 12:57:38, 2021-01-…
## $ start_date         <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ start_time         <chr> "19:15:38", "12:57:38", "15:28:04", "15:28:57", "15…
## $ ended_at           <dttm> 2021-01-24 19:22:51, 2021-01-23 13:02:10, 2021-01-…
## $ end_date           <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ end_time           <chr> "19:22:51", "13:02:10", "15:37:51", "15:37:54", "16…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> "Wood St & Augusta Blvd", "California Ave & North A…
## $ end_station_id     <chr> "657", "13258", "657", "657", "657", "KA1504000135"…
## $ start_lat          <dbl> 41.90036, 41.90041, 41.90037, 41.90038, 41.90036, 4…
## $ start_lng          <dbl> -87.69670, -87.69673, -87.69669, -87.69672, -87.696…
## $ end_lat            <dbl> 41.89918, 41.91044, 41.89918, 41.89915, 41.89918, 4…
## $ end_lng            <dbl> -87.67220, -87.69689, -87.67218, -87.67218, -87.672…
## $ member_casual      <fct> member, member, casual, casual, casual, member, mem…
## $ ride_length_secs   <dbl> 433, 272, 587, 537, 609, 1233, 360, 268, 1103, 1025…
## $ ride_length_total  <dbl> 7.216667, 4.533333, 9.783333, 8.950000, 10.150000, …
## $ weekday            <dbl> 1, 7, 7, 7, 1, 6, 3, 7, 4, 6, 1, 2, 5, 2, 6, 7, 5, …
## $ weekday.           <ord> Sun, Sat, Sat, Sat, Sun, Fri, Tue, Sat, Wed, Fri, S…
EXPLORE NUMERIC VARIABLE TYPE IN “weekday” COLUMN.
USE ‘class’ FUNCTION TO CHECK DATA TYPE IN COLUMN.
class(all_trips$weekday)
## [1] "numeric"
USE ‘summary()’ FUNCTION TO SUMMARIZE VALUES IN DATA FRAME.
summary(all_trips$weekday)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   4.000   4.106   6.000   7.000
BOX PLOT IS A GRAPHICAL REPRESENTATION TO SUMMARIZE DATA AND IDENTIFY OUTLIERS.
boxplot(all_trips$weekday, col = 'blue') 

HISTOGRAM TO VISUALIZE DISTRIBUTION OF VALUES IN WEEKDAY COLUMN.
hist(all_trips$weekday, col='green') 

NOTE WEEKDAY IS NOW A ‘dbl’.
glimpse(all_trips)
## Rows: 4,588,302
## Columns: 21
## $ ride_id            <chr> "B9F73448DFBE0D45", "457C7F4B5D3DA135", "57C750326F…
## $ rideable_type      <fct> classic_bike, electric_bike, electric_bike, electri…
## $ started_at         <dttm> 2021-01-24 19:15:38, 2021-01-23 12:57:38, 2021-01-…
## $ start_date         <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ start_time         <chr> "19:15:38", "12:57:38", "15:28:04", "15:28:57", "15…
## $ ended_at           <dttm> 2021-01-24 19:22:51, 2021-01-23 13:02:10, 2021-01-…
## $ end_date           <dttm> 2021-01-24, 2021-01-23, 2021-01-09, 2021-01-09, 20…
## $ end_time           <chr> "19:22:51", "13:02:10", "15:37:51", "15:37:54", "16…
## $ start_station_name <chr> "California Ave & Cortez St", "California Ave & Cor…
## $ start_station_id   <chr> "17660", "17660", "17660", "17660", "17660", "17660…
## $ end_station_name   <chr> "Wood St & Augusta Blvd", "California Ave & North A…
## $ end_station_id     <chr> "657", "13258", "657", "657", "657", "KA1504000135"…
## $ start_lat          <dbl> 41.90036, 41.90041, 41.90037, 41.90038, 41.90036, 4…
## $ start_lng          <dbl> -87.69670, -87.69673, -87.69669, -87.69672, -87.696…
## $ end_lat            <dbl> 41.89918, 41.91044, 41.89918, 41.89915, 41.89918, 4…
## $ end_lng            <dbl> -87.67220, -87.69689, -87.67218, -87.67218, -87.672…
## $ member_casual      <fct> member, member, casual, casual, casual, member, mem…
## $ ride_length_secs   <dbl> 433, 272, 587, 537, 609, 1233, 360, 268, 1103, 1025…
## $ ride_length_total  <dbl> 7.216667, 4.533333, 9.783333, 8.950000, 10.150000, …
## $ weekday            <dbl> 1, 7, 7, 7, 1, 6, 3, 7, 4, 6, 1, 2, 5, 2, 6, 7, 5, …
## $ weekday.           <ord> Sun, Sat, Sat, Sat, Sun, Fri, Tue, Sat, Wed, Fri, S…
NOTE WEEKDAY IS NOW NUMERIC.
# Base R structure dump: row count, column classes and leading values.
utils::str(all_trips)
## 'data.frame':    4588302 obs. of  21 variables:
##  $ ride_id           : chr  "B9F73448DFBE0D45" "457C7F4B5D3DA135" "57C750326F9FDABE" "4D518C65E338D070" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 1 3 3 3 1 3 1 1 3 3 ...
##  $ started_at        : POSIXlt, format: "2021-01-24 19:15:38" "2021-01-23 12:57:38" ...
##  $ start_date        : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ start_time        : chr  "19:15:38" "12:57:38" "15:28:04" "15:28:57" ...
##  $ ended_at          : POSIXlt, format: "2021-01-24 19:22:51" "2021-01-23 13:02:10" ...
##  $ end_date          : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ end_time          : chr  "19:22:51" "13:02:10" "15:37:51" "15:37:54" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "Wood St & Augusta Blvd" "California Ave & North Ave" "Wood St & Augusta Blvd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "657" "13258" "657" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : Factor w/ 2 levels "casual","member": 2 2 1 1 1 2 2 2 2 2 ...
##  $ ride_length_secs  : num  433 272 587 537 609 ...
##  $ ride_length_total : num  7.22 4.53 9.78 8.95 10.15 ...
##  $ weekday           : num  1 7 7 7 1 6 3 7 4 6 ...
##  $ weekday.          : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 7 7 7 1 6 3 7 4 6 ...

NEW COLUMN RIDE_LENGTH_SECS

DELETE RIDES OF 2 MINUTES OR LESS (KEEP ride_length_secs > 120). 4473893 ROWS REMAIN.
# Keep only rides strictly longer than 120 seconds; which() also discards rows
# whose ride length is NA, exactly as subset() does.
all_trips <- all_trips[which(all_trips$ride_length_secs > 120), , drop = FALSE]
‘data.frame’: 4473893 obs. of 21 variables:
# Confirm the row count after removing the short rides.
utils::str(all_trips)
## 'data.frame':    4473893 obs. of  21 variables:
##  $ ride_id           : chr  "B9F73448DFBE0D45" "457C7F4B5D3DA135" "57C750326F9FDABE" "4D518C65E338D070" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 1 3 3 3 1 3 1 1 3 3 ...
##  $ started_at        : POSIXlt, format: "2021-01-24 19:15:38" "2021-01-23 12:57:38" ...
##  $ start_date        : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ start_time        : chr  "19:15:38" "12:57:38" "15:28:04" "15:28:57" ...
##  $ ended_at          : POSIXlt, format: "2021-01-24 19:22:51" "2021-01-23 13:02:10" ...
##  $ end_date          : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ end_time          : chr  "19:22:51" "13:02:10" "15:37:51" "15:37:54" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "Wood St & Augusta Blvd" "California Ave & North Ave" "Wood St & Augusta Blvd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "657" "13258" "657" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : Factor w/ 2 levels "casual","member": 2 2 1 1 1 2 2 2 2 2 ...
##  $ ride_length_secs  : num  433 272 587 537 609 ...
##  $ ride_length_total : num  7.22 4.53 9.78 8.95 10.15 ...
##  $ weekday           : num  1 7 7 7 1 6 3 7 4 6 ...
##  $ weekday.          : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 7 7 7 1 6 3 7 4 6 ...
DELETE RIDES OF 24 HOURS OR MORE (KEEP ride_length_secs < 86400). 4472618 ROWS REMAIN.
# Keep only rides strictly shorter than 86400 seconds (24 hours); which() also
# discards rows whose ride length is NA, exactly as subset() does.
all_trips <- all_trips[which(all_trips$ride_length_secs < 86400), , drop = FALSE]
‘data.frame’: 4472618 obs. of 21 variables:
# Confirm the row count after removing the day-long rides.
utils::str(all_trips)
## 'data.frame':    4472618 obs. of  21 variables:
##  $ ride_id           : chr  "B9F73448DFBE0D45" "457C7F4B5D3DA135" "57C750326F9FDABE" "4D518C65E338D070" ...
##  $ rideable_type     : Factor w/ 3 levels "classic_bike",..: 1 3 3 3 1 3 1 1 3 3 ...
##  $ started_at        : POSIXlt, format: "2021-01-24 19:15:38" "2021-01-23 12:57:38" ...
##  $ start_date        : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ start_time        : chr  "19:15:38" "12:57:38" "15:28:04" "15:28:57" ...
##  $ ended_at          : POSIXlt, format: "2021-01-24 19:22:51" "2021-01-23 13:02:10" ...
##  $ end_date          : POSIXlt, format: "2021-01-24" "2021-01-23" ...
##  $ end_time          : chr  "19:22:51" "13:02:10" "15:37:51" "15:37:54" ...
##  $ start_station_name: chr  "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" "California Ave & Cortez St" ...
##  $ start_station_id  : chr  "17660" "17660" "17660" "17660" ...
##  $ end_station_name  : chr  "Wood St & Augusta Blvd" "California Ave & North Ave" "Wood St & Augusta Blvd" "Wood St & Augusta Blvd" ...
##  $ end_station_id    : chr  "657" "13258" "657" "657" ...
##  $ start_lat         : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ start_lng         : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ end_lat           : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ end_lng           : num  -87.7 -87.7 -87.7 -87.7 -87.7 ...
##  $ member_casual     : Factor w/ 2 levels "casual","member": 2 2 1 1 1 2 2 2 2 2 ...
##  $ ride_length_secs  : num  433 272 587 537 609 ...
##  $ ride_length_total : num  7.22 4.53 9.78 8.95 10.15 ...
##  $ weekday           : num  1 7 7 7 1 6 3 7 4 6 ...
##  $ weekday.          : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 1 7 7 7 1 6 3 7 4 6 ...

SORT DATA FRAME BY DATE AND TIMES.

# Sort rides chronologically by start time.
# `started_at` is already a date-time (POSIXlt), so it is rendered with an
# explicit format before parsing. Relying on implicit character conversion can
# drop the "00:00:00" part of midnight timestamps (notably on R >= 4.3), which
# would make those rows fail to parse and sort to the end.
# The column is referenced by bare name so the sort key always comes from the
# data frame flowing through the pipe.
all_trips <- all_trips %>%
  arrange(ymd_hms(format(started_at, "%Y-%m-%d %H:%M:%S")))

CREATE NEW DATA FRAME (all_trips_v2) FROM DATA FRAME (all_trips).

# Analysis table: only the columns used by the summaries and charts below.
all_trips_v2 <- all_trips[
  c(
    "rideable_type",
    "started_at",
    "start_date",
    "member_casual",
    "ride_length_secs",
    "ride_length_total",
    "weekday",
    "weekday."
  )
]

DESCRIPTIVE ANALYSIS ON RIDE LENGTH.

MINIMUM TRIP TIME.
# Shortest ride, in seconds.
min(all_trips_v2[["ride_length_secs"]])
## [1] 121
MIDDLE VALUE IN JAN TO DEC 2021 DATASET.
# Median ride, in seconds.
stats::median(all_trips_v2[["ride_length_secs"]])
## [1] 752
MAXIMUM TRIP TIME.
# Longest ride, in seconds.
max(all_trips_v2[["ride_length_secs"]])
## [1] 86362
AVERAGE TRIP.
# Mean ride, in seconds.
mean(all_trips_v2[["ride_length_secs"]])
## [1] 1222.077
THE MINIMUM AND MAXIMUM TRIP TIMES (THE RANGE).
# Shortest and longest ride as a two-element vector, in seconds.
range(all_trips_v2[["ride_length_secs"]])
## [1]   121 86362
DIFFERENCE BETWEEN THE FIRST QUARTILE AND THIRD QUARTILE OF JAN TO DEC 2021.
# Interquartile range of ride length, in seconds.
stats::IQR(all_trips_v2[["ride_length_secs"]])
## [1] 913

COMPARE MEMBERS AND CASUAL RIDERS.

MEMBERS Vs CASUAL MINIMUM TRIP TIME.
# Shortest ride (seconds) per rider type. Columns are named in the formula and
# the table is passed through `data`, so the result gets plain column names.
aggregate(ride_length_secs ~ member_casual, data = all_trips_v2, FUN = min)
##   member_casual ride_length_secs
## 1        casual              121
## 2        member              121
MEMBERS Vs CASUAL MIDDLE VALUE IN JAN TO DEC 2021 DATASET.
# Median ride (seconds) per rider type. Columns are named in the formula and
# the table is passed through `data`, so the result gets plain column names.
aggregate(ride_length_secs ~ member_casual, data = all_trips_v2, FUN = median)
##   member_casual ride_length_secs
## 1        casual             1018
## 2        member              601
MEMBERS Vs CASUAL MAXIMUM TRIP TIME.
# Longest ride (seconds) per rider type. Columns are named in the formula and
# the table is passed through `data`, so the result gets plain column names.
aggregate(ride_length_secs ~ member_casual, data = all_trips_v2, FUN = max)
##   member_casual ride_length_secs
## 1        casual            86362
## 2        member            85594
MEMBERS Vs CASUAL AVERAGE TRIP.
# Mean ride (seconds) per rider type. Columns are named in the formula and
# the table is passed through `data`, so the result gets plain column names.
aggregate(ride_length_secs ~ member_casual, data = all_trips_v2, FUN = mean)
##   member_casual ride_length_secs
## 1        casual        1721.1465
## 2        member         813.9786
AVERAGE RIDE TIME IN MINUTES FOR EACH DAY FOR MEMBERS Vs CASUAL RIDERS.
# Mean ride length (ride_length_total, i.e. minutes) for every combination of
# rider type and day of week. Passing the table through `data` gives short
# column names, so the result prints as one table instead of wrapping.
aggregate(
  ride_length_total ~ member_casual + weekday.,
  data = all_trips_v2,
  FUN = mean
)
##    member_casual weekday. ride_length_total
## 1         casual      Sun          32.92928
## 2         member      Sun          15.65011
## 3         casual      Mon          29.23191
## 4         member      Mon          13.10382
## 5         casual      Tue          26.29362
## 6         member      Tue          12.73985
## 7         casual      Wed          24.78359
## 8         member      Wed          12.81149
## 9         casual      Thu          24.50382
## 10        member      Thu          12.70410
## 11        casual      Fri          26.61047
## 12        member      Fri          13.17231
## 13        casual      Sat          30.96599
## 14        member      Sat          15.24391
# Ride count and mean duration (minutes) per rider type and day of week,
# listed by rider type and then by day.
all_trips_v2 |>
  group_by(member_casual, weekday.) |>
  summarise(
    number_of_rides = n(),
    average_duration = mean(ride_length_total)
  ) |>
  arrange(member_casual, weekday.)
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.
## # A tibble: 14 × 4
## # Groups:   member_casual [2]
##    member_casual weekday. number_of_rides average_duration
##    <fct>         <ord>              <int>            <dbl>
##  1 casual        Sun               396387             32.9
##  2 casual        Mon               224750             29.2
##  3 casual        Tue               211179             26.3
##  4 casual        Wed               214303             24.8
##  5 casual        Thu               220299             24.5
##  6 casual        Fri               284966             26.6
##  7 casual        Sat               460167             31.0
##  8 member        Sun               301370             15.7
##  9 member        Mon               335368             13.1
## 10 member        Tue               376192             12.7
## 11 member        Wed               385487             12.8
## 12 member        Thu               361864             12.7
## 13 member        Fri               354222             13.2
## 14 member        Sat               346064             15.2

DATA VISUALIZATIONS AND SUMMARY.

COUNT ‘member_casual’ FOR PIE CHART.
CREATE DATA FRAME FOR PIE CHART.
MEMBER Vs CASUAL JAN TO DEC 2021 PIE CHART.
# Total rides per rider type for the year.
all_trips_v2_tot <- all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(number_of_rides = n())

# Plotting data for the pie chart, built from the counts computed above rather
# than hand-typed totals, so the chart stays correct if the filters change.
pie_cvm <- data.frame(
  group = as.character(all_trips_v2_tot$member_casual),
  value = all_trips_v2_tot$number_of_rides
)

# A single stacked bar wrapped into polar coordinates gives the pie; each
# slice is labelled with its ride count at the middle of the slice.
ggplot(pie_cvm, aes(x = "", y = value, fill = group)) +
  geom_col(width = 1) +
  coord_polar("y") +
  geom_text(aes(label = value), position = position_stack(vjust = 0.5)) +
  labs(title = "January to December 2021 Totals.") +
  theme_economist()

MEMBER Vs CASUAL JAN TO DEC 2021 DAILY TOTALS.
# Side-by-side bars of ride counts per day of week, one bar per rider type.
all_trips_v2 |>
  group_by(member_casual, weekday.) |>
  summarise(number_of_rides = n()) |>
  arrange(member_casual, weekday.) |>
  ggplot(
    mapping = aes(x = weekday., y = number_of_rides, fill = member_casual)
  ) +
  geom_col(position = "dodge") +
  labs(
    title = "January to December 2021 Daily Totals.",
    x = "Weekday",
    y = "Number of Rides"
  ) +
  theme_economist()
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

SUMMARY MEMBER Vs CASUAL JAN TO DEC 2021 DAILY TOTALS.
# Day-of-week counts and percentages, split by rider type. The two columns
# needed are selected by name instead of excluding every other column.
all_trips_v2 |>
  select(member_casual, weekday.) |>
  tbl_summary(by = member_casual)
Characteristic casual, N = 2,012,051¹ member, N = 2,460,567¹
weekday.
    Sun 396,387 (20%) 301,370 (12%)
    Mon 224,750 (11%) 335,368 (14%)
    Tue 211,179 (10%) 376,192 (15%)
    Wed 214,303 (11%) 385,487 (16%)
    Thu 220,299 (11%) 361,864 (15%)
    Fri 284,966 (14%) 354,222 (14%)
    Sat 460,167 (23%) 346,064 (14%)
¹ n (%)
MEMBER Vs CASUAL JAN TO DEC 2021 RIDEABLE TYPE.
# Side-by-side bars of ride counts per bike type, one bar per rider type.
# number_of_rides counts trips (rows), so the y axis is labelled as rides.
# No arrange() step is needed: summarise() already returns rows ordered by the
# grouping keys, and the plot does not depend on row order.
all_trips_v2 %>%
  group_by(member_casual, rideable_type) %>%
  summarise(number_of_rides = n()) %>%
  ggplot(aes(x = rideable_type, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    title = "January to December 2021 Rideable Type.",
    x = "Rideable Type",
    y = "Number of Rides"
  ) +
  theme_economist()
## `summarise()` has grouped output by 'member_casual'. You can override using the
## `.groups` argument.

SUMMARY MEMBER Vs CASUAL JAN TO DEC 2021 RIDEABLE TYPE.
# Bike-type counts and percentages, split by rider type. The two columns
# needed are selected by name instead of excluding every other column.
all_trips_v2 |>
  select(rideable_type, member_casual) |>
  tbl_summary(by = member_casual)
Characteristic casual, N = 2,012,051¹ member, N = 2,460,567¹
rideable_type
    classic_bike 1,239,033 (62%) 1,921,349 (78%)
    docked_bike 307,731 (15%) 1 (<0.1%)
    electric_bike 465,287 (23%) 539,217 (22%)
¹ n (%)
RIDEABLE TYPE JAN TO DEC 2021 DAILY TOTALS.
# Ride counts per day of week, drawn as one panel per bike type.
# number_of_rides counts trips (rows), so the y axis is labelled as rides.
# No arrange() step is needed: summarise() already returns rows ordered by the
# grouping keys, and the plot does not depend on row order.
all_trips_v2 %>%
  group_by(weekday., rideable_type) %>%
  summarise(number_of_rides = n()) %>%
  ggplot(aes(x = weekday., y = number_of_rides, fill = rideable_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~rideable_type) +
  labs(
    title = "Rideable Type, January to December 2021 Daily Totals.",
    x = "Day Of Week",
    y = "Number of Rides"
  ) +
  theme_economist()
## `summarise()` has grouped output by 'weekday.'. You can override using the
## `.groups` argument.

RIDEABLE TYPE JAN TO DEC 2021 DAILY TOTALS.
# Ride counts per day of week with the bike types side by side in one panel.
# number_of_rides counts trips (rows), so the y axis is labelled as rides.
# No arrange() step is needed: summarise() already returns rows ordered by the
# grouping keys, and the plot does not depend on row order.
all_trips_v2 %>%
  group_by(weekday., rideable_type) %>%
  summarise(number_of_rides = n()) %>%
  ggplot(aes(x = weekday., y = number_of_rides, fill = rideable_type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Rideable Type, January to December 2021 Daily Totals.",
    x = "Day Of Week",
    y = "Number of Rides"
  ) +
  theme_economist()
## `summarise()` has grouped output by 'weekday.'. You can override using the
## `.groups` argument.

SUMMARY RIDEABLE TYPE JAN TO DEC 2021 DAILY TOTALS.
# Day-of-week counts and percentages, split by bike type. The two columns
# needed are selected by name instead of excluding every other column.
all_trips_v2 |>
  select(rideable_type, weekday.) |>
  tbl_summary(by = rideable_type)
Characteristic classic_bike, N = 3,160,382¹ docked_bike, N = 307,732¹ electric_bike, N = 1,004,504¹
weekday.
    Sun 496,009 (16%) 69,353 (23%) 132,395 (13%)
    Mon 394,584 (12%) 35,927 (12%) 129,607 (13%)
    Tue 412,765 (13%) 29,173 (9.5%) 145,433 (14%)
    Wed 426,417 (13%) 27,262 (8.9%) 146,111 (15%)
    Thu 413,421 (13%) 27,250 (8.9%) 141,492 (14%)
    Fri 445,718 (14%) 41,436 (13%) 152,034 (15%)
    Sat 571,468 (18%) 77,331 (25%) 157,432 (16%)
¹ n (%)
RIDEABLE TYPE JAN TO DEC 2021 TOTALS.
# Total ride count for each bike type over the year.
# number_of_rides counts trips (rows), so the y axis is labelled as rides.
# No arrange() step is needed: summarise() already returns rows ordered by the
# grouping key, and the plot does not depend on row order.
all_trips_v2 %>%
  group_by(rideable_type) %>%
  summarise(number_of_rides = n()) %>%
  ggplot(aes(x = rideable_type, y = number_of_rides, fill = rideable_type)) +
  geom_col(position = "dodge") +
  labs(
    title = "Rideable Type, January to December 2021 Totals.",
    x = "Rideable Type",
    y = "Number of Rides"
  ) +
  theme_economist()

SUMMARY RIDEABLE TYPE JAN TO DEC 2021 TOTALS.
# Overall bike-type counts and percentages. The single column needed is
# selected by name instead of excluding every other column.
all_trips_v2 |>
  select(rideable_type) |>
  tbl_summary()
Characteristic N = 4,472,618¹
rideable_type
    classic_bike 3,160,382 (71%)
    docked_bike 307,732 (6.9%)
    electric_bike 1,004,504 (22%)
¹ n (%)

MORE TO LEARN

SESSION INFORMATION

# Record the R version, platform and package versions used for this report.
utils::sessionInfo()
## R version 4.2.3 (2023-03-15 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United Kingdom.utf8 
## [2] LC_CTYPE=English_United Kingdom.utf8   
## [3] LC_MONETARY=English_United Kingdom.utf8
## [4] LC_NUMERIC=C                           
## [5] LC_TIME=English_United Kingdom.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] ggthemes_4.2.4     RColorBrewer_1.1-3 scales_1.2.1       gtsummary_1.7.1   
##  [5] conflicted_1.2.0   janitor_2.2.0      skimr_2.1.5        here_1.0.1        
##  [9] hms_1.1.3          data.table_1.14.8  lubridate_1.9.2    forcats_1.0.0     
## [13] stringr_1.5.0      dplyr_1.1.2        purrr_1.0.1        readr_2.1.4       
## [17] tidyr_1.3.0        tibble_3.2.1       ggplot2_3.4.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] rprojroot_2.0.3      digest_0.6.29        utf8_1.2.2          
##  [4] R6_2.5.1             repr_1.1.6           evaluate_0.16       
##  [7] highr_0.9            pillar_1.9.0         rlang_1.1.0         
## [10] rstudioapi_0.14      jquerylib_0.1.4      rmarkdown_2.22      
## [13] labeling_0.4.2       munsell_0.5.0        compiler_4.2.3      
## [16] xfun_0.38            pkgconfig_2.0.3      base64enc_0.1-3     
## [19] htmltools_0.5.5      tidyselect_1.2.0     fansi_1.0.3         
## [22] crayon_1.5.1         tzdb_0.3.0           withr_2.5.0         
## [25] commonmark_1.9.0     grid_4.2.3           jsonlite_1.8.4      
## [28] gtable_0.3.0         lifecycle_1.0.3      magrittr_2.0.3      
## [31] cli_3.6.1            stringi_1.7.8        cachem_1.0.6        
## [34] farver_2.1.1         broom.helpers_1.13.0 snakecase_0.11.0    
## [37] xml2_1.3.3           bslib_0.4.0          generics_0.1.3      
## [40] vctrs_0.6.1          tools_4.2.3          glue_1.6.2          
## [43] markdown_1.5         fastmap_1.1.0        yaml_2.3.5          
## [46] timechange_0.1.1     colorspace_2.0-3     gt_0.9.0            
## [49] memoise_2.0.1        knitr_1.39           sass_0.4.6