Cyclistic Strategy To Get More Annual Members

OBJECTIVES

Cyclistic, a fictional bike-sharing company operating in Chicago, needs to attract more customers and maximize its annual membership numbers in order to increase its profits. We want to understand how casual riders and annual members use Cyclistic bikes differently. We need to design a new marketing strategy to convert casual riders into annual members.

INSTALLING PACKAGES

# Installing (if missing) and loading the packages used in this analysis.
# requireNamespace() only checks that pacman is installed; it is then called
# through `pacman::`, so it does not need to be attached.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
pacman::p_load(
  tidyverse,
  here,
  inspectdf,
  plotly,
  janitor,
  visdat,
  highcharter
)


# Read the 2020 trip records from the local CSV export.
Trips_2020 <- read.csv(
  file = "~/Coursera/data analytics/Part 8/proje/proje_GSheets/Trips_2020.csv"
)

# Read the 2019 trip records from the local CSV export.
Trips1_2019 <- read.csv(
  file = "~/Coursera/data analytics/Part 8/proje/proje_GSheets/Trips1_2019.csv"
)


# Calculating duration times for 2020
# started_at / ended_at are character columns, so difftime() has to parse
# them. Without an explicit `tz` it uses the session's local time zone, which
# makes durations of trips that span a daylight-saving change depend on the
# machine running the script. Pin the zone so the result is reproducible.
# NOTE(review): assumes the timestamps are Chicago local time -- confirm
# against the data source.
Trips_2020 <- mutate(
  Trips_2020,
  duration_time = as.numeric(
    difftime(ended_at, started_at, tz = "America/Chicago", units = "secs")
  )
)

# Keep only trips with a strictly positive duration for the duration analyses.
Trips_2020_dt <- filter(Trips_2020, duration_time > 0)

DATA ANALYSIS

# Data Analysis for 2020
## Print per-column summary statistics to get an overview of the 2020 data set

summary(Trips_2020)
##    ride_id          rideable_type      started_month       started_at       
##  Length:426887      Length:426887      Length:426887      Length:426887     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  started_day          ended_at         trip_duration     start_station_name
##  Length:426887      Length:426887      Min.   :   -552   Length:426887     
##  Class :character   Class :character   1st Qu.:    329   Class :character  
##  Mode  :character   Mode  :character   Median :    550   Mode  :character  
##                                        Mean   :   1327                     
##                                        3rd Qu.:    949                     
##                                        Max.   :9387024                     
##                                                                            
##  start_station_id end_station_name   end_station_id    start_lat    
##  Min.   :  2.0    Length:426887      Min.   :  2.0   Min.   :41.74  
##  1st Qu.: 77.0    Class :character   1st Qu.: 77.0   1st Qu.:41.88  
##  Median :176.0    Mode  :character   Median :175.0   Median :41.89  
##  Mean   :209.8                       Mean   :209.3   Mean   :41.90  
##  3rd Qu.:298.0                       3rd Qu.:297.0   3rd Qu.:41.92  
##  Max.   :675.0                       Max.   :675.0   Max.   :42.06  
##                                      NA's   :1                      
##    start_lng         end_lat         end_lng       member_casual     
##  Min.   :-87.77   Min.   :41.74   Min.   :-87.77   Length:426887     
##  1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66   Class :character  
##  Median :-87.64   Median :41.89   Median :-87.64   Mode  :character  
##  Mean   :-87.64   Mean   :41.90   Mean   :-87.64                     
##  3rd Qu.:-87.63   3rd Qu.:41.92   3rd Qu.:-87.63                     
##  Max.   :-87.55   Max.   :42.06   Max.   :-87.55                     
##                   NA's   :1       NA's   :1                          
##  duration_time    
##  Min.   :   -552  
##  1st Qu.:    329  
##  Median :    550  
##  Mean   :   1326  
##  3rd Qu.:    949  
##  Max.   :9383424  
## 
filter(Trips_2020, duration_time == -552) # show the record behind the negative minimum duration (invalid data)
##            ride_id rideable_type started_month          started_at started_day
## 1 6FABADDD595AF922   docked_bike           Mar 2020-03-27 15:15:26      Friday
##              ended_at trip_duration start_station_name start_station_id
## 1 2020-03-27 15:06:14          -552              HQ QR              675
##   end_station_name end_station_id start_lat start_lng end_lat  end_lng
## 1            HQ QR            675   41.8899  -87.6803 41.8899 -87.6803
##   member_casual duration_time
## 1        casual          -552
vis_dat(slice(Trips_2020, 1:2000))

# Data Analysis for 2019
## Print per-column summary statistics to get an overview of the 2019 data set

summary(Trips1_2019)
##     trip_id         start_month         start_time         start_day        
##  Min.   :21742443   Length:365068      Length:365068      Length:365068     
##  1st Qu.:21848766   Class :character   Class :character   Class :character  
##  Median :21961830   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :21960872                                                           
##  3rd Qu.:22071824                                                           
##  Max.   :22178528                                                           
##                                                                             
##    end_time             bikeid     tripduration         t_duration      
##  Length:365068      Min.   :   1   Length:365068      Min.   :      61  
##  Class :character   1st Qu.:1777   Class :character   1st Qu.:     326  
##  Mode  :character   Median :3489   Mode  :character   Median :     524  
##                     Mean   :3429                      Mean   :    1016  
##                     3rd Qu.:5157                      3rd Qu.:     866  
##                     Max.   :6471                      Max.   :10628400  
##                                                                         
##  from_station_id from_station_name  from_address          Latitude    
##  Min.   :  2.0   Length:365068      Length:365068      Min.   :41.74  
##  1st Qu.: 76.0   Class :character   Class :character   1st Qu.:41.88  
##  Median :170.0   Mode  :character   Mode  :character   Median :41.89  
##  Mean   :198.1                                         Mean   :41.90  
##  3rd Qu.:287.0                                         3rd Qu.:41.91  
##  Max.   :665.0                                         Max.   :42.32  
##                                                        NA's   :39     
##    Longitude      to_station_id   to_station_name     to_address       
##  Min.   :-87.94   Min.   :  2.0   Length:365068      Length:365068     
##  1st Qu.:-87.65   1st Qu.: 76.0   Class :character   Class :character  
##  Median :-87.64   Median :168.5   Mode  :character   Mode  :character  
##  Mean   :-87.64   Mean   :198.6                                        
##  3rd Qu.:-87.63   3rd Qu.:287.0                                        
##  Max.   :-87.55   Max.   :665.0                                        
##  NA's   :39                                                            
##    Latitude.1     Longitude.1       usertype            gender         
##  Min.   :41.74   Min.   :-87.94   Length:365068      Length:365068     
##  1st Qu.:41.88   1st Qu.:-87.65   Class :character   Class :character  
##  Median :41.89   Median :-87.64   Mode  :character   Mode  :character  
##  Mean   :41.90   Mean   :-87.64                                        
##  3rd Qu.:41.91   3rd Qu.:-87.63                                        
##  Max.   :42.32   Max.   :-87.55                                        
##  NA's   :30      NA's   :30                                            
##    birthyear    
##  Min.   :1900   
##  1st Qu.:1975   
##  Median :1985   
##  Mean   :1982   
##  3rd Qu.:1990   
##  Max.   :2003   
##  NA's   :18023
vis_dat(slice(Trips1_2019, 1:2000))

# Categorical Analysis for 2020

# Smallest and largest computed trip duration for 2020, ignoring missing values
min(Trips_2020$duration_time, na.rm = TRUE)
## [1] -552
max(Trips_2020$duration_time, na.rm = TRUE)
## [1] 9383424
# Plot the inspect_cat() summary of the 2020 data
show_plot(inspect_cat(Trips_2020))

# Numerical Analysis for 2019

# Plot the inspect_num() summary of the 2019 data
show_plot(inspect_num(Trips1_2019))

# Smallest and largest 2019 trip duration (t_duration), ignoring missing values
min(Trips1_2019$t_duration, na.rm = TRUE)
## [1] 61
max(Trips1_2019$t_duration, na.rm = TRUE)
## [1] 10628400
# Numerical Analysis for 2020
# Plot the inspect_num() summary of the 2020 data
show_plot(inspect_num(Trips_2020))

# Categorical Analysis for 2019
# Plot the inspect_cat() summary of the 2019 data
show_plot(inspect_cat(Trips1_2019))

Billboards can be placed at the locations most used by casual users

For 2020 Most used locations

# Most used locations for 2020
# Draw one point per trip at its start coordinates, coloured by rider type.
# A column chart is not suitable here: geom_col() would stack the longitude
# values into bars instead of showing where the trips start. Points match the
# way the 2019 locations are plotted.
ggplot(
  data = Trips_2020,
  mapping = aes(
    x = start_lat,
    y = start_lng,
    color = member_casual
  )
) +
  geom_point(shape = 18, alpha = 0.75) +
  labs(
    title = "Most Used Location",
    subtitle = "for the year 2020",
    x = "Latitude",
    y = "Longitude"
  )

Duration times per member for 2020

# Duration times per member for 2020
# Line plot of trip duration against end station id for the positive-duration
# 2020 trips, one colour per rider type.
Trips_2020_dt %>%
  ggplot(aes(x = duration_time, y = end_station_id, color = member_casual)) +
  geom_line(size = 0.55, alpha = 0.65) +
  scale_color_brewer(palette = "Dark2") +
  theme_light() +
  labs(
    title = "Showing duration time by end station",
    subtitle = "For the year 2020",
    x = "Duration Time",
    y = "End Station Id",
    color = "Member Casual"
  )

While members are concentrated in the same stations, casual users are spread across different stations

Most used locations for 2019

# Most used locations for 2019
# Scatter of the Latitude / Longitude columns coloured by user type, with
# orange GLM fit lines (no confidence band) drawn on top.
Trips1_2019 %>%
  ggplot(aes(x = Latitude, y = Longitude, color = usertype)) +
  geom_point(shape = 18, alpha = 0.75) +
  geom_smooth(method = "glm", se = FALSE, color = "orange") +
  labs(
    title = "Most used locations",
    subtitle = "for the year 2019"
  )

While members are concentrated in the same stations, casual users are spread across different stations

Busiest locations for 2019

# Busiest locations for 2019
# Plot the Latitude.1 / Longitude.1 columns of the 2019 trips, one colour per
# gender.
# NOTE(review): geom_line() joins the observations in order of latitude;
# points may represent locations more faithfully -- confirm the intent.
ggplot(
  data = Trips1_2019,
  mapping = aes(
    x = Latitude.1,
    y = Longitude.1,
    color = gender
  )
) +
  geom_line() +
  labs(
    title = "Busiest locations by gender",
    subtitle = "for the year 2019",
    x = "Latitude",
    y = "Longitude"
  )

Female users appear to be less frequent

Duration times per member for 2019

# Duration times per member for 2019
# Line plot of trip duration against destination station id, coloured by user
# type, with one panel per gender value.
Trips1_2019 %>%
  ggplot(aes(x = t_duration, y = to_station_id, color = usertype)) +
  geom_line(size = 0.55, alpha = 0.65) +
  facet_wrap(vars(gender)) +
  scale_color_brewer(palette = "Dark2") +
  theme_light() +
  labs(
    title = "Showing duration time by end station",
    subtitle = "for the year 2019",
    x = "Duration Time",
    y = "End Station Id",
    color = "Usertype"
  )

Member and Casual users summary and mean for 2020

# Member and Casual users summary and mean for 2020
# Per rider type: total duration, mean duration and number of trips, using
# only trips with a positive duration. Rows are ordered by trip count,
# largest first. Summary columns are named directly instead of being renamed
# by position afterwards.
Trip2020_sum1 <- Trips_2020 %>%
  filter(duration_time > 0) %>%
  group_by(member_casual) %>%
  summarize(
    sum_duration_time = sum(duration_time),
    mean_duration_time = mean(duration_time),
    count = n()
  ) %>%
  arrange(desc(count))

# Bar heights are the total durations and bar labels are the rider types.
# (Plotting table(member_casual) would draw every bar with height 1, because
# each rider type occurs exactly once in the summary, and would pair the bars
# with labels in a different order.)
barplot(
  Trip2020_sum1$sum_duration_time,
  names.arg = Trip2020_sum1$member_casual,
  main = "Duration times by member and casual For 2020",
  xlab = "member and casual",
  ylab = "Duration Times",
  col = terrain.colors(2)
)

In 2020, the number of members was greater than the number of casual users

Subscriber and customer users summary and mean for 2019

# Subscriber and customer users summary and mean for 2019
# Per user type and gender: total duration, mean duration and number of
# trips, ordered by gender (descending). Summary columns are named directly
# and the result is ungrouped so later steps do not inherit the grouping.
Trip2019_sum1 <- Trips1_2019 %>%
  group_by(usertype, gender) %>%
  summarize(
    sum_duration_time = sum(t_duration),
    mean_duration_time = mean(t_duration),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(gender))

# One colour per user type. Bars are coloured by looking up each row's user
# type, so the legend stays correct even if a gender/user type combination is
# missing from the data.
colors <- c(Customer = "pink", Subscriber = "blue")
user <- names(colors)

barplot(
  Trip2019_sum1$sum_duration_time,
  names.arg = Trip2019_sum1$gender,
  main = "Duration times by Gender For 2019 ",
  xlab = "Gender",
  ylab = "Duration Times",
  col = unname(colors[as.character(Trip2019_sum1$usertype)])
)

legend("topleft", user, cex = 0.6, fill = colors)

In 2019, the usage time for Male was longer than the usage time for Female users

Subscriber and customer users duration time by day for 2019

# Subscriber and customer users duration time by day for 2019
# Per start day and user type: total duration, mean duration and number of
# trips, ordered by day name (descending), plus a three-letter day label for
# the plot axis.
Trip2019_day <- Trips1_2019 %>%
  group_by(start_day, usertype) %>%
  summarize(
    sum_duration_time = sum(t_duration),
    mean_duration_time = mean(t_duration),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(start_day)) %>%
  mutate(
    nstart_day = recode(
      start_day,
      "Wednesday" = "Wed",
      "Tuesday" = "Tue",
      "Thursday" = "Thu",
      "Sunday" = "Sun",
      "Saturday" = "Sat",
      "Monday" = "Mon",
      "Friday" = "Fri"
    )
  )

# One colour per user type; bars are coloured by each row's user type rather
# than by relying on the row order.
colors <- c(Customer = "green", Subscriber = "orange")
user <- names(colors)

# Alternate a wide gap (0.5) and a narrow gap (0.1) so the bars of one day sit
# together. The pattern is sized to the number of bars, which avoids the
# length-mismatch recycling warning of a fixed-length vector.
barplot(
  Trip2019_day$sum_duration_time,
  main = "Trip Times by Days for 2019",
  names.arg = Trip2019_day$nstart_day,
  space = rep_len(c(0.5, 0.1), nrow(Trip2019_day)),
  xlab = "Days",
  ylab = "Trip Times",
  col = unname(colors[as.character(Trip2019_day$usertype)])
)

legend("topleft", user, cex = 0.5, fill = colors)

In 2019 Customers use bicycles more on Tuesdays and Saturdays, while subscribers use them more during the weekdays

Member and Casual users duration time by day for 2020

# Member and Casual users duration time by day for 2020
# Per start day and rider type: total duration, mean duration and number of
# trips for the positive-duration 2020 trips, ordered by day name
# (descending), plus a three-letter day label for the plot axis.
# NOTE(review): the rows are filtered on duration_time but the totals use the
# trip_duration column -- confirm which column is intended.
Trip2020_day <- Trips_2020_dt %>%
  group_by(started_day, member_casual) %>%
  summarize(
    sum_duration_time = sum(trip_duration),
    mean_duration_time = mean(trip_duration),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(started_day)) %>%
  mutate(
    nstarted_day = recode(
      started_day,
      "Wednesday" = "Wed",
      "Tuesday" = "Tue",
      "Thursday" = "Thu",
      "Sunday" = "Sun",
      "Saturday" = "Sat",
      "Monday" = "Mon",
      "Friday" = "Fri"
    )
  )

# One colour per rider type; bars are coloured by each row's rider type rather
# than by relying on the row order.
colors <- c(casual = "red", member = "blue")
user <- names(colors)

# Alternate a wide gap (0.5) and a narrow gap (0.1) so the bars of one day sit
# together. The pattern is sized to the number of bars, which avoids the
# length-mismatch recycling warning of a fixed-length vector.
barplot(
  Trip2020_day$sum_duration_time,
  names.arg = Trip2020_day$nstarted_day,
  main = "Trip Times by Days for 2020",
  space = rep_len(c(0.5, 0.1), nrow(Trip2020_day)),
  xlab = "Days",
  ylab = "Trip Times",
  col = unname(colors[as.character(Trip2020_day$member_casual)])
)

legend("topleft", user, cex = 0.5, fill = colors)

In 2020 Casuals use bicycles more on Sundays, while Members use them more during the weekdays

Subscriber and customer users duration time by month for 2019

# Subscriber and customer users duration time by month for 2019
# Per start month and user type: total duration, mean duration and number of
# trips, ordered by month name (descending). Summary columns are named
# directly and the result is ungrouped.
Trip2019_month <- Trips1_2019 %>%
  group_by(start_month, usertype) %>%
  summarize(
    sum_duration_time = sum(t_duration),
    mean_duration_time = mean(t_duration),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(start_month))

# One colour per user type; bars are coloured by each row's user type rather
# than by relying on the row order.
colors <- c(Customer = "green", Subscriber = "orange")
user <- names(colors)

barplot(
  Trip2019_month$sum_duration_time,
  main = "Trip Times by Months for 2019",
  names.arg = Trip2019_month$start_month,
  xlab = "Months",
  ylab = "Trip Times",
  col = unname(colors[as.character(Trip2019_month$usertype)])
)

legend("topleft", user, cex = 0.5, fill = colors)

In 2019, Customers and Subscribers used bicycles more in spring than in winter

Member and Casual users duration time by month for 2020

# Member and Casual users duration time by month for 2020
# Per start month and rider type: total duration, mean duration and number of
# trips for the positive-duration 2020 trips, ordered by month name
# (descending). Summary columns are named directly and the result is
# ungrouped.
# NOTE(review): the rows are filtered on duration_time but the totals use the
# trip_duration column -- confirm which column is intended.
Trip2020_month <- Trips_2020_dt %>%
  group_by(started_month, member_casual) %>%
  summarize(
    sum_duration_time = sum(trip_duration),
    mean_duration_time = mean(trip_duration),
    count = n(),
    .groups = "drop"
  ) %>%
  arrange(desc(started_month))

# One colour per rider type; bars are coloured by each row's rider type rather
# than by relying on the row order.
colors <- c(casual = "red", member = "blue")
user <- names(colors)

barplot(
  Trip2020_month$sum_duration_time,
  main = "Trip Times by Month for 2020",
  names.arg = Trip2020_month$started_month,
  xlab = "Months",
  ylab = "Trip Times",
  col = unname(colors[as.character(Trip2020_month$member_casual)])
)

legend("topleft", user, cex = 0.5, fill = colors)

In 2020, Casuals and Members used bicycles more in spring than in winter

Most used start stations for casual users in 2020

# Most used start stations for casual riders in 2020
# Count the casual trips per start station and list the busiest first.
Trip2020_sum2 <- Trips_2020 %>%
  filter(member_casual == "casual") %>%
  group_by(start_station_id) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# highcharter::hchart(Trip2020_sum2$start_station_id)
# Keep and print the 15 busiest start stations.
top15_rows2 <- Trip2020_sum2 %>% head(n = 15)
top15_rows2
## # A tibble: 15 × 2
##    start_station_id count
##               <int> <int>
##  1              675  3766
##  2               76  1590
##  3               35  1530
##  4                3   998
##  5               90   779
##  6               85   631
##  7              177   567
##  8               43   495
##  9                6   490
## 10              341   465
## 11              199   428
## 12              268   402
## 13              255   398
## 14              623   391
## 15                2   335

Most used start stations for customers in 2019

# Most used start stations for customers in 2019
# Count the customer trips per start station and list the busiest first.
Trip2019_sum2 <- Trips1_2019 %>%
  filter(usertype == "Customer") %>%
  group_by(from_station_id) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# highcharter::hchart(Trip2019_sum2$from_station_id)
# Keep and print the 15 busiest start stations.
top15_rows1 <- Trip2019_sum2 %>% head(n = 15)
top15_rows1
## # A tibble: 15 × 2
##    from_station_id count
##              <int> <int>
##  1              35  1219
##  2              76  1142
##  3               3   834
##  4              90   627
##  5              85   386
##  6             341   362
##  7              43   344
##  8               6   342
##  9              97   299
## 10               2   256
## 11             195   239
## 12             177   228
## 13              77   220
## 14             623   206
## 15             268   204

SUMMARY

Although the total trip time appears higher for members, the average trip time is higher for casual users. To convert casual users into members, billboards could be placed at frequently used stations. Advertisements targeting women would also increase the number of female members. Since casual users use the service more on weekends and during the spring months, seasonal or weekend memberships could be offered. Digital media can be used to inform people about these options.