Memahami Data Analysis lebih dalam

knitr::include_graphics("data_input/cyclistic.png")

Menurut Google Data Analyitics certification, ada beberapa tahapan bisnis yang harus ditempuh untuk menganalisa data, diantaranya:

  • Ask
    Mengidentifikasi keperluan bisnis dari menggunakan data yang akan dianalisis. Termasuk di dalamnya key stakeholders di internal perusahaan

  • Prepare
    Mempertimbangkan kredibilitas data, berapa lama data diambil, bagaimana interval pengambilan data, apa saja variabelnya.

  • Process
    Mekanisme data wrangling sebagaimana mestinya kita membersihkan data agar dapat dipakai dengan leluasa; seperti halnya treatment pada missing values, menambahkan kolom tertentu sebagai fitur

  • Analyze
    Menggunakan semua data dan informasi yang sudah diolah ke dalam kontekstualisasi bisnis

  • Share
    Menyampaikan hasil analisa ke dalam bentuk rekomendasi, dan ide-ide bisnis

Berikut ini beberapa informasi tentang dataset cyclistic

Artikel ini merupakan persiapan saya untuk mengikuti Google Data Analytics Professional Certification,

analisis ini akan saya update hingga satu bulan kedepan

Data Preprocessing

library(tidyverse)
library(lubridate)
library(tidyr)
library(ggmap)

input data

# the environment detects that all the objects has the same length

april20 <- read.csv("data_input/202004-divvy-tripdata.csv")
may20 <- read.csv("data_input/202005-divvy-tripdata.csv")
june20 <- read.csv("data_input/202006-divvy-tripdata.csv")
july20 <- read.csv("data_input/202007-divvy-tripdata.csv")
aug20 <- read.csv("data_input/202008-divvy-tripdata.csv")
sept20 <- read.csv("data_input/202009-divvy-tripdata.csv")
oct20 <- read.csv("data_input/202010-divvy-tripdata.csv")
nov20 <- read.csv("data_input/202011-divvy-tripdata.csv")
dec20 <- read.csv("data_input/202012-divvy-tripdata.csv")
jan21 <- read.csv("data_input/202101-divvy-tripdata.csv")
feb21 <- read.csv("data_input/202102-divvy-tripdata.csv")
march21 <- read.csv("data_input/202103-divvy-tripdata.csv")

Menggabungkan data (April 20 - Maret 21) ke dalam object trip_dbl dan trip_chr

# binding rows 

trip_dbl <- bind_rows(april20, may20, june20, july20, aug20, sept20, oct20, nov20)
trip_chr <- bind_rows(dec20, jan21, feb21, march21)

glimpse(trip_dbl)
#> Rows: 2,983,223
#> Columns: 13
#> $ ride_id            <chr> "A847FADBBC638E45", "5405B80E996FF60D", "5DD24A79A4~
#> $ rideable_type      <chr> "docked_bike", "docked_bike", "docked_bike", "docke~
#> $ started_at         <chr> "2020-04-26 17:45:14", "2020-04-17 17:08:54", "2020~
#> $ ended_at           <chr> "2020-04-26 18:12:03", "2020-04-17 17:17:03", "2020~
#> $ start_station_name <chr> "Eckhart Park", "Drake Ave & Fullerton Ave", "McClu~
#> $ start_station_id   <int> 86, 503, 142, 216, 125, 173, 35, 434, 627, 377, 508~
#> $ end_station_name   <chr> "Lincoln Ave & Diversey Pkwy", "Kosciuszko Park", "~
#> $ end_station_id     <int> 152, 499, 255, 657, 323, 35, 635, 382, 359, 508, 37~
#> $ start_lat          <dbl> 41.8964, 41.9244, 41.8945, 41.9030, 41.8902, 41.896~
#> $ start_lng          <dbl> -87.6610, -87.7154, -87.6179, -87.6975, -87.6262, -~
#> $ end_lat            <dbl> 41.9322, 41.9306, 41.8679, 41.8992, 41.9695, 41.892~
#> $ end_lng            <dbl> -87.6586, -87.7238, -87.6230, -87.6722, -87.6547, -~
#> $ member_casual      <chr> "member", "member", "member", "member", "casual", "~
glimpse(trip_chr)
#> Rows: 506,525
#> Columns: 13
#> $ ride_id            <chr> "70B6A9A437D4C30D", "158A465D4E74C54A", "5262016E0F~
#> $ rideable_type      <chr> "classic_bike", "electric_bike", "electric_bike", "~
#> $ started_at         <chr> "2020-12-27 12:44:29", "2020-12-18 17:37:15", "2020~
#> $ ended_at           <chr> "2020-12-27 12:55:06", "2020-12-18 17:44:19", "2020~
#> $ start_station_name <chr> "Aberdeen St & Jackson Blvd", "", "", "", "", "", "~
#> $ start_station_id   <chr> "13157", "", "", "", "", "", "", "", "", "", "", ""~
#> $ end_station_name   <chr> "Desplaines St & Kinzie St", "", "", "", "", "", ""~
#> $ end_station_id     <chr> "TA1306000003", "", "", "", "", "", "", "", "", "",~
#> $ start_lat          <dbl> 41.87773, 41.93000, 41.91000, 41.92000, 41.80000, 4~
#> $ start_lng          <dbl> -87.65479, -87.70000, -87.69000, -87.70000, -87.590~
#> $ end_lat            <dbl> 41.88872, 41.91000, 41.93000, 41.91000, 41.80000, 4~
#> $ end_lng            <dbl> -87.64445, -87.70000, -87.70000, -87.70000, -87.590~
#> $ member_casual      <chr> "member", "member", "member", "member", "member", "~

Menyesuaikan trip_dbl dan trip_chr, kemudian menggabungkan keduanya

trip_dbl <-  mutate(
  trip_dbl,
  start_station_id = as.character(start_station_id),
  end_station_id = as.character(end_station_id)
) 

# joining all the data

all_tripdata <- bind_rows(trip_chr, trip_dbl)

Melakukan sedikit rekayasa pada nama stasiun yang kosong dengan value NA

all_tripdata <- all_tripdata %>%
  mutate(started_at = as_datetime(started_at),
         ended_at = as_datetime(ended_at)) %>%
  mutate(start_station_name = ifelse(start_station_name == "",
                                     NA,
                                     as.character(start_station_name))) %>%
  mutate(start_station_id = ifelse(start_station_id == "",
                                   NA,
                                   as.character(start_station_id))) %>%
  mutate(end_station_name = ifelse(end_station_name == "",
                                   NA,
                                   as.character(end_station_name))) %>%
  mutate(end_station_id = ifelse(end_station_id == "",
                                 NA,
                                 as.character(end_station_id))) %>%
  fill(c( #mengisi NA dengan value yang muncul sebelumnya
    start_station_name,
    start_station_id,
    end_station_name,
    end_station_id
  ))

adding new feature: duration

all_tripdata$duration_mins <-
  difftime(
    as.POSIXct(all_tripdata[[4]], format = "%Y-%m-%d %H:%M:%S"),
    as.POSIXct(all_tripdata[[3]], format = "%Y-%m-%d %H:%M:%S"),
    units = "min"
  )

head(all_tripdata)

Menyesuaikan tipe data, merapikan urutan kolom

options(scipen = 100)
clean_data <- 
all_tripdata %>%
  mutate(rideable_type = as.factor(rideable_type)) %>% 
  mutate(start_station_name = as.factor(start_station_name)) %>%
  mutate(end_station_name = as.factor(end_station_name)) %>%
  mutate(end_station_id = as.factor(end_station_id)) %>%
  mutate(member_casual = as.factor(member_casual)) %>%  
  mutate(duration_mins = as.double(duration_mins)) %>% 
  select(
    
    ride_id, rideable_type, ended_at, started_at, duration_mins, start_station_name, 
    start_station_id, end_station_name, end_station_id, start_lat, start_lng,
    end_lat, end_lng, member_casual
    
  )
  
head(clean_data)

Menyimpan objek clean_data ke dalam data_lite.rds

RDS merupakan jenis file layaknya .csv, namun mampu mengkompresi besaran file size secara signifikan

# jangan di run

# saveRDS(clean_data, file = "data_lite.rds")

Memahami Dataset

Load data .rds

cyclistic_data <- readRDS("data_lite.rds")
summary(cyclistic_data)
#>    ride_id                rideable_type        ended_at                  
#>  Length:3489748     classic_bike : 319873   Min.   :2020-04-01 00:10:45  
#>  Class :character   docked_bike  :2558469   1st Qu.:2020-07-14 20:13:07  
#>  Mode  :character   electric_bike: 611406   Median :2020-08-29 15:21:13  
#>                                             Mean   :2020-09-10 01:46:31  
#>                                             3rd Qu.:2020-10-20 18:28:46  
#>                                             Max.   :2021-04-06 11:00:11  
#>                                                                          
#>    started_at                  duration_mins      
#>  Min.   :2020-04-01 00:00:30   Min.   :-29049.97  
#>  1st Qu.:2020-07-14 19:38:28   1st Qu.:     7.88  
#>  Median :2020-08-29 14:50:36   Median :    14.52  
#>  Mean   :2020-09-10 01:21:45   Mean   :    24.77  
#>  3rd Qu.:2020-10-20 18:14:13   3rd Qu.:    26.63  
#>  Max.   :2021-03-31 23:59:08   Max.   : 58720.03  
#>                                                   
#>                   start_station_name  start_station_id  
#>  Streeter Dr & Grand Ave   :  35717   Length:3489748    
#>  Clark St & Elm St         :  33093   Class :character  
#>  Lake Shore Dr & Monroe St :  30373   Mode  :character  
#>  Theater on the Lake       :  30256                     
#>  Michigan Ave & Oak St     :  27755                     
#>  Lake Shore Dr & North Blvd:  27539                     
#>  (Other)                   :3305015                     
#>                    end_station_name   end_station_id      start_lat    
#>  Streeter Dr & Grand Ave   :  37895   35     :  35020   Min.   :41.64  
#>  Clark St & Elm St         :  33117   177    :  29468   1st Qu.:41.88  
#>  Theater on the Lake       :  31860   176    :  28342   Median :41.90  
#>  Michigan Ave & Oak St     :  30600   76     :  26328   Mean   :41.90  
#>  Lake Shore Dr & Monroe St :  29820   268    :  25974   3rd Qu.:41.93  
#>  Lake Shore Dr & North Blvd:  28071   90     :  23053   Max.   :42.08  
#>  (Other)                   :3298385   (Other):3321563                  
#>    start_lng         end_lat         end_lng       member_casual   
#>  Min.   :-87.87   Min.   :41.54   Min.   :-88.07   casual:1430376  
#>  1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66   member:2059372  
#>  Median :-87.64   Median :41.90   Median :-87.64                   
#>  Mean   :-87.64   Mean   :41.90   Mean   :-87.64                   
#>  3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63                   
#>  Max.   :-87.52   Max.   :42.16   Max.   :-87.44                   
#>                   NA's   :4738    NA's   :4738

Memisahkan tanggal, bulan, tahun, hari dari kolom started_at

cyclistic_data$date <- as.Date(cyclistic_data$started_at) 
cyclistic_data$month <- format(as.Date(cyclistic_data$date), "%m")
cyclistic_data$day <- format(as.Date(cyclistic_data$date), "%d")
cyclistic_data$year <- format(as.Date(cyclistic_data$date), "%Y")
cyclistic_data$day_of_week <- format(as.Date(cyclistic_data$date), "%A")

Memuat kolom durasi perjalanan dan jarak tempuh (menggunakan selisih koordinat)

# duration in seconds
cyclistic_data$ride_length <- difftime(cyclistic_data$ended_at, cyclistic_data$started_at)

# the ride distance traveled in meter
cyclistic_data$ride_distance <-
  geosphere::distGeo(matrix(
    c(cyclistic_data$start_lng, cyclistic_data$start_lat),
    ncol = 2
  ), matrix(
    c(cyclistic_data$end_lng, cyclistic_data$end_lat),
    ncol = 2
  ))

# distance in KM
cyclistic_data$ride_distance <- cyclistic_data$ride_distance/1000

#At last the speed in Km/h
cyclistic_data$ride_speed = c(cyclistic_data$ride_distance)/as.numeric(c(cyclistic_data$ride_length), units="hours")

cyclistic_data <- cyclistic_data[!(cyclistic_data$start_station_name == "HQ QR" | cyclistic_data$ride_length<0),]

cyclistic_data
## Inspirasi: kalau ride distance dan ride speed nya == 0, kita samakan saja start station name dan end station name nya 

Menyamakan start_station_name dan end_station_name apabila ride distance dan ride speed == 0

cyclistic_data <- cyclistic_data %>%
  mutate(
    end_station_name = case_when(
      ride_distance == 0 &
        ride_speed == 0 ~ start_station_name,
      TRUE ~ end_station_name
    ) 
    
  ) 

cyclistic_data <- cyclistic_data %>% 
    mutate(start_station_id = as.factor(start_station_id)) %>%
    mutate(
    end_station_id = case_when(
      ride_distance == 0 &
        ride_speed == 0 ~ start_station_id,
      TRUE ~ end_station_id
    )

  )

Memeriksa Missing Value dengan Formula Khusus

options(scipen = 100)
dfcheck <- cyclistic_data %>% 
  is.na() %>% 
  colSums() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "var") %>% 
  rename(total = 2) %>% 
  filter(total !=0) %>%
  arrange(desc(total)) %>% 
  mutate(percent_missing = total/nrow(cyclistic_data))

dfcheck

Masih terdapat missing values pada beberapa kolom. Kita ingin menghilangkan saja beberapa entry data yang masih memiliki NA dengan operasi di bawah ini:

cyclistic_data <- drop_na(cyclistic_data)

Memeriksa kembali apakah masih terdapat missing values

dfcheck2 <- cyclistic_data %>% 
  is.na() %>% 
  colSums() 
dfcheck2
#>            ride_id      rideable_type           ended_at         started_at 
#>                  0                  0                  0                  0 
#>      duration_mins start_station_name   start_station_id   end_station_name 
#>                  0                  0                  0                  0 
#>     end_station_id          start_lat          start_lng            end_lat 
#>                  0                  0                  0                  0 
#>            end_lng      member_casual               date              month 
#>                  0                  0                  0                  0 
#>                day               year        day_of_week        ride_length 
#>                  0                  0                  0                  0 
#>      ride_distance         ride_speed 
#>                  0                  0

Menghitung jarak rata-rata pada class member_casual

mean_each_usertype <-
  cyclistic_data %>% group_by(member_casual) %>% summarise(mean_time = mean(duration_mins),
                                                           mean_distance = mean(ride_distance))



# transferring into viz

time_spend_member <- ggplot(mean_each_usertype) +
                geom_col(mapping=aes(x=member_casual, y=mean_time, fill= member_casual), show.legend = FALSE)+
                labs(title = "Travel time by User type",x="User Type",y="rata-rata penggunaan dalam menit")


distance_spend_member <- ggplot(mean_each_usertype) +
  geom_col(
    mapping = aes(x = member_casual, y = mean_distance, fill = member_casual),
    show.legend = FALSE
  ) +
  labs(
    title = "Travel distance by User type",
    x = "User Type",
    y = "rata-rata jarak tempuh dalam Km"
  )




gridExtra::grid.arrange(time_spend_member, distance_spend_member, ncol = 2)

## Memvisualisasikan banyaknya perjalanan berdasarkan hari

# checking  the number of rides diferences by weekday:
causal_v_member <- 
cyclistic_data %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(
    number_of_rides = n()
    ,
    average_duration = mean(ride_length),
    .groups = 'drop'
  ) %>%
  arrange(member_casual, weekday)  %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge") +
  labs(
    title = "Number of rides by User type during the week",
    x = "Days of the week",
    y = "Number of rides",
    caption = "Data by Motivate International Inc",
    fill = "User type"
  ) +
  theme(legend.position = "top")

causal_v_member

Membandingkan banyaknya perjalanan user dengan tipe layanan

options(scipen = 100)

filter_biketype <-
  cyclistic_data %>% filter(rideable_type == "classic_bike" |
                                  rideable_type == "electric_bike")

# melihat berdasarkan user type:

type_by_user <-
  filter_biketype %>%
  group_by(member_casual, rideable_type) %>%
  summarise(totals = n(), .groups = "drop")  %>%
  ggplot() +
  geom_col(aes(x = member_casual, y = totals, fill = rideable_type), position = "dodge") +
  labs(title = "Bike type usage by user type",
       x = "User type",
       y = NULL,
       fill = "Bike type") +
  scale_fill_manual(values = c("classic_bike" = "#2A363B", "electric_bike" = "#99B898")) +
  theme_minimal() +
  theme(legend.position = "top")

# melihat pola mingguan

types_comparison_weekly <-
  filter_biketype %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, rideable_type, weekday) %>%
  summarise(totals = n(), .groups = "drop") %>%
  ggplot(aes(x = weekday, y = totals, fill = rideable_type)) +
  geom_col(position = "dodge") +
  facet_wrap( ~ member_casual) +
  labs(
    title = "Bike type usage by user type during a week",
    x = "User type",
    y = NULL,
    fill = "Bike type"
  ) +
  scale_fill_manual(values = c("classic_bike" = "#2A363B", "electric_bike" = "#99B898")) +
  theme_minimal() +
  theme(legend.position = "top")

type_by_user

## Pola mingguan penggunaan layanan

types_comparison_weekly

# inspirasi: inspect perjalanan dari dan ke stasiun yang sama; juga perjalanan yang 0 miles 0 speed

memeriksa anomali sistem: perjalanan yang memiliki ride_distance dan ride speed = 0

system_anomaly <- cyclistic_data %>% 
  filter(ride_distance == 0 & ride_speed == 0)

system_anomaly

Terdapat 10 persen data yang tidak valid (305.043 entry : 3.474.416 entry) kita perlu menanyakan apakah invaliditas data ini disebabkan oleh error dari sistem/database ataukah dari sisi user

anomaly_inspect_st <-
  system_anomaly %>%
  group_by(start_station_name, end_station_name) %>%
  summarise(totals = n(), .groups = "drop") %>% arrange(desc(totals))
  
anomaly_inspect_type <-
  system_anomaly %>%
  group_by(rideable_type) %>%
  summarise(totals = n(), .groups = "drop") %>% arrange(desc(totals))
anomaly_inspect_st

Kita dapat melihat bahwa beberapa stasiun di atas memuat perjalanan yang anomali. Berdasarkan data ini kita dapat menanyakan kepada divisi lain apakah ini merupakan kesalahan sistem yang bersifat database-generated atau user-generated.

anomaly_inspect_type

Berdasarkan tipe, ternyata sepeda yang terparkir mendominasi anomali sistem.

Melihat pola perjalanan

#Lets check now the coordinates data of the rides, to see if is there any interesting pattern:

#First we create a table only for the most popular routes (>250 times)

coordinates_table <- cyclistic_data %>%
  filter(start_lng != end_lng & start_lat != end_lat) %>%
  group_by(start_lng,
           start_lat,
           end_lng,
           end_lat,
           member_casual,
           rideable_type) %>%
  summarise(total = n(), .groups = "drop") %>%
  filter(total > 250)

#Then we create two sub tables for each user type
casual <- coordinates_table %>% filter(member_casual == "casual")
member <- coordinates_table %>% filter(member_casual == "member")

#Lets store bounding box coordinates for ggmap:
chicago_boundingbox <- c(
  left = -87.700424,
  bottom = 41.790769,
  right = -87.554855,
  top = 41.990119
)

#Here we store the stamen map of Chicago
chicago_stamen <- get_stamenmap(bbox = chicago_boundingbox,
                                zoom = 12,
                                maptype = "toner-2011")
route_casual_members <- 
ggmap(chicago_stamen, darken = c(0.7, "white")) +
  geom_curve(
    casual,
    mapping = aes(
      x = start_lng,
      y = start_lat,
      xend = end_lng,
      yend = end_lat,
      alpha = total,
      color = rideable_type
    ),
    size = 0.8,
    curvature = .1,
    arrow = arrow(
      length = unit(0.7, "cm"),
      ends = "first",
      type = "closed"
    )
  ) +
  coord_cartesian() +
  labs(
    title = "Most popular routes by casual users",
    x = NULL,
    y = NULL,
    color = "User type",
    caption = "Data by Motivate International Inc, retrieved from kaggle"
  ) +
  theme(legend.position = "none")


route_annual_members <- 
ggmap(chicago_stamen, darken = c(0.7, "white")) +
  geom_curve(
    member,
    mapping = aes(
      x = start_lng,
      y = start_lat,
      xend = end_lng,
      yend = end_lat,
      alpha = total,
      color = rideable_type
    ),
    size = 0.5,
    curvature = .2,
    arrow = arrow(
      length = unit(0.5, "cm"),
      ends = "first",
      type = "closed"
    )
  ) +
  coord_cartesian() +
  labs(
    title = "Most popular routes by annual members",
    x = NULL,
    y = NULL,
    caption = "Data by Motivate International Inc, retrieved from kaggle"
  ) +
  theme(legend.position = "none") 
route_annual_members

route_casual_members

Perjalanan non member lebih banyak terfokus pada coastal area, sekitar pelabuhan. Sedangkan para member lebih tersebar. Ini dapat menunjukkan bahwa para member didominasi oleh komuter, dan para non member untuk keperluan rekreasi.

Business Question: Bagaimana pengguna non member dapat dijadikan member?