knitr::include_graphics("data_input/cyclistic.png")Menurut Google Data Analyitics certification, ada beberapa tahapan bisnis yang harus ditempuh untuk menganalisa data, diantaranya:
Ask
Mengidentifikasi keperluan bisnis dari menggunakan data yang akan dianalisis. Termasuk di dalamnya key stakeholders di internal perusahaan
Prepare
Mempertimbangkan kredibilitas data, berapa lama data diambil, bagaimana interval pengambilan data, apa saja variabelnya.
Process
Mekanisme data wrangling sebagaimana mestinya kita membersihkan data agar dapat dipakai dengan leluasa; seperti halnya treatment pada missing values, menambahkan kolom tertentu sebagai fitur
Analyze
Menggunakan semua data dan informasi yang sudah diolah ke dalam kontekstualisasi bisnis
Share
Menyampaikan hasil analisa ke dalam bentuk rekomendasi, dan ide-ide bisnis
Berikut ini beberapa informasi tentang dataset cyclistic
Artikel ini merupakan persiapan saya untuk mengikuti Google Data Analytics Professional Certification,
analisis ini akan saya update hingga satu bulan kedepan
library(tidyverse)
library(lubridate)
library(tidyr)
library(ggmap)# the environment detects that all the objects has the same length
april20 <- read.csv("data_input/202004-divvy-tripdata.csv")
may20 <- read.csv("data_input/202005-divvy-tripdata.csv")
june20 <- read.csv("data_input/202006-divvy-tripdata.csv")
july20 <- read.csv("data_input/202007-divvy-tripdata.csv")
aug20 <- read.csv("data_input/202008-divvy-tripdata.csv")
sept20 <- read.csv("data_input/202009-divvy-tripdata.csv")
oct20 <- read.csv("data_input/202010-divvy-tripdata.csv")
nov20 <- read.csv("data_input/202011-divvy-tripdata.csv")
dec20 <- read.csv("data_input/202012-divvy-tripdata.csv")
jan21 <- read.csv("data_input/202101-divvy-tripdata.csv")
feb21 <- read.csv("data_input/202102-divvy-tripdata.csv")
march21 <- read.csv("data_input/202103-divvy-tripdata.csv")trip_dbl dan trip_chr# binding rows
trip_dbl <- bind_rows(april20, may20, june20, july20, aug20, sept20, oct20, nov20)
trip_chr <- bind_rows(dec20, jan21, feb21, march21)
glimpse(trip_dbl)#> Rows: 2,983,223
#> Columns: 13
#> $ ride_id <chr> "A847FADBBC638E45", "5405B80E996FF60D", "5DD24A79A4~
#> $ rideable_type <chr> "docked_bike", "docked_bike", "docked_bike", "docke~
#> $ started_at <chr> "2020-04-26 17:45:14", "2020-04-17 17:08:54", "2020~
#> $ ended_at <chr> "2020-04-26 18:12:03", "2020-04-17 17:17:03", "2020~
#> $ start_station_name <chr> "Eckhart Park", "Drake Ave & Fullerton Ave", "McClu~
#> $ start_station_id <int> 86, 503, 142, 216, 125, 173, 35, 434, 627, 377, 508~
#> $ end_station_name <chr> "Lincoln Ave & Diversey Pkwy", "Kosciuszko Park", "~
#> $ end_station_id <int> 152, 499, 255, 657, 323, 35, 635, 382, 359, 508, 37~
#> $ start_lat <dbl> 41.8964, 41.9244, 41.8945, 41.9030, 41.8902, 41.896~
#> $ start_lng <dbl> -87.6610, -87.7154, -87.6179, -87.6975, -87.6262, -~
#> $ end_lat <dbl> 41.9322, 41.9306, 41.8679, 41.8992, 41.9695, 41.892~
#> $ end_lng <dbl> -87.6586, -87.7238, -87.6230, -87.6722, -87.6547, -~
#> $ member_casual <chr> "member", "member", "member", "member", "casual", "~
glimpse(trip_chr)#> Rows: 506,525
#> Columns: 13
#> $ ride_id <chr> "70B6A9A437D4C30D", "158A465D4E74C54A", "5262016E0F~
#> $ rideable_type <chr> "classic_bike", "electric_bike", "electric_bike", "~
#> $ started_at <chr> "2020-12-27 12:44:29", "2020-12-18 17:37:15", "2020~
#> $ ended_at <chr> "2020-12-27 12:55:06", "2020-12-18 17:44:19", "2020~
#> $ start_station_name <chr> "Aberdeen St & Jackson Blvd", "", "", "", "", "", "~
#> $ start_station_id <chr> "13157", "", "", "", "", "", "", "", "", "", "", ""~
#> $ end_station_name <chr> "Desplaines St & Kinzie St", "", "", "", "", "", ""~
#> $ end_station_id <chr> "TA1306000003", "", "", "", "", "", "", "", "", "",~
#> $ start_lat <dbl> 41.87773, 41.93000, 41.91000, 41.92000, 41.80000, 4~
#> $ start_lng <dbl> -87.65479, -87.70000, -87.69000, -87.70000, -87.590~
#> $ end_lat <dbl> 41.88872, 41.91000, 41.93000, 41.91000, 41.80000, 4~
#> $ end_lng <dbl> -87.64445, -87.70000, -87.70000, -87.70000, -87.590~
#> $ member_casual <chr> "member", "member", "member", "member", "member", "~
trip_dbl dan trip_chr, kemudian menggabungkan keduanyatrip_dbl <- mutate(
trip_dbl,
start_station_id = as.character(start_station_id),
end_station_id = as.character(end_station_id)
)
# joining all the data
all_tripdata <- bind_rows(trip_chr, trip_dbl)NAall_tripdata <- all_tripdata %>%
mutate(started_at = as_datetime(started_at),
ended_at = as_datetime(ended_at)) %>%
mutate(start_station_name = ifelse(start_station_name == "",
NA,
as.character(start_station_name))) %>%
mutate(start_station_id = ifelse(start_station_id == "",
NA,
as.character(start_station_id))) %>%
mutate(end_station_name = ifelse(end_station_name == "",
NA,
as.character(end_station_name))) %>%
mutate(end_station_id = ifelse(end_station_id == "",
NA,
as.character(end_station_id))) %>%
fill(c( #mengisi NA dengan value yang muncul sebelumnya
start_station_name,
start_station_id,
end_station_name,
end_station_id
))all_tripdata$duration_mins <-
difftime(
as.POSIXct(all_tripdata[[4]], format = "%Y-%m-%d %H:%M:%S"),
as.POSIXct(all_tripdata[[3]], format = "%Y-%m-%d %H:%M:%S"),
units = "min"
)
head(all_tripdata)options(scipen = 100)
clean_data <-
all_tripdata %>%
mutate(rideable_type = as.factor(rideable_type)) %>%
mutate(start_station_name = as.factor(start_station_name)) %>%
mutate(end_station_name = as.factor(end_station_name)) %>%
mutate(end_station_id = as.factor(end_station_id)) %>%
mutate(member_casual = as.factor(member_casual)) %>%
mutate(duration_mins = as.double(duration_mins)) %>%
select(
ride_id, rideable_type, ended_at, started_at, duration_mins, start_station_name,
start_station_id, end_station_name, end_station_id, start_lat, start_lng,
end_lat, end_lng, member_casual
)
head(clean_data)clean_data ke dalam data_lite.rdsRDS merupakan jenis file layaknya .csv, namun mampu mengkompresi besaran file size secara signifikan
# jangan di run
# saveRDS(clean_data, file = "data_lite.rds")cyclistic_data <- readRDS("data_lite.rds")summary(cyclistic_data)#> ride_id rideable_type ended_at
#> Length:3489748 classic_bike : 319873 Min. :2020-04-01 00:10:45
#> Class :character docked_bike :2558469 1st Qu.:2020-07-14 20:13:07
#> Mode :character electric_bike: 611406 Median :2020-08-29 15:21:13
#> Mean :2020-09-10 01:46:31
#> 3rd Qu.:2020-10-20 18:28:46
#> Max. :2021-04-06 11:00:11
#>
#> started_at duration_mins
#> Min. :2020-04-01 00:00:30 Min. :-29049.97
#> 1st Qu.:2020-07-14 19:38:28 1st Qu.: 7.88
#> Median :2020-08-29 14:50:36 Median : 14.52
#> Mean :2020-09-10 01:21:45 Mean : 24.77
#> 3rd Qu.:2020-10-20 18:14:13 3rd Qu.: 26.63
#> Max. :2021-03-31 23:59:08 Max. : 58720.03
#>
#> start_station_name start_station_id
#> Streeter Dr & Grand Ave : 35717 Length:3489748
#> Clark St & Elm St : 33093 Class :character
#> Lake Shore Dr & Monroe St : 30373 Mode :character
#> Theater on the Lake : 30256
#> Michigan Ave & Oak St : 27755
#> Lake Shore Dr & North Blvd: 27539
#> (Other) :3305015
#> end_station_name end_station_id start_lat
#> Streeter Dr & Grand Ave : 37895 35 : 35020 Min. :41.64
#> Clark St & Elm St : 33117 177 : 29468 1st Qu.:41.88
#> Theater on the Lake : 31860 176 : 28342 Median :41.90
#> Michigan Ave & Oak St : 30600 76 : 26328 Mean :41.90
#> Lake Shore Dr & Monroe St : 29820 268 : 25974 3rd Qu.:41.93
#> Lake Shore Dr & North Blvd: 28071 90 : 23053 Max. :42.08
#> (Other) :3298385 (Other):3321563
#> start_lng end_lat end_lng member_casual
#> Min. :-87.87 Min. :41.54 Min. :-88.07 casual:1430376
#> 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66 member:2059372
#> Median :-87.64 Median :41.90 Median :-87.64
#> Mean :-87.64 Mean :41.90 Mean :-87.64
#> 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
#> Max. :-87.52 Max. :42.16 Max. :-87.44
#> NA's :4738 NA's :4738
started_atcyclistic_data$date <- as.Date(cyclistic_data$started_at)
cyclistic_data$month <- format(as.Date(cyclistic_data$date), "%m")
cyclistic_data$day <- format(as.Date(cyclistic_data$date), "%d")
cyclistic_data$year <- format(as.Date(cyclistic_data$date), "%Y")
cyclistic_data$day_of_week <- format(as.Date(cyclistic_data$date), "%A")# duration in seconds
cyclistic_data$ride_length <- difftime(cyclistic_data$ended_at, cyclistic_data$started_at)
# the ride distance traveled in meter
cyclistic_data$ride_distance <-
geosphere::distGeo(matrix(
c(cyclistic_data$start_lng, cyclistic_data$start_lat),
ncol = 2
), matrix(
c(cyclistic_data$end_lng, cyclistic_data$end_lat),
ncol = 2
))
# distance in KM
cyclistic_data$ride_distance <- cyclistic_data$ride_distance/1000
#At last the speed in Km/h
cyclistic_data$ride_speed = c(cyclistic_data$ride_distance)/as.numeric(c(cyclistic_data$ride_length), units="hours")
cyclistic_data <- cyclistic_data[!(cyclistic_data$start_station_name == "HQ QR" | cyclistic_data$ride_length<0),]
cyclistic_data## Inspirasi: kalau ride distance dan ride speed nya == 0, kita samakan saja start station name dan end station name nya start_station_name dan end_station_name apabila ride distance dan ride speed == 0cyclistic_data <- cyclistic_data %>%
mutate(
end_station_name = case_when(
ride_distance == 0 &
ride_speed == 0 ~ start_station_name,
TRUE ~ end_station_name
)
)
cyclistic_data <- cyclistic_data %>%
mutate(start_station_id = as.factor(start_station_id)) %>%
mutate(
end_station_id = case_when(
ride_distance == 0 &
ride_speed == 0 ~ start_station_id,
TRUE ~ end_station_id
)
)options(scipen = 100)
dfcheck <- cyclistic_data %>%
is.na() %>%
colSums() %>%
as.data.frame() %>%
rownames_to_column(var = "var") %>%
rename(total = 2) %>%
filter(total !=0) %>%
arrange(desc(total)) %>%
mutate(percent_missing = total/nrow(cyclistic_data))
dfcheckMasih terdapat missing values pada beberapa kolom. Kita ingin menghilangkan saja beberapa entry data yang masih memiliki NA dengan operasi di bawah ini:
cyclistic_data <- drop_na(cyclistic_data)dfcheck2 <- cyclistic_data %>%
is.na() %>%
colSums()
dfcheck2#> ride_id rideable_type ended_at started_at
#> 0 0 0 0
#> duration_mins start_station_name start_station_id end_station_name
#> 0 0 0 0
#> end_station_id start_lat start_lng end_lat
#> 0 0 0 0
#> end_lng member_casual date month
#> 0 0 0 0
#> day year day_of_week ride_length
#> 0 0 0 0
#> ride_distance ride_speed
#> 0 0
member_casualmean_each_usertype <-
cyclistic_data %>% group_by(member_casual) %>% summarise(mean_time = mean(duration_mins),
mean_distance = mean(ride_distance))
# transferring into viz
time_spend_member <- ggplot(mean_each_usertype) +
geom_col(mapping=aes(x=member_casual, y=mean_time, fill= member_casual), show.legend = FALSE)+
labs(title = "Travel time by User type",x="User Type",y="rata-rata penggunaan dalam menit")
distance_spend_member <- ggplot(mean_each_usertype) +
geom_col(
mapping = aes(x = member_casual, y = mean_distance, fill = member_casual),
show.legend = FALSE
) +
labs(
title = "Travel distance by User type",
x = "User Type",
y = "rata-rata jarak tempuh dalam Km"
)
gridExtra::grid.arrange(time_spend_member, distance_spend_member, ncol = 2) ## Memvisualisasikan banyaknya perjalanan berdasarkan hari
# checking the number of rides diferences by weekday:
causal_v_member <-
cyclistic_data %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(
number_of_rides = n()
,
average_duration = mean(ride_length),
.groups = 'drop'
) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") +
labs(
title = "Number of rides by User type during the week",
x = "Days of the week",
y = "Number of rides",
caption = "Data by Motivate International Inc",
fill = "User type"
) +
theme(legend.position = "top")
causal_v_memberoptions(scipen = 100)
filter_biketype <-
cyclistic_data %>% filter(rideable_type == "classic_bike" |
rideable_type == "electric_bike")
# melihat berdasarkan user type:
type_by_user <-
filter_biketype %>%
group_by(member_casual, rideable_type) %>%
summarise(totals = n(), .groups = "drop") %>%
ggplot() +
geom_col(aes(x = member_casual, y = totals, fill = rideable_type), position = "dodge") +
labs(title = "Bike type usage by user type",
x = "User type",
y = NULL,
fill = "Bike type") +
scale_fill_manual(values = c("classic_bike" = "#2A363B", "electric_bike" = "#99B898")) +
theme_minimal() +
theme(legend.position = "top")
# melihat pola mingguan
types_comparison_weekly <-
filter_biketype %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, rideable_type, weekday) %>%
summarise(totals = n(), .groups = "drop") %>%
ggplot(aes(x = weekday, y = totals, fill = rideable_type)) +
geom_col(position = "dodge") +
facet_wrap( ~ member_casual) +
labs(
title = "Bike type usage by user type during a week",
x = "User type",
y = NULL,
fill = "Bike type"
) +
scale_fill_manual(values = c("classic_bike" = "#2A363B", "electric_bike" = "#99B898")) +
theme_minimal() +
theme(legend.position = "top")
type_by_user ## Pola mingguan penggunaan layanan
types_comparison_weekly# inspirasi: inspect perjalanan dari dan ke stasiun yang sama; juga perjalanan yang 0 miles 0 speedride_distance dan ride speed = 0system_anomaly <- cyclistic_data %>%
filter(ride_distance == 0 & ride_speed == 0)
system_anomalyTerdapat 10 persen data yang tidak valid (305.043 entry : 3.474.416 entry) kita perlu menanyakan apakah invaliditas data ini disebabkan oleh error dari sistem/database ataukah dari sisi user
anomaly_inspect_st <-
system_anomaly %>%
group_by(start_station_name, end_station_name) %>%
summarise(totals = n(), .groups = "drop") %>% arrange(desc(totals))
anomaly_inspect_type <-
system_anomaly %>%
group_by(rideable_type) %>%
summarise(totals = n(), .groups = "drop") %>% arrange(desc(totals))anomaly_inspect_stKita dapat melihat bahwa beberapa stasiun di atas memuat perjalanan yang anomali. Berdasarkan data ini kita dapat menanyakan kepada divisi lain apakah ini merupakan kesalahan sistem yang bersifat database-generated atau user-generated.
anomaly_inspect_typeBerdasarkan tipe, ternyata sepeda yang terparkir mendominasi anomali sistem.
#Lets check now the coordinates data of the rides, to see if is there any interesting pattern:
#First we create a table only for the most popular routes (>250 times)
coordinates_table <- cyclistic_data %>%
filter(start_lng != end_lng & start_lat != end_lat) %>%
group_by(start_lng,
start_lat,
end_lng,
end_lat,
member_casual,
rideable_type) %>%
summarise(total = n(), .groups = "drop") %>%
filter(total > 250)
#Then we create two sub tables for each user type
casual <- coordinates_table %>% filter(member_casual == "casual")
member <- coordinates_table %>% filter(member_casual == "member")
#Lets store bounding box coordinates for ggmap:
chicago_boundingbox <- c(
left = -87.700424,
bottom = 41.790769,
right = -87.554855,
top = 41.990119
)
#Here we store the stamen map of Chicago
chicago_stamen <- get_stamenmap(bbox = chicago_boundingbox,
zoom = 12,
maptype = "toner-2011")route_casual_members <-
ggmap(chicago_stamen, darken = c(0.7, "white")) +
geom_curve(
casual,
mapping = aes(
x = start_lng,
y = start_lat,
xend = end_lng,
yend = end_lat,
alpha = total,
color = rideable_type
),
size = 0.8,
curvature = .1,
arrow = arrow(
length = unit(0.7, "cm"),
ends = "first",
type = "closed"
)
) +
coord_cartesian() +
labs(
title = "Most popular routes by casual users",
x = NULL,
y = NULL,
color = "User type",
caption = "Data by Motivate International Inc, retrieved from kaggle"
) +
theme(legend.position = "none")
route_annual_members <-
ggmap(chicago_stamen, darken = c(0.7, "white")) +
geom_curve(
member,
mapping = aes(
x = start_lng,
y = start_lat,
xend = end_lng,
yend = end_lat,
alpha = total,
color = rideable_type
),
size = 0.5,
curvature = .2,
arrow = arrow(
length = unit(0.5, "cm"),
ends = "first",
type = "closed"
)
) +
coord_cartesian() +
labs(
title = "Most popular routes by annual members",
x = NULL,
y = NULL,
caption = "Data by Motivate International Inc, retrieved from kaggle"
) +
theme(legend.position = "none") route_annual_membersroute_casual_membersPerjalanan non member lebih banyak terfokus pada coastal area, sekitar pelabuhan. Sedangkan para member lebih tersebar. Ini dapat menunjukkan bahwa para member didominasi oleh komuter, dan para non member untuk keperluan rekreasi.
Business Question: Bagaimana pengguna non member dapat dijadikan member?