A brief tutorial on real data analyst situation you may get stuck with Importing, Merging, Tidying Data, DateTime Format Variable, Parsing, Exploring, Summarizing, Visualizing
Let’s load some data from different files.
library(tidyverse)
## -- Attaching packages --------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
tripdata_202004 <- read_csv("202004-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202005 <- read_csv("202005-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202006 <- read_csv("202006-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202007 <- read_csv("202007-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202008 <- read_csv("202008-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202009 <- read_csv("202009-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202010 <- read_csv("202010-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202011 <- read_csv("202011-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_double(),
## end_station_name = col_character(),
## end_station_id = col_double(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202012 <- read_csv("202012-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202101 <- read_csv("202101-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202102 <- read_csv("202102-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202103 <- read_csv("202103-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202104 <- read_csv("202104-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202105 <- read_csv("202105-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202106 <- read_csv("202106-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
tripdata_202107 <- read_csv("202107-divvy-tripdata.csv")
## Parsed with column specification:
## cols(
## ride_id = col_character(),
## rideable_type = col_character(),
## started_at = col_datetime(format = ""),
## ended_at = col_datetime(format = ""),
## start_station_name = col_character(),
## start_station_id = col_character(),
## end_station_name = col_character(),
## end_station_id = col_character(),
## start_lat = col_double(),
## start_lng = col_double(),
## end_lat = col_double(),
## end_lng = col_double(),
## member_casual = col_character()
## )
Then let’s merge the database (in this example,all database have the same variables, so we can use rbind function, otherwise is always a better idea to use a left-join option after we have identifyed primary keys)
tripdata_total <- rbind(tripdata_202004,
tripdata_202005,
tripdata_202006,
tripdata_202007,
tripdata_202008,
tripdata_202009,
tripdata_202010,
tripdata_202011,
tripdata_202012,
tripdata_202101,
tripdata_202102,
tripdata_202103,
tripdata_202104,
tripdata_202105,
tripdata_202106,
tripdata_202107)
Let’s see our merged dataframe and have a first look at our data
as_tibble(tripdata_total)
## # A tibble: 5,910,616 x 13
## ride_id rideable_type started_at ended_at
## <chr> <chr> <dttm> <dttm>
## 1 A847FA~ docked_bike 2020-04-26 17:45:14 2020-04-26 18:12:03
## 2 5405B8~ docked_bike 2020-04-17 17:08:54 2020-04-17 17:17:03
## 3 5DD24A~ docked_bike 2020-04-01 17:54:13 2020-04-01 18:08:36
## 4 2A59BB~ docked_bike 2020-04-07 12:50:19 2020-04-07 13:02:31
## 5 27AD30~ docked_bike 2020-04-18 10:22:59 2020-04-18 11:15:54
## 6 356216~ docked_bike 2020-04-30 17:55:47 2020-04-30 18:01:11
## 7 A2759C~ docked_bike 2020-04-02 14:47:19 2020-04-02 14:52:32
## 8 FC8BC2~ docked_bike 2020-04-07 12:22:20 2020-04-07 13:38:09
## 9 9EC564~ docked_bike 2020-04-15 10:30:11 2020-04-15 10:35:55
## 10 A8FFF8~ docked_bike 2020-04-04 15:02:28 2020-04-04 15:19:47
## # ... with 5,910,606 more rows, and 9 more variables: start_station_name <chr>,
## # start_station_id <chr>, end_station_name <chr>, end_station_id <chr>,
## # start_lat <dbl>, start_lng <dbl>, end_lat <dbl>, end_lng <dbl>,
## # member_casual <chr>
glimpse(tripdata_total)
## Rows: 5,910,616
## Columns: 13
## $ ride_id <chr> "A847FADBBC638E45", "5405B80E996FF60D", "5DD24A7...
## $ rideable_type <chr> "docked_bike", "docked_bike", "docked_bike", "do...
## $ started_at <dttm> 2020-04-26 17:45:14, 2020-04-17 17:08:54, 2020-...
## $ ended_at <dttm> 2020-04-26 18:12:03, 2020-04-17 17:17:03, 2020-...
## $ start_station_name <chr> "Eckhart Park", "Drake Ave & Fullerton Ave", "Mc...
## $ start_station_id <chr> "86", "503", "142", "216", "125", "173", "35", "...
## $ end_station_name <chr> "Lincoln Ave & Diversey Pkwy", "Kosciuszko Park"...
## $ end_station_id <chr> "152", "499", "255", "657", "323", "35", "635", ...
## $ start_lat <dbl> 41.8964, 41.9244, 41.8945, 41.9030, 41.8902, 41....
## $ start_lng <dbl> -87.6610, -87.7154, -87.6179, -87.6975, -87.6262...
## $ end_lat <dbl> 41.9322, 41.9306, 41.8679, 41.8992, 41.9695, 41....
## $ end_lng <dbl> -87.6586, -87.7238, -87.6230, -87.6722, -87.6547...
## $ member_casual <chr> "member", "member", "member", "member", "casual"...
Our merged data contain lots of duplicates, let’s remove those duplicated rows from our id column
tripdata_total_clean1<- tripdata_total[!duplicated(tripdata_total$ride_id), ]
And now let’s check for missing value
mean(is.na(tripdata_total_clean1))
## [1] 0.02045758
sum(is.na(tripdata_total_clean1))
## [1] 1571864
Since it’s just for practice, we can remove observation that contains missing values
tripdata_total_clean1 <-na.omit(tripdata_total_clean1)
But what about those DateTime columns? Wouldn’t it be easier and more intresting if we coudl extract different informations and save them for later analysis? FOrtunatly, we can easily accomplished that parsing colums (Like “started at”) from DateTime format to year + month + day + hour + minute + seconds using the lubridate pakage
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
tripdata_total_clean1$started_at <- as.POSIXct(tripdata_total_clean1$started_at, "%Y-%m-%d %H:%M:%S" )
tripdata_total_clean1$ended_at <- as.POSIXct(tripdata_total_clean1$ended_at, "%Y-%m-%d %H:%M:%S" )
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(started_year_month = paste(strftime(tripdata_total_clean1$started_at, "%Y"),
"-",
strftime(tripdata_total_clean1$started_at, "%m")))
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(ended_year_month = paste(strftime(tripdata_total_clean1$ended_at, "%Y"),
"-",
strftime(tripdata_total_clean1$ended_at, "%m")))
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(started_day = paste(strftime(tripdata_total_clean1$started_at, "%u"),
"-",
strftime(tripdata_total_clean1$started_at, "%a")))
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(ended_day = paste(strftime(tripdata_total_clean1$ended_at, "%u"),
"-",
strftime(tripdata_total_clean1$ended_at, "%a")))
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(started_hour = paste(strftime(tripdata_total_clean1$started_at, "%H")))
tripdata_total_clean1 <- tripdata_total_clean1 %>%
mutate(ended_hour = paste(strftime(tripdata_total_clean1$ended_at, "%H")))
Done it! Quite easy wasn’t it? Nowe let’s convert some of those in a numeric format
tripdata_total_clean1$started_hour<- as.numeric(as.character(tripdata_total_clean1$started_hour))
tripdata_total_clean1$ended_hour<- as.numeric(as.character(tripdata_total_clean1$ended_hour))
What if we would like to calculate a new variable? For example it could be useful to know the lenght of the bike ride taken by customers.
tripdata_total_clean1<- tripdata_total_clean1 %>%
mutate(ride_lenght_time= as.numeric(tripdata_total_clean1$ended_at - tripdata_total_clean1$started_at) / 60)
summary(tripdata_total_clean1$ride_lenght_time)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -29049.97 7.88 14.25 26.37 26.02 58720.03
Have you noticed it? Something is not making sense.Let’s check if our data show uncorrect observations…like “started date” bigger than “finished date”
tripdata_total_clean1%>%
filter(started_at > ended_at) %>%
select(ride_id)
## # A tibble: 10,267 x 1
## ride_id
## <chr>
## 1 7C1E92200AEFF70E
## 2 671BB1F73F4CD303
## 3 502B972C6B1FCAE6
## 4 68E70FDF06F0A439
## 5 6EB323BCC83A9D1D
## 6 90105A0FA1F2B0F3
## 7 BDFF2212459A9858
## 8 BFF9D20C42D3B693
## 9 15FE83B5CC494A1C
## 10 F6FDF112F975A216
## # ... with 10,257 more rows
Guess what? it seems that some uncorrect observations are within our data, so let’s drop them out for now
tripdata_total_clean1<- tripdata_total_clean1%>%
filter(started_at < ended_at)
And now let’s visualize our data! How? ggplot of course!
tripdata_total_clean1 %>%
ggplot(aes(ended_day, fill= member_casual)) +
geom_bar() +
coord_flip()
tripdata_total_clean1 %>%
ggplot(aes(ended_year_month, fill= member_casual)) +
geom_bar() +
coord_flip()
tripdata_total_clean1 %>%
ggplot(aes(started_hour, fill= member_casual)) +
geom_bar()
tripdata_total_clean1 %>%
ggplot(aes(started_hour, fill= member_casual)) +
geom_bar() +
facet_wrap(~ ended_day )
tripdata_total_clean1 %>%
group_by(rideable_type) %>%
summarise(count = length(ride_id),
'%' = (length(ride_id) / nrow(tripdata_total_clean1)) * 100,
'members_p' = (sum(member_casual == "member") / length(ride_id)) * 100,
'casual_p' = (sum(member_casual == "casual") / length(ride_id)) * 100)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 5
## rideable_type count `%` members_p casual_p
## <chr> <int> <dbl> <dbl> <dbl>
## 1 classic_bike 1780342 33.4 61.1 38.9
## 2 docked_bike 2721193 51.0 52.6 47.4
## 3 electric_bike 832646 15.6 54.7 45.3
tripdata_total_clean1 %>%
group_by(rideable_type,ended_year_month) %>%
summarise(count = length(ride_id),
'%' = (length(ride_id) / nrow(tripdata_total_clean1)) * 100,
'members_p' = (sum(member_casual == "member") / length(ride_id)) * 100,
'casual_p' = (sum(member_casual == "casual") / length(ride_id)) * 100)
## `summarise()` regrouping output by 'rideable_type' (override with `.groups` argument)
## # A tibble: 40 x 6
## # Groups: rideable_type [3]
## rideable_type ended_year_month count `%` members_p casual_p
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 classic_bike 2020 - 12 70381 1.32 84.0 16.0
## 2 classic_bike 2021 - 01 61420 1.15 86.6 13.4
## 3 classic_bike 2021 - 02 34627 0.649 83.8 16.2
## 4 classic_bike 2021 - 03 152068 2.85 70.2 29.8
## 5 classic_bike 2021 - 04 213863 4.01 67.1 32.9
## 6 classic_bike 2021 - 05 308265 5.78 60.0 40.0
## 7 classic_bike 2021 - 06 433341 8.12 56.8 43.2
## 8 classic_bike 2021 - 07 503845 9.45 52.5 47.5
## 9 classic_bike 2021 - 08 2532 0.0475 28.9 71.1
## 10 docked_bike 2020 - 04 84505 1.58 72.2 27.8
## # ... with 30 more rows
What can you tell about the different kind of bike user? How do they differ from each other? Data visualization and Exploration are useful whenever we need to spot differences or answering questions!