Apache Hive and R

library(tidyverse)

## -- Attaching packages -------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## -- Conflicts ----------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(readr)

station <- read_csv("station_data.csv", col_names = FALSE)

## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_character(),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_double(),
##   X6 = col_character(),
##   X7 = col_character()
## )

trip <- read_csv("trip_data.csv", col_names = FALSE)

## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   X2 = col_double(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_double(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_character(),
##   X11 = col_character()
## )

colnames(station) <- c("id","Name","lat","long","dockcount","landmark","installation")
colnames(trip) <- c("trip_id","duration","startdate","startstation","startterminal", "enddate","endstation","endterminal","bike", "subscriptionType", "ZzipCode")

The most popular bike, i.e. the bike that has made the highest number of trips (Exclude trips that start and end at the same station)

trip %>% 
  filter(!startterminal == endterminal) %>% 
  group_by(bike) %>% 
  count(sort=TRUE) %>% 
  head(1)

## # A tibble: 1 x 2
## # Groups:   bike [1]
##    bike     n
##   <dbl> <int>
## 1   878  1090

Find the number of trips made by each subscription type

trip %>%
  filter(!startterminal == endterminal) %>% 
  group_by(subscriptionType) %>% 
  count()

## # A tibble: 2 x 2
## # Groups:   subscriptionType [2]
##   subscriptionType      n
##   <chr>             <int>
## 1 Customer          37038
## 2 Subscriber       306838

A table that shows which stations are connected, and the minimum duration between them.

trip %>%
  filter(!startstation == endstation) %>% 
  group_by(startstation, endstation) %>% 
  summarise(minduration=min(duration)) %>% 
  arrange(startstation,endstation, minduration) %>% 
  head(5)

## # A tibble: 5 x 3
## # Groups:   startstation [1]
##   startstation  endstation                minduration
##   <chr>         <chr>                           <dbl>
## 1 2nd at Folsom 2nd at South Park                  61
## 2 2nd at Folsom 2nd at Townsend                   137
## 3 2nd at Folsom 5th at Howard                     215
## 4 2nd at Folsom Beale at Market                   219
## 5 2nd at Folsom Broadway St at Battery St         351

The number of trips originating from each landmark.

table1 <- trip %>%
  left_join(station, by = c("startterminal" = "id"))

table2 <- table1 %>% 
  filter(!startstation == endstation) %>% 
  group_by(landmark) %>% 
  count()

table2

## # A tibble: 5 x 2
## # Groups:   landmark [5]
##   landmark           n
##   <chr>          <int>
## 1 Mountain View   9361
## 2 Palo Alto       2370
## 3 Redwood City    1790
## 4 San Francisco 313528
## 5 San Jose       16827

Number of trips crossing landmarks, i.e trips that originate in one landmark and end in another.

table3 <- trip %>% 
  left_join(station, by = c("startterminal" = "id"))

colnames(table3)[colnames(table3)=="landmark"] <- "start_landmark"

table4 <- trip %>% 
  left_join(station, by =  c("endterminal"= "id"))

colnames(table4)[colnames(table4)=="landmark"] <- "end_landmark"

table3 %>% 
  left_join(table4, by = c("trip_id" = "trip_id"))%>%
  filter(!start_landmark == end_landmark) %>% 
  group_by(start_landmark, end_landmark) %>% 
  count()

## # A tibble: 13 x 3
## # Groups:   start_landmark, end_landmark [13]
##    start_landmark end_landmark      n
##    <chr>          <chr>         <int>
##  1 Mountain View  Palo Alto       198
##  2 Mountain View  Redwood City      3
##  3 Mountain View  San Francisco     4
##  4 Mountain View  San Jose          6
##  5 Palo Alto      Mountain View   182
##  6 Palo Alto      Redwood City     36
##  7 Palo Alto      San Francisco     4
##  8 Redwood City   Mountain View     1
##  9 Redwood City   Palo Alto        64
## 10 San Francisco  Mountain View     2
## 11 San Francisco  Redwood City      2
## 12 San Jose       Mountain View     6
## 13 San Jose       San Francisco     1

Apache Hive and R

Minh Trung DANG

28/12/2020