1. Introduction

Bay Area Bike Share at Berkeley (Berkeleyside 2014)

1.1 Goals and Acknowledgements

Hello everyone! I’m happy to celebrate my kaggle debut with you and these intriguing datasets. The purpose of this kernel is to do a basic exploritory data analysis of the SF bike share data and start doing some basic predictions. More specifically my goals are:

Highlight oddities in data to help with future analysis
Discover variable interactions that may be useful in prediction
Attempt a simple prediction of trip duration

Data was pulled from SF Bay Area Bike Share data contributed by Ben Hamner.

Thank you in advance for your constructive comments, questions, and upvotes.

1.2 Packages

library(data.table) # Reading Data
library(kableExtra) # Table Creation
library(knitr) # Table Creation
library(dplyr) # Glimpsing Data
library(leaflet) # Map Creation
library(plotly) # Visualizations

1.3 Datasets

My analysis uses 4 data sets:

Trips:
Stations:
Weather:

trips <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/trip.csv")
stations <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/station.csv")
weather <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/weather.csv")

Trips

head(trips) %>%
  kable("html") %>%
  kable_styling()

id	duration	start_date	start_station_name	start_station_id	end_date	end_station_name	end_station_id	bike_id	subscription_type	zip_code
4576	63	8/29/2013 14:13	South Van Ness at Market	66	8/29/2013 14:14	South Van Ness at Market	66	520	Subscriber	94127
4607	70	8/29/2013 14:42	San Jose City Hall	10	8/29/2013 14:43	San Jose City Hall	10	661	Subscriber	95138
4130	71	8/29/2013 10:16	Mountain View City Hall	27	8/29/2013 10:17	Mountain View City Hall	27	48	Subscriber	97214
4251	77	8/29/2013 11:29	San Jose City Hall	10	8/29/2013 11:30	San Jose City Hall	10	26	Subscriber	95060
4299	83	8/29/2013 12:02	South Van Ness at Market	66	8/29/2013 12:04	Market at 10th	67	319	Subscriber	94103
4927	103	8/29/2013 18:54	Golden Gate at Polk	59	8/29/2013 18:56	Golden Gate at Polk	59	527	Subscriber	94109

glimpse(trips)

## Observations: 669,959
## Variables: 11
## $ id                 <int> 4576, 4607, 4130, 4251, 4299, 4927, 4500, 4...
## $ duration           <int> 63, 70, 71, 77, 83, 103, 109, 111, 113, 114...
## $ start_date         <chr> "8/29/2013 14:13", "8/29/2013 14:42", "8/29...
## $ start_station_name <chr> "South Van Ness at Market", "San Jose City ...
## $ start_station_id   <int> 66, 10, 27, 10, 66, 59, 4, 8, 66, 10, 49, 6...
## $ end_date           <chr> "8/29/2013 14:14", "8/29/2013 14:43", "8/29...
## $ end_station_name   <chr> "South Van Ness at Market", "San Jose City ...
## $ end_station_id     <int> 66, 10, 27, 10, 67, 59, 5, 8, 66, 11, 54, 4...
## $ bike_id            <int> 520, 661, 48, 26, 319, 527, 679, 687, 553, ...
## $ subscription_type  <chr> "Subscriber", "Subscriber", "Subscriber", "...
## $ zip_code           <chr> "94127", "95138", "97214", "95060", "94103"...

Description of trip data

Stations

head(stations) %>%
  kable("html") %>%
  kable_styling()

id	name	lat	long	dock_count	city	installation_date
2	San Jose Diridon Caltrain Station	37.32973	-121.9018	27	San Jose	8/6/2013
3	San Jose Civic Center	37.33070	-121.8890	15	San Jose	8/5/2013
4	Santa Clara at Almaden	37.33399	-121.8949	11	San Jose	8/6/2013
5	Adobe on Almaden	37.33141	-121.8932	19	San Jose	8/5/2013
6	San Pedro Square	37.33672	-121.8941	15	San Jose	8/7/2013
7	Paseo de San Antonio	37.33380	-121.8869	15	San Jose	8/7/2013

glimpse(stations)

## Observations: 70
## Variables: 7
## $ id                <int> 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ name              <chr> "San Jose Diridon Caltrain Station", "San Jo...
## $ lat               <dbl> 37.32973, 37.33070, 37.33399, 37.33141, 37.3...
## $ long              <dbl> -121.9018, -121.8890, -121.8949, -121.8932, ...
## $ dock_count        <int> 27, 15, 11, 19, 15, 15, 15, 15, 15, 19, 19, ...
## $ city              <chr> "San Jose", "San Jose", "San Jose", "San Jos...
## $ installation_date <chr> "8/6/2013", "8/5/2013", "8/6/2013", "8/5/201...

Description of station data

Weather

head(weather) %>%
  kable("html") %>%
  kable_styling()

date	max_temperature_f	mean_temperature_f	min_temperature_f	max_dew_point_f	mean_dew_point_f	min_dew_point_f	max_humidity	mean_humidity	min_humidity	max_sea_level_pressure_inches	mean_sea_level_pressure_inches	min_sea_level_pressure_inches	max_visibility_miles	mean_visibility_miles	min_visibility_miles	max_wind_Speed_mph	mean_wind_speed_mph	max_gust_speed_mph	cloud_cover	wind_dir_degrees	zip_code
8/29/2013	74	68	61	61	58	56	93	75	57	30.07	30.02	29.97	10	10	10	23	11	28	4	286	94107
8/30/2013	78	69	60	61	58	56	90	70	50	30.05	30.00	29.93	10	10	7	29	13	35	2	291	94107
8/31/2013	71	64	57	57	56	54	93	75	57	30.00	29.96	29.92	10	10	10	26	15	31	4	284	94107
9/1/2013	74	66	58	60	56	53	87	68	49	29.96	29.93	29.91	10	10	10	25	13	29	4	284	94107
9/2/2013	75	69	62	61	60	58	93	77	61	29.97	29.94	29.90	10	10	6	23	12	30	6	277	94107
9/3/2013	73	67	60	59	56	51	84	65	46	30.02	29.98	29.95	10	10	10	24	15	31	2	276	94107

glimpse(weather)

## Observations: 3,665
## Variables: 24
## $ date                           <chr> "8/29/2013", "8/30/2013", "8/31...
## $ max_temperature_f              <dbl> 74, 78, 71, 74, 75, 73, 74, 72,...
## $ mean_temperature_f             <dbl> 68, 69, 64, 66, 69, 67, 68, 66,...
## $ min_temperature_f              <dbl> 61, 60, 57, 58, 62, 60, 61, 60,...
## $ max_dew_point_f                <dbl> 61, 61, 57, 60, 61, 59, 59, 57,...
## $ mean_dew_point_f               <dbl> 58, 58, 56, 56, 60, 56, 57, 56,...
## $ min_dew_point_f                <dbl> 56, 56, 54, 53, 58, 51, 56, 54,...
## $ max_humidity                   <dbl> 93, 90, 93, 87, 93, 84, 90, 90,...
## $ mean_humidity                  <dbl> 75, 70, 75, 68, 77, 65, 72, 74,...
## $ min_humidity                   <dbl> 57, 50, 57, 49, 61, 46, 53, 57,...
## $ max_sea_level_pressure_inches  <dbl> 30.07, 30.05, 30.00, 29.96, 29....
## $ mean_sea_level_pressure_inches <dbl> 30.02, 30.00, 29.96, 29.93, 29....
## $ min_sea_level_pressure_inches  <dbl> 29.97, 29.93, 29.92, 29.91, 29....
## $ max_visibility_miles           <dbl> 10, 10, 10, 10, 10, 10, 10, 10,...
## $ mean_visibility_miles          <dbl> 10, 10, 10, 10, 10, 10, 10, 10,...
## $ min_visibility_miles           <dbl> 10, 7, 10, 10, 6, 10, 10, 10, 1...
## $ max_wind_Speed_mph             <dbl> 23, 29, 26, 25, 23, 24, 29, 31,...
## $ mean_wind_speed_mph            <dbl> 11, 13, 15, 13, 12, 15, 19, 21,...
## $ max_gust_speed_mph             <dbl> 28, 35, 31, 29, 30, 31, 35, 37,...
## $ precipitation_inches           <chr> "0", "0", "0", "0", "0", "0", "...
## $ cloud_cover                    <dbl> 4, 2, 4, 4, 6, 2, 4, 3, 0, 1, 2...
## $ events                         <chr> "", "", "", "", "", "", "", "",...
## $ wind_dir_degrees               <dbl> 286, 291, 284, 284, 277, 276, 2...
## $ zip_code                       <int> 94107, 94107, 94107, 94107, 941...

Description of weather data

2. Individual Feature Visualizations

2.1 Trips Data

2.1.1 Distribution of duration

# Checking Earliest Date (Day of the Week)
weekdays(sort(as.Date(trips$start_date, format = "%m/%d/%Y %H:%M"), decreasing = F)[1])

## [1] "Thursday"

# Aggregating Data by Day
duration <- aggregate(trips$duration, by = list(Category = as.Date(trips$start_date, format = "%m/%d/%Y %H:%M")), FUN = mean)$x

# Convert to Time Series (in Hours)
durationts <- ts(duration/(60*60), frequency=7, start=c(1,5))

# Creating Decomposed Dataset
durationcomp <- decompose(durationts)

# Looking at the Different Componants
plot_ly(x = factor(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), 
                   levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), ordered = TRUE),
     y = durationcomp$seasonal[5:11],
     type = "bar")

# Looks like people take longer trips on the weekends

finding out it is log distibuted

finding the anomolies (6 months) and 525421 where the duration is different than the start and end dates

bar chart of subscription type

time series of number of trips

Bar chart of end destination stations

Bar chart of begining destination stations

Map of trips from station to station

2.2 Station Data

2.2.1 Bike Share Locations

Let’s visualize the locations of the different bike stations using leaflet:

leaflet() %>%
  addTiles() %>% 
  addMarkers(lng=stations$long, lat=stations$lat, label = stations$name)

2.2.2 Size of Stations

Looks like there is a line of stations around Market Street and the coastline. Let’s take a look at where the more popular locations are…

leaflet() %>%
  addTiles() %>%  # Add default OpenStreetMap map tiles
  addCircles(lng = stations$long, 
             lat = stations$lat, 
             radius = stations$dock_count*5,
             label = paste(stations$name, "--", "Dock Count:", stations$dock_count))

plot_ly(
  x = unique(stations$city),
  y = c(sum(stations[stations$city == unique(stations$city)[1],]$dock_count), 
        sum(stations[stations$city == unique(stations$city)[2],]$dock_count),
        sum(stations[stations$city == unique(stations$city)[3],]$dock_count),
        sum(stations[stations$city == unique(stations$city)[4],]$dock_count),
        sum(stations[stations$city == unique(stations$city)[5],]$dock_count)),
  name = "Number of Bike Docks by City",
  type = "bar"
)

That’s not too surprising considering that San Fransisco has the most number of bike stations. Let’s look at the number of bike docks per station by city:

plot_ly(
  x = unique(stations$city),
  y = c(sum(stations[stations$city == unique(stations$city)[1],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[1],]$name)), 
        sum(stations[stations$city == unique(stations$city)[2],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[2],]$name)),
        sum(stations[stations$city == unique(stations$city)[3],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[3],]$name)),
        sum(stations[stations$city == unique(stations$city)[4],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[4],]$name)),
        sum(stations[stations$city == unique(stations$city)[5],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[5],]$name))),
  name = "Number of Bike Docks by City",
  type = "bar"
)

On average, the bike stations in San Fransisco are larger than the nearby cities

# Sorting Data in Decending Order by Factorizing Name 
stations$name <- factor(stations$name, levels = unique(stations$name)[order(stations$dock_count, decreasing = TRUE)])

# Plotly Bar Chart
plot_ly(
  x = stations$name,
  y = stations$dock_count,
  name = "Number of Bike Docks by City",
  type = "bar"
)

Plotting distribution of dock size of

plot_ly(x = density(aggregate(stations$dock_count, by=list(Category=stations$name), FUN=sum)$x)$x, 
        y = density(aggregate(stations$dock_count, by=list(Category=stations$name), FUN=sum)$x)$y, 
                               mode = "lines", fill = "tozeroy", name = "Density")

This may indicate we want to include dummies for the different cities.

2.2.3 Look at age of stations

Are there stations that are notably older than others?

# Calculating Average Age as of April 11th, 2018
stations$age <- as.numeric(as.Date("4-11-2018", format = "%m-%d-%Y")) - as.numeric(as.Date(stations$installation_date, format = "%m/%d/%Y"))

# Converting Age to Years
stations$age <- stations$age/365.25

# Making a Table By City
agetable <- aggregate(stations$age, by=list(Category=stations$city), FUN = mean)

# Rounding to Hundredths Place
agetable$x <- round(agetable$x, digits = 2)

# Renaming and Ordering Columns
agetable[order(agetable$x, decreasing = TRUE),] %>% 
  na.omit() %>%
  dplyr::rename("City" = "Category",
                "Average Age (Years)" = "x") %>%

# Making Kable Table
    kable("html", row.names = FALSE) %>%
    kable_styling(full_width = F)

City	Average Age (Years)
Palo Alto	4.66
San Francisco	4.62
San Jose	4.61
Redwood City	4.59
Mountain View	4.55

2.3 Weather

2.3.1 Weather by Zip Code

want to include that as a dummy variable

 table(tolower(weather$events), weather$zip_code)

##                    
##                     94041 94063 94107 94301 95113
##                       639   639   574   653   638
##   fog                  10    17    54    18    13
##   fog-rain              2     2    10     1     2
##   rain                 82    75    93    60    80
##   rain-thunderstorm     0     0     2     1     0

Zip code 94107 seems to have the worst weather in terms of days with events

aggregate(as.numeric(weather$precipitation_inches), by = list(Category = weather$zip_code), FUN = mean, na.rm = TRUE)

##   Category           x
## 1    94041 0.031493599
## 2    94063 0.004583902
## 3    94107 0.037058824
## 4    94301 0.005631868
## 5    95113 0.028198847

94107 still seems to have the most rainfall over these two years

aggregate(as.numeric(weather$mean_temperature_f), by = list(Category = weather$zip_code), FUN = mean, na.rm = TRUE)

##   Category        x
## 1    94041 61.38082
## 2    94063 60.91132
## 3    94107 60.42701
## 4    94301 62.08060
## 5    95113 61.94543

aggregate(as.numeric(weather$mean_wind_speed_mph), by = list(Category=weather$zip_code), FUN = mean, na.rm = TRUE)

##   Category        x
## 1    94041 4.755798
## 2    94063 4.016371
## 3    94107 8.210095
## 4    94301 6.931694
## 5    95113 5.897681

EDA of SF Bike Share Data

Alex Kaechele

April 9th, 2018