Bay Area Bike Share at Berkeley (Berkeleyside 2014)
Hello everyone! I’m happy to celebrate my kaggle debut with you and these intriguing datasets. The purpose of this kernel is to do a basic exploritory data analysis of the SF bike share data and start doing some basic predictions. More specifically my goals are:
Data was pulled from SF Bay Area Bike Share data contributed by Ben Hamner.
Thank you in advance for your constructive comments, questions, and upvotes.
library(data.table) # Reading Data
library(kableExtra) # Table Creation
library(knitr) # Table Creation
library(dplyr) # Glimpsing Data
library(leaflet) # Map Creation
library(plotly) # VisualizationsMy analysis uses 4 data sets:
trips <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/trip.csv")
stations <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/station.csv")
weather <- fread("C:/Users/AK055125/Cerner Corporation/Kaechele, Alex/Analytics Learning Lab Projects/Non Projects - NA/Kaggle EDA Project/weather.csv")head(trips) %>%
kable("html") %>%
kable_styling()| id | duration | start_date | start_station_name | start_station_id | end_date | end_station_name | end_station_id | bike_id | subscription_type | zip_code |
|---|---|---|---|---|---|---|---|---|---|---|
| 4576 | 63 | 8/29/2013 14:13 | South Van Ness at Market | 66 | 8/29/2013 14:14 | South Van Ness at Market | 66 | 520 | Subscriber | 94127 |
| 4607 | 70 | 8/29/2013 14:42 | San Jose City Hall | 10 | 8/29/2013 14:43 | San Jose City Hall | 10 | 661 | Subscriber | 95138 |
| 4130 | 71 | 8/29/2013 10:16 | Mountain View City Hall | 27 | 8/29/2013 10:17 | Mountain View City Hall | 27 | 48 | Subscriber | 97214 |
| 4251 | 77 | 8/29/2013 11:29 | San Jose City Hall | 10 | 8/29/2013 11:30 | San Jose City Hall | 10 | 26 | Subscriber | 95060 |
| 4299 | 83 | 8/29/2013 12:02 | South Van Ness at Market | 66 | 8/29/2013 12:04 | Market at 10th | 67 | 319 | Subscriber | 94103 |
| 4927 | 103 | 8/29/2013 18:54 | Golden Gate at Polk | 59 | 8/29/2013 18:56 | Golden Gate at Polk | 59 | 527 | Subscriber | 94109 |
glimpse(trips)## Observations: 669,959
## Variables: 11
## $ id <int> 4576, 4607, 4130, 4251, 4299, 4927, 4500, 4...
## $ duration <int> 63, 70, 71, 77, 83, 103, 109, 111, 113, 114...
## $ start_date <chr> "8/29/2013 14:13", "8/29/2013 14:42", "8/29...
## $ start_station_name <chr> "South Van Ness at Market", "San Jose City ...
## $ start_station_id <int> 66, 10, 27, 10, 66, 59, 4, 8, 66, 10, 49, 6...
## $ end_date <chr> "8/29/2013 14:14", "8/29/2013 14:43", "8/29...
## $ end_station_name <chr> "South Van Ness at Market", "San Jose City ...
## $ end_station_id <int> 66, 10, 27, 10, 67, 59, 5, 8, 66, 11, 54, 4...
## $ bike_id <int> 520, 661, 48, 26, 319, 527, 679, 687, 553, ...
## $ subscription_type <chr> "Subscriber", "Subscriber", "Subscriber", "...
## $ zip_code <chr> "94127", "95138", "97214", "95060", "94103"...
Description of trip data
head(stations) %>%
kable("html") %>%
kable_styling()| id | name | lat | long | dock_count | city | installation_date |
|---|---|---|---|---|---|---|
| 2 | San Jose Diridon Caltrain Station | 37.32973 | -121.9018 | 27 | San Jose | 8/6/2013 |
| 3 | San Jose Civic Center | 37.33070 | -121.8890 | 15 | San Jose | 8/5/2013 |
| 4 | Santa Clara at Almaden | 37.33399 | -121.8949 | 11 | San Jose | 8/6/2013 |
| 5 | Adobe on Almaden | 37.33141 | -121.8932 | 19 | San Jose | 8/5/2013 |
| 6 | San Pedro Square | 37.33672 | -121.8941 | 15 | San Jose | 8/7/2013 |
| 7 | Paseo de San Antonio | 37.33380 | -121.8869 | 15 | San Jose | 8/7/2013 |
glimpse(stations)## Observations: 70
## Variables: 7
## $ id <int> 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ name <chr> "San Jose Diridon Caltrain Station", "San Jo...
## $ lat <dbl> 37.32973, 37.33070, 37.33399, 37.33141, 37.3...
## $ long <dbl> -121.9018, -121.8890, -121.8949, -121.8932, ...
## $ dock_count <int> 27, 15, 11, 19, 15, 15, 15, 15, 15, 19, 19, ...
## $ city <chr> "San Jose", "San Jose", "San Jose", "San Jos...
## $ installation_date <chr> "8/6/2013", "8/5/2013", "8/6/2013", "8/5/201...
Description of station data
head(weather) %>%
kable("html") %>%
kable_styling()| date | max_temperature_f | mean_temperature_f | min_temperature_f | max_dew_point_f | mean_dew_point_f | min_dew_point_f | max_humidity | mean_humidity | min_humidity | max_sea_level_pressure_inches | mean_sea_level_pressure_inches | min_sea_level_pressure_inches | max_visibility_miles | mean_visibility_miles | min_visibility_miles | max_wind_Speed_mph | mean_wind_speed_mph | max_gust_speed_mph | precipitation_inches | cloud_cover | events | wind_dir_degrees | zip_code |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8/29/2013 | 74 | 68 | 61 | 61 | 58 | 56 | 93 | 75 | 57 | 30.07 | 30.02 | 29.97 | 10 | 10 | 10 | 23 | 11 | 28 | 0 | 4 | 286 | 94107 | |
| 8/30/2013 | 78 | 69 | 60 | 61 | 58 | 56 | 90 | 70 | 50 | 30.05 | 30.00 | 29.93 | 10 | 10 | 7 | 29 | 13 | 35 | 0 | 2 | 291 | 94107 | |
| 8/31/2013 | 71 | 64 | 57 | 57 | 56 | 54 | 93 | 75 | 57 | 30.00 | 29.96 | 29.92 | 10 | 10 | 10 | 26 | 15 | 31 | 0 | 4 | 284 | 94107 | |
| 9/1/2013 | 74 | 66 | 58 | 60 | 56 | 53 | 87 | 68 | 49 | 29.96 | 29.93 | 29.91 | 10 | 10 | 10 | 25 | 13 | 29 | 0 | 4 | 284 | 94107 | |
| 9/2/2013 | 75 | 69 | 62 | 61 | 60 | 58 | 93 | 77 | 61 | 29.97 | 29.94 | 29.90 | 10 | 10 | 6 | 23 | 12 | 30 | 0 | 6 | 277 | 94107 | |
| 9/3/2013 | 73 | 67 | 60 | 59 | 56 | 51 | 84 | 65 | 46 | 30.02 | 29.98 | 29.95 | 10 | 10 | 10 | 24 | 15 | 31 | 0 | 2 | 276 | 94107 |
glimpse(weather)## Observations: 3,665
## Variables: 24
## $ date <chr> "8/29/2013", "8/30/2013", "8/31...
## $ max_temperature_f <dbl> 74, 78, 71, 74, 75, 73, 74, 72,...
## $ mean_temperature_f <dbl> 68, 69, 64, 66, 69, 67, 68, 66,...
## $ min_temperature_f <dbl> 61, 60, 57, 58, 62, 60, 61, 60,...
## $ max_dew_point_f <dbl> 61, 61, 57, 60, 61, 59, 59, 57,...
## $ mean_dew_point_f <dbl> 58, 58, 56, 56, 60, 56, 57, 56,...
## $ min_dew_point_f <dbl> 56, 56, 54, 53, 58, 51, 56, 54,...
## $ max_humidity <dbl> 93, 90, 93, 87, 93, 84, 90, 90,...
## $ mean_humidity <dbl> 75, 70, 75, 68, 77, 65, 72, 74,...
## $ min_humidity <dbl> 57, 50, 57, 49, 61, 46, 53, 57,...
## $ max_sea_level_pressure_inches <dbl> 30.07, 30.05, 30.00, 29.96, 29....
## $ mean_sea_level_pressure_inches <dbl> 30.02, 30.00, 29.96, 29.93, 29....
## $ min_sea_level_pressure_inches <dbl> 29.97, 29.93, 29.92, 29.91, 29....
## $ max_visibility_miles <dbl> 10, 10, 10, 10, 10, 10, 10, 10,...
## $ mean_visibility_miles <dbl> 10, 10, 10, 10, 10, 10, 10, 10,...
## $ min_visibility_miles <dbl> 10, 7, 10, 10, 6, 10, 10, 10, 1...
## $ max_wind_Speed_mph <dbl> 23, 29, 26, 25, 23, 24, 29, 31,...
## $ mean_wind_speed_mph <dbl> 11, 13, 15, 13, 12, 15, 19, 21,...
## $ max_gust_speed_mph <dbl> 28, 35, 31, 29, 30, 31, 35, 37,...
## $ precipitation_inches <chr> "0", "0", "0", "0", "0", "0", "...
## $ cloud_cover <dbl> 4, 2, 4, 4, 6, 2, 4, 3, 0, 1, 2...
## $ events <chr> "", "", "", "", "", "", "", "",...
## $ wind_dir_degrees <dbl> 286, 291, 284, 284, 277, 276, 2...
## $ zip_code <int> 94107, 94107, 94107, 94107, 941...
Description of weather data
# Checking Earliest Date (Day of the Week)
weekdays(sort(as.Date(trips$start_date, format = "%m/%d/%Y %H:%M"), decreasing = F)[1])## [1] "Thursday"
# Aggregating Data by Day
duration <- aggregate(trips$duration, by = list(Category = as.Date(trips$start_date, format = "%m/%d/%Y %H:%M")), FUN = mean)$x
# Convert to Time Series (in Hours)
durationts <- ts(duration/(60*60), frequency=7, start=c(1,5))
# Creating Decomposed Dataset
durationcomp <- decompose(durationts)
# Looking at the Different Componants
plot_ly(x = factor(c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"),
levels = c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"), ordered = TRUE),
y = durationcomp$seasonal[5:11],
type = "bar")# Looks like people take longer trips on the weekendsfinding out it is log distibuted
finding the anomolies (6 months) and 525421 where the duration is different than the start and end dates
Looks like there is a line of stations around Market Street and the coastline. Let’s take a look at where the more popular locations are…
leaflet() %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addCircles(lng = stations$long,
lat = stations$lat,
radius = stations$dock_count*5,
label = paste(stations$name, "--", "Dock Count:", stations$dock_count))plot_ly(
x = unique(stations$city),
y = c(sum(stations[stations$city == unique(stations$city)[1],]$dock_count),
sum(stations[stations$city == unique(stations$city)[2],]$dock_count),
sum(stations[stations$city == unique(stations$city)[3],]$dock_count),
sum(stations[stations$city == unique(stations$city)[4],]$dock_count),
sum(stations[stations$city == unique(stations$city)[5],]$dock_count)),
name = "Number of Bike Docks by City",
type = "bar"
)That’s not too surprising considering that San Fransisco has the most number of bike stations. Let’s look at the number of bike docks per station by city:
plot_ly(
x = unique(stations$city),
y = c(sum(stations[stations$city == unique(stations$city)[1],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[1],]$name)),
sum(stations[stations$city == unique(stations$city)[2],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[2],]$name)),
sum(stations[stations$city == unique(stations$city)[3],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[3],]$name)),
sum(stations[stations$city == unique(stations$city)[4],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[4],]$name)),
sum(stations[stations$city == unique(stations$city)[5],]$dock_count)/length(unique(stations[stations$city == unique(stations$city)[5],]$name))),
name = "Number of Bike Docks by City",
type = "bar"
)On average, the bike stations in San Fransisco are larger than the nearby cities
# Sorting Data in Decending Order by Factorizing Name
stations$name <- factor(stations$name, levels = unique(stations$name)[order(stations$dock_count, decreasing = TRUE)])
# Plotly Bar Chart
plot_ly(
x = stations$name,
y = stations$dock_count,
name = "Number of Bike Docks by City",
type = "bar"
)Plotting distribution of dock size of
plot_ly(x = density(aggregate(stations$dock_count, by=list(Category=stations$name), FUN=sum)$x)$x,
y = density(aggregate(stations$dock_count, by=list(Category=stations$name), FUN=sum)$x)$y,
mode = "lines", fill = "tozeroy", name = "Density")This may indicate we want to include dummies for the different cities.
Are there stations that are notably older than others?
# Calculating Average Age as of April 11th, 2018
stations$age <- as.numeric(as.Date("4-11-2018", format = "%m-%d-%Y")) - as.numeric(as.Date(stations$installation_date, format = "%m/%d/%Y"))
# Converting Age to Years
stations$age <- stations$age/365.25
# Making a Table By City
agetable <- aggregate(stations$age, by=list(Category=stations$city), FUN = mean)
# Rounding to Hundredths Place
agetable$x <- round(agetable$x, digits = 2)
# Renaming and Ordering Columns
agetable[order(agetable$x, decreasing = TRUE),] %>%
na.omit() %>%
dplyr::rename("City" = "Category",
"Average Age (Years)" = "x") %>%
# Making Kable Table
kable("html", row.names = FALSE) %>%
kable_styling(full_width = F)| City | Average Age (Years) |
|---|---|
| Palo Alto | 4.66 |
| San Francisco | 4.62 |
| San Jose | 4.61 |
| Redwood City | 4.59 |
| Mountain View | 4.55 |
want to include that as a dummy variable
table(tolower(weather$events), weather$zip_code)##
## 94041 94063 94107 94301 95113
## 639 639 574 653 638
## fog 10 17 54 18 13
## fog-rain 2 2 10 1 2
## rain 82 75 93 60 80
## rain-thunderstorm 0 0 2 1 0
Zip code 94107 seems to have the worst weather in terms of days with events
aggregate(as.numeric(weather$precipitation_inches), by = list(Category = weather$zip_code), FUN = mean, na.rm = TRUE)## Category x
## 1 94041 0.031493599
## 2 94063 0.004583902
## 3 94107 0.037058824
## 4 94301 0.005631868
## 5 95113 0.028198847
94107 still seems to have the most rainfall over these two years
aggregate(as.numeric(weather$mean_temperature_f), by = list(Category = weather$zip_code), FUN = mean, na.rm = TRUE)## Category x
## 1 94041 61.38082
## 2 94063 60.91132
## 3 94107 60.42701
## 4 94301 62.08060
## 5 95113 61.94543
aggregate(as.numeric(weather$mean_wind_speed_mph), by = list(Category=weather$zip_code), FUN = mean, na.rm = TRUE)## Category x
## 1 94041 4.755798
## 2 94063 4.016371
## 3 94107 8.210095
## 4 94301 6.931694
## 5 95113 5.897681
94107 is also quite a bit windier