Bike_Ride Notebook

1.Install the required packages

Load all the packages installed above

library(tidyverse)
library(DBI)
library(RMySQL)

Data Extraction

3.Connect to the MYSQL database

mydb = dbConnect(MySQL(), user='root', password='password', dbname='BikeRidedb', host='Prakhars-MacBook-Air.local')

List the tables in the db

dbListTables(mydb)

## [1] "bike_tripdata"

This shows that the connectio has been established and we can see and access our table “bike-tripdata”.

List the columns of the db table

dbListFields(mydb, "bike_tripdata")

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

4.Retrieve the data from the MySQL databse

result = dbSendQuery(mydb, "SELECT * from bike_tripdata")

Load the result object into an R dataframe

df = fetch(result, n=-1)

5.Check the dimensions of df

dim(df)

## [1] 3901253      13

Data Cleaning

Check the column names

colnames(df)

##  [1] "ride_id"            "rideable_type"      "started_at"        
##  [4] "ended_at"           "start_station_name" "start_station_id"  
##  [7] "end_station_name"   "end_station_id"     "start_lat"         
## [10] "start_lng"          "end_lat"            "end_lng"           
## [13] "member_casual"

Get the details of the dataframe

summary(df)

##    ride_id          rideable_type       started_at          ended_at        
##  Length:3901253     Length:3901253     Length:3901253     Length:3901253    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  start_station_name start_station_id   end_station_name   end_station_id    
##  Length:3901253     Length:3901253     Length:3901253     Length:3901253    
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##    start_lat       start_lng         end_lat         end_lng      
##  Min.   :41.64   Min.   :-87.84   Min.   :41.51   Min.   :-88.07  
##  1st Qu.:41.88   1st Qu.:-87.66   1st Qu.:41.88   1st Qu.:-87.66  
##  Median :41.90   Median :-87.64   Median :41.90   Median :-87.64  
##  Mean   :41.90   Mean   :-87.65   Mean   :41.90   Mean   :-87.65  
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:41.93   3rd Qu.:-87.63  
##  Max.   :42.08   Max.   :-87.52   Max.   :42.15   Max.   :-87.44  
##  member_casual     
##  Length:3901253    
##  Class :character  
##  Mode  :character  
##                    
##                    
##

Find the number of value counts of “members” and “casual” riders

table(df["member_casual"])

## 
##  casual  member 
## 1722518 2178735

casual_riders_pct = 100 * table(df["member_casual"])["casual"]/dim(df)[1] 
members_pct = 100 * table(df["member_casual"])["member"]/dim(df)[1]
cat("Percentage of casual riders: ", casual_riders_pct, "\n")

## Percentage of casual riders:  44.15294

cat("Percentage of members: ", members_pct)

## Percentage of members:  55.84706

The data is not skewed towards either members or casual riders as the difference between the number of members and casual riders is only about 10%.

Drop columns not required for analysis : Drop following columns : ‘start_station_name’, ‘start_station_id’, ‘end_station_name’, ‘end_station_id’.

df = subset(df, select = -c(start_station_name, start_station_id, end_station_name,   end_station_id) )
colnames(df)

## [1] "ride_id"       "rideable_type" "started_at"    "ended_at"     
## [5] "start_lat"     "start_lng"     "end_lat"       "end_lng"      
## [9] "member_casual"

Find missing values if any

sapply(df,  function(x) filter(df,x == "")%>%nrow())

##       ride_id rideable_type    started_at      ended_at     start_lat 
##             0             0             0             0             0 
##     start_lng       end_lat       end_lng member_casual 
##             0             0             0             0

Data Exploration

Find the valu count of casual riders and members

table(df["member_casual"])

## 
##  casual  member 
## 1722518 2178735

casual_riders_pct = 100 * table(df["member_casual"])["casual"]/dim(df)[1] 
members_pct = 100 * table(df["member_casual"])["member"]/dim(df)[1]
cat("Percentage of casual riders: ", casual_riders_pct, "\n")

## Percentage of casual riders:  44.15294

cat("Percentage of members: ", members_pct)

## Percentage of members:  55.84706

The data is not unbalanced towards either members or casual riders as the difference between the number of members and casual riders is only about 13%.

We will save this data to a csv file

write.csv(df,"/Users/calvin/Documents/Projects/GoogleFinalProject/cleaned_df.csv",row.names = FALSE)

I will also load a subset of this dataset for building a streamlit app to AWS S3. Get a random sample of 10000 rows from the dataset

df_sample = df[sample(nrow(df),10000),]

Check the dimension of the dataset

dim(df_sample)

## [1] 10000     9

Save the sample dataframe into a csv file

write.csv(df_sample, "/Users/calvin/Documents/Projects/GoogleFinalProject/cleaned_df_sample.csv",row.names = FALSE)

Bike_Ride Notebook

Prakhar Srivastava

Data Extraction

Data Cleaning

Data Exploration