1.Install the required packages
library(tidyverse)
library(DBI)
library(RMySQL)
3.Connect to the MYSQL database
mydb = dbConnect(MySQL(), user='root', password='password', dbname='BikeRidedb', host='Prakhars-MacBook-Air.local')
List the tables in the db
dbListTables(mydb)
## [1] "bike_tripdata"
This shows that the connectio has been established and we can see and access our table “bike-tripdata”.
List the columns of the db table
dbListFields(mydb, "bike_tripdata")
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
4.Retrieve the data from the MySQL databse
result = dbSendQuery(mydb, "SELECT * from bike_tripdata")
Load the result object into an R dataframe
df = fetch(result, n=-1)
5.Check the dimensions of df
dim(df)
## [1] 3901253 13
Check the column names
colnames(df)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual"
Get the details of the dataframe
summary(df)
## ride_id rideable_type started_at ended_at
## Length:3901253 Length:3901253 Length:3901253 Length:3901253
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:3901253 Length:3901253 Length:3901253 Length:3901253
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_lat start_lng end_lat end_lng
## Min. :41.64 Min. :-87.84 Min. :41.51 Min. :-88.07
## 1st Qu.:41.88 1st Qu.:-87.66 1st Qu.:41.88 1st Qu.:-87.66
## Median :41.90 Median :-87.64 Median :41.90 Median :-87.64
## Mean :41.90 Mean :-87.65 Mean :41.90 Mean :-87.65
## 3rd Qu.:41.93 3rd Qu.:-87.63 3rd Qu.:41.93 3rd Qu.:-87.63
## Max. :42.08 Max. :-87.52 Max. :42.15 Max. :-87.44
## member_casual
## Length:3901253
## Class :character
## Mode :character
##
##
##
Find the number of value counts of “members” and “casual” riders
table(df["member_casual"])
##
## casual member
## 1722518 2178735
casual_riders_pct = 100 * table(df["member_casual"])["casual"]/dim(df)[1]
members_pct = 100 * table(df["member_casual"])["member"]/dim(df)[1]
cat("Percentage of casual riders: ", casual_riders_pct, "\n")
## Percentage of casual riders: 44.15294
cat("Percentage of members: ", members_pct)
## Percentage of members: 55.84706
The data is not skewed towards either members or casual riders as the difference between the number of members and casual riders is only about 10%.
Drop columns not required for analysis : Drop following columns : ‘start_station_name’, ‘start_station_id’, ‘end_station_name’, ‘end_station_id’.
df = subset(df, select = -c(start_station_name, start_station_id, end_station_name, end_station_id) )
colnames(df)
## [1] "ride_id" "rideable_type" "started_at" "ended_at"
## [5] "start_lat" "start_lng" "end_lat" "end_lng"
## [9] "member_casual"
Find missing values if any
sapply(df, function(x) filter(df,x == "")%>%nrow())
## ride_id rideable_type started_at ended_at start_lat
## 0 0 0 0 0
## start_lng end_lat end_lng member_casual
## 0 0 0 0
Find the valu count of casual riders and members
table(df["member_casual"])
##
## casual member
## 1722518 2178735
casual_riders_pct = 100 * table(df["member_casual"])["casual"]/dim(df)[1]
members_pct = 100 * table(df["member_casual"])["member"]/dim(df)[1]
cat("Percentage of casual riders: ", casual_riders_pct, "\n")
## Percentage of casual riders: 44.15294
cat("Percentage of members: ", members_pct)
## Percentage of members: 55.84706
The data is not unbalanced towards either members or casual riders as the difference between the number of members and casual riders is only about 13%.
We will save this data to a csv file
write.csv(df,"/Users/calvin/Documents/Projects/GoogleFinalProject/cleaned_df.csv",row.names = FALSE)
I will also load a subset of this dataset for building a streamlit app to AWS S3. Get a random sample of 10000 rows from the dataset
df_sample = df[sample(nrow(df),10000),]
Check the dimension of the dataset
dim(df_sample)
## [1] 10000 9
Save the sample dataframe into a csv file
write.csv(df_sample, "/Users/calvin/Documents/Projects/GoogleFinalProject/cleaned_df_sample.csv",row.names = FALSE)