library(tidyverse)   # helps wrangle data
library(readxl)      # reads the .xlsx source files
library(conflicted)  # manages conflicts between packages that export the same function names
# Set dplyr::filter and dplyr::lag as the default choices
conflict_prefer("filter", "dplyr")
conflict_prefer("lag", "dplyr")
#=====================
# STEP 1: COLLECT DATA
#=====================
# Upload Divvy datasets (Excel .xlsx files) here
q1_2019 <- read_excel("Divvy_Trips_2019_Q1.xlsx")
q1_2020 <- read_excel("Divvy_Trips_2020_Q1.xlsx")
#====================================================
# STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE
#====================================================
# Compare the column names of each of the files
# While the names don't have to be in the same order, they DO need to match
# perfectly before we can use a command to join them into one file
colnames(q1_2019)
colnames(q1_2020)
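# Optional sanity check (my addition, not part of the original workflow): base R's setdiff()
# lists any column names present in one file but not the other, which makes mismatches easier
# to spot than eyeballing two colnames() printouts.
setdiff(colnames(q1_2019), colnames(q1_2020))  # columns only in the 2019 file
setdiff(colnames(q1_2020), colnames(q1_2019))  # columns only in the 2020 file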
# Rename the 2019 columns to make them consistent with q1_2020 (the current naming scheme)
q1_2019 <- rename(q1_2019
                  ,ride_id = trip_id
                  ,rideable_type = bikeid
                  ,started_at = start_time
                  ,ended_at = end_time
                  ,start_station_name = from_station_name
                  ,start_station_id = from_station_id
                  ,end_station_name = to_station_name
                  ,end_station_id = to_station_id
                  ,member_casual = usertype)
# Inspect the dataframes and look for incongruencies
str(q1_2019)
str(q1_2020)
# Convert ride_id and rideable_type to character so that they can stack correctly
q1_2019 <- mutate(q1_2019, ride_id = as.character(ride_id)
                  ,rideable_type = as.character(rideable_type))
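# Optional sanity check (assumption: purrr is attached via the tidyverse): compare the column
# types of the two data frames before binding, so any remaining type mismatch surfaces here
# rather than as a bind_rows() error.
purrr::map_chr(q1_2019, ~ class(.x)[1])
purrr::map_chr(q1_2020, ~ class(.x)[1])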
# Stack the quarterly data frames into one big data frame
all_trips <- bind_rows(q1_2019, q1_2020)
# Remove the lat/long, birthyear, gender, and tripduration fields,
# which are not present in both quarters and won't be used in this analysis
all_trips <- all_trips %>%
  select(-c(start_lat, start_lng, end_lat, end_lng, birthyear, gender, "tripduration"))
#======================================================
# STEP 3: CLEAN UP AND ADD DATA TO PREPARE FOR ANALYSIS
#======================================================
# Inspect the new table that has been created
colnames(all_trips)  # List of column names
nrow(all_trips)      # How many rows are in the data frame?
dim(all_trips)       # Dimensions of the data frame
head(all_trips)      # See the first 6 rows of the data frame; also try tail(all_trips)
str(all_trips)       # See the list of columns and data types (numeric, character, etc.)
summary(all_trips)   # Statistical summary of the data, mainly for numeric columns
# In the "member_casual" column, the 2019 data uses the labels "Subscriber" and "Customer",
# while the 2020 data uses "member" and "casual". See how many observations fall under each label.
table(all_trips$member_casual)
# Reassign the 2019 labels to the current 2020 nomenclature
all_trips <- all_trips %>%
  mutate(member_casual = recode(member_casual
                                ,"Subscriber" = "member"
                                ,"Customer" = "casual"))
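# Equivalent alternative (a sketch, left commented out so the recode above is not repeated):
# dplyr::case_when() does the same relabeling and scales better if more legacy labels appear.
# all_trips <- all_trips %>%
#   mutate(member_casual = case_when(member_casual == "Subscriber" ~ "member",
#                                    member_casual == "Customer"   ~ "casual",
#                                    TRUE ~ member_casual))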
# Check to make sure the proper number of observations were reassigned
table(all_trips$member_casual)
# Add columns that list the date, month, day, and year of each ride
# so that ride data can be aggregated at those levels later
all_trips$date <- as.Date(all_trips$started_at)  # The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
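# Equivalent alternative (a sketch, left commented out so the script's behavior is unchanged):
# lubridate can derive the same fields directly. Note that month()/wday() with label = TRUE
# return factors rather than the character strings created above.
# all_trips$month       <- lubridate::month(all_trips$date, label = TRUE)
# all_trips$day         <- lubridate::mday(all_trips$date)
# all_trips$year        <- lubridate::year(all_trips$date)
# all_trips$day_of_week <- lubridate::wday(all_trips$date, label = TRUE, abbr = FALSE)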
# Add a "ride_length" calculation to all_trips (in seconds)
all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at)
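# Note (my addition): difftime() chooses its units automatically based on the data; passing
# units explicitly makes the "seconds" assumption visible. Left commented out to keep the
# original behavior.
# all_trips$ride_length <- difftime(all_trips$ended_at, all_trips$started_at, units = "secs")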
# Inspect the structure of the columns again
str(all_trips)
# Convert "ride_length" from difftime to numeric so we can run calculations on the data
is.factor(all_trips$ride_length)
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
# Remove "bad" data: rides where bikes were taken out of docks and checked for quality
# by Divvy (start_station_name == "HQ QR") and rides with a negative ride_length.
# A new version of the dataframe (v2) is created since data is being removed.
all_trips_v2 <- all_trips[!(all_trips$start_station_name == "HQ QR" | all_trips$ride_length < 0),]
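# Equivalent alternative (a sketch, left commented out so the step is not repeated): the same
# removal written with dplyr::filter(). Note that filter() drops rows where the condition is
# NA, whereas base "[" returns NA-filled rows for them.
# all_trips_v2 <- all_trips %>%
#   filter(!(start_station_name == "HQ QR" | ride_length < 0))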
#=====================================
# STEP 4: CONDUCT DESCRIPTIVE ANALYSIS
#=====================================
# Descriptive analysis on ride_length (all figures in seconds)
mean(all_trips_v2$ride_length)    # straight average (total ride length / rides)
median(all_trips_v2$ride_length)  # midpoint number in the ascending array of ride lengths
max(all_trips_v2$ride_length)     # longest ride
min(all_trips_v2$ride_length)     # shortest ride
# The four figures above can be condensed into one call
summary(all_trips_v2$ride_length)
# Compare members and casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
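# Equivalent alternative (a sketch, my addition): the same comparison with group_by()/summarise(),
# which returns all four statistics in a single table instead of four aggregate() calls.
all_trips_v2 %>%
  group_by(member_casual) %>%
  summarise(mean_ride   = mean(ride_length),
            median_ride = median(ride_length),
            max_ride    = max(ride_length),
            min_ride    = min(ride_length))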
# See the average ride time by each day for members vs casual users
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
# Notice that the days of the week are out of order; fix that by making day_of_week an ordered factor
all_trips_v2$day_of_week <- ordered(all_trips_v2$day_of_week,
                                    levels = c("Sunday", "Monday", "Tuesday", "Wednesday",
                                               "Thursday", "Friday", "Saturday"))
# Now run the average ride time by each day for members vs casual users again
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
# Analyze ridership data by type and weekday
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%  # creates weekday field using wday()
  group_by(member_casual, weekday) %>%                  # groups by usertype and weekday
  summarise(number_of_rides = n()                       # calculates the number of rides
            ,average_duration = mean(ride_length)) %>%  # and the average duration
  arrange(member_casual, weekday)                       # sorts
# Visualize the number of rides by rider type
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
  geom_col(position = "dodge")
# Create a visualization for average duration
all_trips_v2 %>%
  mutate(weekday = wday(started_at, label = TRUE)) %>%
  group_by(member_casual, weekday) %>%
  summarise(number_of_rides = n()
            ,average_duration = mean(ride_length)) %>%
  arrange(member_casual, weekday) %>%
  ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
  geom_col(position = "dodge")
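# Optional (my addition; the filename is just an example): ggsave() writes the most recently
# displayed ggplot to disk if you want to keep the charts as image files.
# ggsave("avg_duration_by_weekday.png", width = 8, height = 5)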
#=================================================
# STEP 5: EXPORT SUMMARY FILE FOR FURTHER ANALYSIS
#=================================================
# Create a csv file that we will visualize in Excel, Tableau, or my presentation software
# N.B.: The relative path below writes to the current working directory on Mac or PC.
# To export to a specific location on a PC, use a full path instead (most likely "C:\Users\YOUR_USERNAME\...").
# You can read more here: https://datatofish.com/export-dataframe-to-csv-in-r/
counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week,
                    FUN = mean)
write.csv(counts, file = 'avg_ride_length.csv')
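# Equivalent alternative (a sketch, left commented out to avoid writing a second file):
# readr's write_csv(), loaded with the tidyverse, writes the same summary without the
# extra row-name column that write.csv() adds by default.
# readr::write_csv(counts, "avg_ride_length.csv")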