##Setting up my environment
##Loading the packages
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.2
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(readr)
packageVersion("xfun")
## [1] '0.47'
options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages("xfun", dependencies = TRUE)
## Installing package into 'C:/Users/Charles/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'xfun' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'xfun'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Charles\AppData\Local\R\win-library\4.4\00LOCK\xfun\libs\x64\xfun.dll
## to C:\Users\Charles\AppData\Local\R\win-library\4.4\xfun\libs\x64\xfun.dll:
## Permission denied
## Warning: restored 'xfun'
##
## The downloaded binary packages are in
## C:\Users\Charles\AppData\Local\Temp\Rtmpc3Ft2W\downloaded_packages
##Importing the data ###Data tables source from two separate tables and the combine them together
setwd("C:/Users/Charles/OneDrive/Desktop/statistics")
##Reading the csv tables
###trip data 2020
library(readr)
TRIP_DATA_2020_ <- read_csv("TRIP_DATA_2020_.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 426887 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): ride_id, rideable_type, started_at, ended_at, start_station_name, ...
## dbl (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(TRIP_DATA_2020_)
###trip data 2019
library(readr)
TRIP_DATA_2019_ <- read_csv("TRIP DATA 2019_.csv")
## Rows: 365069 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): start_time, end_time, tripduration, from_station_name, to_station_n...
## dbl (6): trip_id, bikeid, from_station_id, to_station_id, birthyear, day_ of...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(TRIP_DATA_2019_)
###change the column format to be consistent
TRIP_DATA_2020_$ride_length <-as.character(TRIP_DATA_2020_$ride_length)
TRIP_DATA_2019_$ride_length <-as.character(TRIP_DATA_2019_$ride_length)
##combine the two tables rows ###Assign a vector name to the joined table
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Combined_data <- bind_rows(TRIP_DATA_2019_,TRIP_DATA_2020_)
##add combined columns ##Clean the data further ready for analysis
Cleaned_data <- Combined_data %>%
mutate(
combined_user_type = coalesce(member_casual,usertype),
combined_start_station_id = coalesce(start_station_id, from_station_id),
combined_day_of_week = coalesce(`day_ of _week` ,day_of_week)
)
##Select columns to aid us in analysis ###preparing data for analysis
Analysis_data <- Cleaned_data %>%
select(combined_user_type,trip_id,gender,birthyear,,combined_start_station_id,combined_day_of_week)
Analysis_data %>%
group_by(combined_day_of_week) %>%
summarize(num_trips = n())
## # A tibble: 7 × 2
## combined_day_of_week num_trips
## <dbl> <int>
## 1 1 78849
## 2 2 117177
## 3 3 135966
## 4 4 130325
## 5 5 133043
## 6 6 123710
## 7 7 72886
##Exploration and visualization ###Explore the data more for analysis
####Usertype gender this helps us understand the genders of the bike share company users. N/A represents the data table that did not capture the genders but its still useful for our business task
ggplot(Analysis_data, aes(x = gender, fill = combined_user_type)) +
geom_bar(position = "dodge") +
labs(title = "Gender vs. User Type")
this helps us determine the age group of the highest members,customers as well as the lowest.
ggplot(Analysis_data, aes(x = birthyear, fill = combined_user_type)) +
geom_histogram(position = "dodge") +
labs(title = "Birth Year vs. User Type")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 444910 rows containing non-finite outside the scale range
## (`stat_bin()`).
Analysis_data %>%
group_by(combined_start_station_id) %>%
summarize(num_trips = n())
## # A tibble: 612 × 2
## combined_start_station_id num_trips
## <dbl> <int>
## 1 2 746
## 2 3 2899
## 3 4 1242
## 4 5 1392
## 5 6 1352
## 6 7 1795
## 7 9 118
## 8 11 56
## 9 12 123
## 10 13 3074
## # ℹ 602 more rows
station192 = 14155 Station91 =13640 station77=13362 station195 =9080
###Add a filtered data frame
Usertype_trips <- Analysis_data %>%
group_by(combined_user_type) %>%
summarize(num_trips = n())
###Visualization
library(tidyr)
library(ggplot2)
####Usertype number of trips
ggplot(Usertype_trips, aes(x = combined_user_type, y = num_trips, fill = combined_user_type)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Number of Trips by User Type",
x = "Usertype",
y = "Number of Trips") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
###Visualization ####Usertype total count in each start_station
ggplot(Analysis_data, aes(x = combined_start_station_id, fill = combined_user_type)) +
geom_bar() +
coord_flip() +
labs(title = "User Type Count by Start Station")