##Setting up my environment

##Loading the packages

library(lubridate)
## Warning: package 'lubridate' was built under R version 4.4.2
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(readr)
packageVersion("xfun")
## [1] '0.47'
options(repos = c(CRAN = "https://cran.rstudio.com/"))
install.packages("xfun", dependencies = TRUE)
## Installing package into 'C:/Users/Charles/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'xfun' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'xfun'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Charles\AppData\Local\R\win-library\4.4\00LOCK\xfun\libs\x64\xfun.dll
## to C:\Users\Charles\AppData\Local\R\win-library\4.4\xfun\libs\x64\xfun.dll:
## Permission denied
## Warning: restored 'xfun'
## 
## The downloaded binary packages are in
##  C:\Users\Charles\AppData\Local\Temp\Rtmpc3Ft2W\downloaded_packages

##Importing the data ###Data tables source from two separate tables and the combine them together

setwd("C:/Users/Charles/OneDrive/Desktop/statistics")

##Reading the csv tables

###trip data 2020

library(readr)
TRIP_DATA_2020_ <- read_csv("TRIP_DATA_2020_.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 426887 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): ride_id, rideable_type, started_at, ended_at, start_station_name, ...
## dbl  (7): start_station_id, end_station_id, start_lat, start_lng, end_lat, e...
## time (1): ride_length
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(TRIP_DATA_2020_)

###trip data 2019

library(readr)
TRIP_DATA_2019_ <- read_csv("TRIP DATA 2019_.csv")
## Rows: 365069 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): start_time, end_time, tripduration, from_station_name, to_station_n...
## dbl (6): trip_id, bikeid, from_station_id, to_station_id, birthyear, day_ of...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(TRIP_DATA_2019_)

###change the column format to be consistent

TRIP_DATA_2020_$ride_length <-as.character(TRIP_DATA_2020_$ride_length)
TRIP_DATA_2019_$ride_length <-as.character(TRIP_DATA_2019_$ride_length)

##combine the two tables rows ###Assign a vector name to the joined table

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Combined_data <- bind_rows(TRIP_DATA_2019_,TRIP_DATA_2020_)

##add combined columns ##Clean the data further ready for analysis

Cleaned_data <- Combined_data %>%
  mutate(
    combined_user_type = coalesce(member_casual,usertype),
   combined_start_station_id = coalesce(start_station_id, from_station_id),
    combined_day_of_week = coalesce(`day_ of _week` ,day_of_week)
  )

##Select columns to aid us in analysis ###preparing data for analysis

Analysis_data <- Cleaned_data %>%
  select(combined_user_type,trip_id,gender,birthyear,,combined_start_station_id,combined_day_of_week)
 Analysis_data %>%
  group_by(combined_day_of_week) %>%
  summarize(num_trips = n())
## # A tibble: 7 × 2
##   combined_day_of_week num_trips
##                  <dbl>     <int>
## 1                    1     78849
## 2                    2    117177
## 3                    3    135966
## 4                    4    130325
## 5                    5    133043
## 6                    6    123710
## 7                    7     72886

##Exploration and visualization ###Explore the data more for analysis

####Usertype gender this helps us understand the genders of the bike share company users. N/A represents the data table that did not capture the genders but its still useful for our business task

ggplot(Analysis_data, aes(x = gender, fill = combined_user_type)) +
  geom_bar(position = "dodge") +
  labs(title = "Gender vs. User Type")

Birth Year vs. User Type

this helps us determine the age group of the highest members,customers as well as the lowest.

ggplot(Analysis_data, aes(x = birthyear, fill = combined_user_type)) +
  geom_histogram(position = "dodge") +
  labs(title = "Birth Year vs. User Type")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 444910 rows containing non-finite outside the scale range
## (`stat_bin()`).

Number of trips by Start Station/Usertype by start station

 Analysis_data %>%
  group_by(combined_start_station_id) %>%
  summarize(num_trips = n())
## # A tibble: 612 × 2
##    combined_start_station_id num_trips
##                        <dbl>     <int>
##  1                         2       746
##  2                         3      2899
##  3                         4      1242
##  4                         5      1392
##  5                         6      1352
##  6                         7      1795
##  7                         9       118
##  8                        11        56
##  9                        12       123
## 10                        13      3074
## # ℹ 602 more rows

Number of trips per start_station

station192 = 14155 Station91 =13640 station77=13362 station195 =9080

###Add a filtered data frame

Usertype_trips <- Analysis_data %>%
  group_by(combined_user_type) %>%
  summarize(num_trips = n())

###Visualization

library(tidyr)
library(ggplot2)

####Usertype number of trips

ggplot(Usertype_trips, aes(x = combined_user_type, y = num_trips, fill = combined_user_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Number of Trips by User Type",
       x = "Usertype",
       y = "Number of Trips") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

###Visualization ####Usertype total count in each start_station

ggplot(Analysis_data, aes(x = combined_start_station_id, fill = combined_user_type)) +
  geom_bar() +
  coord_flip() +
  labs(title = "User Type Count by Start Station")