library(tidyverse)
library(openintro)
library(flextable)
library(infer)
library(psych)
library(epiDisplay)
library(summarytools)
# Global knitr chunk options: reformat echoed code with a target line width of
# 50 characters and show the source of every chunk in the rendered document.
knitr::opts_chunk$set(
  tidy.opts = list(width.cutoff = 50),
  tidy = TRUE,
  echo = TRUE
)

# flextable default: ignore font settings when rendering tables.
# NOTE(review): presumably set so PDF output works with pdflatex -- confirm.
set_flextable_defaults(fonts_ignore = TRUE)

# Project identifier; not referenced anywhere in the visible part of this file.
projectid <- "data-607-project"
# Read the August 2023 Citi Bike trip records from the local CSV export.
Citibike <- read.csv(file = "202308-citibike-tripdata.csv")

# Print a per-column overview of the data frame (type, most frequent values,
# valid and missing counts).
dfSummary(Citibike)
## Data Frame Summary
## Citibike
## Dimensions: 4093169 x 13
## Duplicates: 0
##
## -----------------------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- -------------------- ------------------------------ ------------------------ ---------------------- ---------- ---------
## 1 ride_id 1. 0000021FE140CCC6 1 ( 0.0%) 4093169 0
## [character] 2. 0000026F89F2441F 1 ( 0.0%) (100.0%) (0.0%)
## 3. 000004121C91C853 1 ( 0.0%)
## 4. 0000048DBE53542F 1 ( 0.0%)
## 5. 00000647B60115E9 1 ( 0.0%)
## 6. 00000861C5C15BD9 1 ( 0.0%)
## 7. 00001955056D2FBC 1 ( 0.0%)
## 8. 00001D54D3D085B0 1 ( 0.0%)
## 9. 00001D73A3E06D23 1 ( 0.0%)
## 10. 00001FA8D064DEA7 1 ( 0.0%)
## [ 4093159 others ] 4093159 (100.0%) IIIIIIIIIIIIIIIIIII
##
## 2 rideable_type 1. classic_bike 3784828 (92.5%) IIIIIIIIIIIIIIIIII 4093169 0
## [character] 2. electric_bike 308341 ( 7.5%) I (100.0%) (0.0%)
##
## 3 started_at 1. 2023-08-23 18:19:01 17 ( 0.0%) 4093169 0
## [character] 2. 2023-08-23 17:19:50 16 ( 0.0%) (100.0%) (0.0%)
## 3. 2023-08-02 18:10:31 14 ( 0.0%)
## 4. 2023-08-15 17:23:28 14 ( 0.0%)
## 5. 2023-08-15 17:24:10 14 ( 0.0%)
## 6. 2023-08-15 18:05:49 14 ( 0.0%)
## 7. 2023-08-16 17:07:54 14 ( 0.0%)
## 8. 2023-08-19 12:16:07 14 ( 0.0%)
## 9. 2023-08-26 13:54:01 14 ( 0.0%)
## 10. 2023-08-30 17:14:00 14 ( 0.0%)
## [ 1751370 others ] 4093024 (100.0%) IIIIIIIIIIIIIIIIIII
##
## 4 ended_at 1. 2023-08-24 14:11:35 69 ( 0.0%) 4093169 0
## [character] 2. 2023-08-09 15:21:33 62 ( 0.0%) (100.0%) (0.0%)
## 3. 2023-08-03 09:28:36 42 ( 0.0%)
## 4. 2023-08-19 18:03:44 42 ( 0.0%)
## 5. 2023-08-26 11:30:12 37 ( 0.0%)
## 6. 2023-08-28 12:46:51 35 ( 0.0%)
## 7. 2023-08-03 22:46:37 34 ( 0.0%)
## 8. 2023-08-05 16:39:11 32 ( 0.0%)
## 9. 2023-08-15 16:06:26 32 ( 0.0%)
## 10. 2023-08-18 18:02:23 29 ( 0.0%)
## [ 1757210 others ] 4092755 (100.0%) IIIIIIIIIIIIIIIIIII
##
## 5 start_station_name 1. West St & Chambers St 14684 ( 0.4%) 4093169 0
## [character] 2. W 21 St & 6 Ave 14551 ( 0.4%) (100.0%) (0.0%)
## 3. Broadway & W 58 St 13745 ( 0.3%)
## 4. 11 Ave & W 41 St 13714 ( 0.3%)
## 5. E 17 St & Broadway 12197 ( 0.3%)
## 6. 7 Ave & Central Park Sout 11927 ( 0.3%)
## 7. 6 Ave & W 34 St 11824 ( 0.3%)
## 8. W 30 St & 10 Ave 11753 ( 0.3%)
## 9. W 31 St & 7 Ave 11474 ( 0.3%)
## 10. W 34 St & Hudson Blvd E 11466 ( 0.3%)
## [ 1945 others ] 3965834 (96.9%) IIIIIIIIIIIIIIIIIII
##
## 6 start_station_id 1. 5329.03 14684 ( 0.4%) 4093169 0
## [character] 2. 6140.05 14551 ( 0.4%) (100.0%) (0.0%)
## 3. 6948.10 13745 ( 0.3%)
## 4. 6726.01 13714 ( 0.3%)
## 5. 5980.07 12197 ( 0.3%)
## 6. 6912.01 11927 ( 0.3%)
## 7. 6364.10 11824 ( 0.3%)
## 8. 6459.07 11753 ( 0.3%)
## 9. 6331.01 11474 ( 0.3%)
## 10. 6535.04 11466 ( 0.3%)
## [ 1946 others ] 3965834 (96.9%) IIIIIIIIIIIIIIIIIII
##
## 7 end_station_name 1. West St & Chambers St 14714 ( 0.4%) 4093169 0
## [character] 2. W 21 St & 6 Ave 14576 ( 0.4%) (100.0%) (0.0%)
## 3. 11 Ave & W 41 St 13784 ( 0.3%)
## 4. Broadway & W 58 St 12992 ( 0.3%)
## 5. E 17 St & Broadway 12268 ( 0.3%)
## 6. 7 Ave & Central Park Sout 11750 ( 0.3%)
## 7. W 30 St & 10 Ave 11743 ( 0.3%)
## 8. 6 Ave & W 34 St 11524 ( 0.3%)
## 9. W 34 St & Hudson Blvd E 11468 ( 0.3%)
## 10. W 31 St & 7 Ave 11454 ( 0.3%)
## [ 1986 others ] 3966896 (96.9%) IIIIIIIIIIIIIIIIIII
##
## 8 end_station_id 1. 5329.03 14714 ( 0.4%) 4093169 0
## [character] 2. 6140.05 14576 ( 0.4%) (100.0%) (0.0%)
## 3. 6726.01 13784 ( 0.3%)
## 4. 6948.10 12992 ( 0.3%)
## 5. 5980.07 12268 ( 0.3%)
## 6. 6912.01 11750 ( 0.3%)
## 7. 6459.07 11743 ( 0.3%)
## 8. 6364.10 11524 ( 0.3%)
## 9. 6535.04 11468 ( 0.3%)
## 10. 6331.01 11454 ( 0.3%)
## [ 1987 others ] 3966896 (96.9%) IIIIIIIIIIIIIIIIIII
##
## 9 start_lat Mean (sd) : 40.7 (0) 885055 distinct values : 4093169 0
## [numeric] min < med < max: . : : (100.0%) (0.0%)
## 40.6 < 40.7 < 40.9 : : :
## IQR (CV) : 0 (0) : : : : .
## . : : : : : : .
##
## 10 start_lng Mean (sd) : -74 (0) 724338 distinct values : 4093169 0
## [numeric] min < med < max: . : (100.0%) (0.0%)
## -74 < -74 < -73.8 : : .
## IQR (CV) : 0 (0) : : : :
## . : : : : : : .
##
## 11 end_lat Mean (sd) : 40.7 (0.1) 2029 distinct values : 4090221 2948
## [numeric] min < med < max: : (99.9%) (0.1%)
## -37.3 < 40.7 < 41 :
## IQR (CV) : 0 (0) :
## :
##
## 12 end_lng Mean (sd) : -74 (0.1) 2025 distinct values : 4090221 2948
## [numeric] min < med < max: : (99.9%) (0.1%)
## -173.4 < -74 < 0 :
## IQR (CV) : 0 (0) :
## :
##
## 13 member_casual 1. casual 907197 (22.2%) IIII 4093169 0
## [character] 2. member 3185972 (77.8%) IIIIIIIIIIIIIII (100.0%) (0.0%)
## -----------------------------------------------------------------------------------------------------------------------------
Citi Bike has two types of bikes (classic and electric) and two types of riders (members and casual). This project compares and contrasts casual riders and annual members: do they affect each other?
The Citi Bike data set contains 4,093,169 rows, each of which represents an individual ride. For this analysis I will use approximately 25% of the data, consisting of 1,023,292 cases, and I will set the confidence level at 0.95 (95%).
Citi Bike provides its trip information as downloadable CSV files in a publicly available database. The August 2023 data set was created in September 2023 and was last updated on October 12, 2023. The Citi Bike database also contains more recent data, through October 2023, and can be viewed or accessed at the [Citi Bike website](https://citibikenyc.com/system-data).
This is an observational study, since I will be using a data set from the Citi Bike database.
For this assignment I will be using a random sample taken from https://s3.amazonaws.com/tripdata/index.html. These data come from the Citi Bike site: https://citibikenyc.com/system-data.
The dependent variable is the type of bike rented on a day-to-day basis; it is categorical (qualitative).
The independent variable is the type of rider (member or casual) renting the bike; it is categorical (qualitative).
# Bar chart of ride counts for each bike type.
ggplot(Citibike, aes(x = rideable_type)) +
  geom_bar()

# One-way frequency table of bike type, largest group first, without the
# cumulative-percent column. The `Citibike$rideable_type` expression is kept
# as written because tab1() prints it as the table header.
tab1(
  Citibike$rideable_type,
  sort.group = "decreasing",
  cum.percent = FALSE
)
## Citibike$rideable_type :
## Frequency Percent
## classic_bike 3784828 92.5
## electric_bike 308341 7.5
## Total 4093169 100.0
# Bar chart of ride counts for each rider type (member vs. casual).
ggplot(Citibike, aes(x = member_casual)) +
  geom_bar()

# One-way frequency table of rider type, largest group first, without the
# cumulative-percent column. The `Citibike$member_casual` expression is kept
# as written because tab1() prints it as the table header.
tab1(
  Citibike$member_casual,
  sort.group = "decreasing",
  cum.percent = FALSE
)
## Citibike$member_casual :
## Frequency Percent
## member 3185972 77.8
## casual 907197 22.2
## Total 4093169 100.0