library(tidyverse)
library(lubridate)
library(ggplot2)
library(dplyr)
Eres un analista de datos júnior que trabaja en el equipo de analistas de marketing de Cyclistic, una empresa de bicicletas compartidas de Chicago. La directora de marketing cree que el éxito futuro de la empresa depende de maximizar la cantidad de membresías anuales. Por lo tanto, tu equipo quiere entender qué diferencias existen en el uso de las bicicletas Cyclistic entre los ciclistas ocasionales y los miembros anuales. A través de estos conocimientos, tu equipo diseñará una nueva estrategia de marketing para convertir a los ciclistas ocasionales en miembros anuales. Sin embargo, antes de eso, los ejecutivos de Cyclistic deben aprobar tus recomendaciones; por eso, debes respaldar tu propuesta con una visión convincente de los datos y visualizaciones profesionales de los mismos.
El primer paso es recolectar los datos disponibles de los ultimos 12 meses Continuamos comparando los nombres de cada columna de los archivos para combinarlos en uno solo.
install.packages(“tidyverse”) install.packages(“lubridate”) install.packages(“ggplot2”)
library(tidyverse) #helps wrangle data library(lubridate) #helps wrangle date attributes library(ggplot2) #helps visualize data library(dplyr) getwd() #displays your working directory setwd() #sets your working directory to simplify calls to data
q1_202210 <- read.csv2("202210_CSV_tripdata.csv")
q2_202211 <- read.csv2("202211_CSV_tripdata.csv")
q3_202212 <- read.csv2("202212_CSV_tripdata.csv")
q4_202301 <- read.csv2("202301_CSV_tripdata.csv")
q5_202302 <- read.csv2("202302_CSV_tripdata.csv")
q6_202303 <- read.csv2("202303_CSV_tripdata.csv")
q7_202304 <- read.csv2("202304_CSV_tripdata.csv")
q8_202305 <- read.csv2("202305_CSV_tripdata.csv")
q9_202306 <- read.csv2("202306_CSV_tripdata.csv")
q10_202307 <- read.csv2("202307_CSV_tripdata.csv")
q11_202308 <- read.csv2("202308_CSV_tripdata.csv")
q12_202309 <- read.csv2("202309_CSV_tripdata.csv")
Paso 2: Organizar los datos y combinarlos en un solo archivo * Comparar los nombres de las columnas de cada uno de los archivos * Mientras que los nombres no tienen que estar en el mismo orden, DEBEN coincidir perfectamente antes de que podamos usar un comando para unirlos en un solo archivo.
colnames(q1_202210)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q2_202211)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q3_202212)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q4_202301)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q5_202302)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q6_202303)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q7_202304)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q8_202305)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q9_202306)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q10_202307)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q11_202308)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
colnames(q12_202309)
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "start_lat"
## [10] "start_lng" "end_lat" "end_lng"
## [13] "member_casual" "ride_length" "day_of_week"
str(q1_202210)
## 'data.frame': 558685 obs. of 15 variables:
## $ ride_id : chr "2073591EA92F95EA" "F3BD1CC541D1D526" "8BD7CD28EB78C3F0" "DCBCCF540A0AE3CB" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" ...
## $ ended_at : chr "1/10/2022 0:18" "1/10/2022 0:32" "1/10/2022 0:06" "1/10/2022 0:01" ...
## $ start_station_name: chr "Sheffield Ave & Wrightwood Ave" "Clark St & Bryn Mawr Ave" "Racine Ave & Congress Pkwy" "" ...
## $ start_station_id : chr "TA1309000023" "KA1504000151" "TA1306000025" "" ...
## $ end_station_name : chr "Franklin St & Illinois St" "" "Wolcott Ave & Polk St" "" ...
## $ end_station_id : chr "RN-" "" "TA1309000064" "" ...
## $ start_lat : num 4.19e+15 4.20e+16 4.19e+16 4.20e+03 4.19e+03 ...
## $ start_lng : num -8.77e+15 -8.77e+07 -8.77e+07 -8.78e+03 -8.77e+03 ...
## $ end_lat : num 4.19e+15 4.20e+03 4.19e+07 4.20e+03 4.19e+07 ...
## $ end_lng : num -8.76e+15 -8.76e+03 -8.77e+07 -8.78e+03 -8.77e+06 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
## $ ride_length : chr "0:18:00" "0:32:02" "0:06:04" "0:00:28" ...
## $ day_of_week : int 7 7 7 7 7 7 7 7 7 7 ...
str(q2_202211)
## 'data.frame': 337735 obs. of 15 variables:
## $ ride_id : chr "05B1EBEBACFF6094" "45A0619030A22712" "AF7B05772C41BBB3" "5446FCEB95D32460" ...
## $ rideable_type : chr "classic_bike" "electric_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "1/11/2022 0:00" "1/11/2022 0:00" "1/11/2022 0:00" "1/11/2022 0:00" ...
## $ ended_at : chr "1/11/2022 0:13" "1/11/2022 0:03" "1/11/2022 0:15" "1/11/2022 0:13" ...
## $ start_station_name: chr "Lincoln Ave & Diversey Pkwy" "Clark St & Montrose Ave" "California Ave & Cortez St" "Halsted St & Wrightwood Ave" ...
## $ start_station_id : chr "TA1307000064" "KA1503000022" "17660" "TA1309000061" ...
## $ end_station_name : chr "Clifton Ave & Armitage Ave" "" "" "Clark St & North Ave" ...
## $ end_station_id : chr "TA1307000163" "" "" "13128" ...
## $ start_lat : num 4.19e+07 4.20e+08 4.19e+10 4.19e+07 4.20e+03 ...
## $ start_lng : num -8.77e+07 -8.77e+08 -8.77e+10 -8.76e+07 -8.77e+02 ...
## $ end_lat : num 41918216 4197 4192 41911974 4199 ...
## $ end_lng : num -87656936 -8766 -8771 -87631942 -8766 ...
## $ member_casual : chr "casual" "casual" "casual" "casual" ...
## $ ride_length : chr "0:13:00" "0:03:44" "0:14:59" "0:12:38" ...
## $ day_of_week : int 3 3 3 3 3 3 3 3 3 3 ...
str(q3_202212)
## 'data.frame': 181806 obs. of 15 variables:
## $ ride_id : chr "C2728784FCDB3735" "8FE02CD119032644" "918C1979B5B339D0" "94B73A76807D9E2F" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/12/2022 0:01" "1/12/2022 0:01" "1/12/2022 0:02" "1/12/2022 0:04" ...
## $ ended_at : chr "1/12/2022 0:03" "1/12/2022 0:14" "1/12/2022 0:08" "1/12/2022 0:14" ...
## $ start_station_name: chr "Greenview Ave & Fullerton Ave" "Wood St & Taylor St (Temp)" "" "" ...
## $ start_station_id : chr "TA1307000001" "13285" "" "" ...
## $ end_station_name : chr "Racine Ave & Fullerton Ave (Temp)" "Orleans St & Merchandise Mart Plaza" "Federal St & Polk St" "" ...
## $ end_station_id : chr "TA1306000026" "TA1305000022" "SL-008" "" ...
## $ start_lat : num 4.19e+15 4.19e+15 4.19e+03 4.19e+02 4.19e+12 ...
## $ start_lng : num -8.77e+15 -8.77e+15 -8.76e+03 -8.76e+03 -8.76e+11 ...
## $ end_lat : num 4.19e+16 4.19e+07 4.19e+12 4.19e+03 4.19e+07 ...
## $ end_lng : num -8.77e+13 -8.76e+06 -8.76e+11 -8.76e+03 -8.76e+06 ...
## $ member_casual : chr "member" "member" "member" "member" ...
## $ ride_length : chr "0:02:19" "0:12:22" "0:06:42" "0:09:09" ...
## $ day_of_week : int 5 5 5 5 5 5 5 5 5 5 ...
str(q4_202301)
## 'data.frame': 190301 obs. of 15 variables:
## $ ride_id : chr "53F0F248F28485D2" "D8EEE72183269F07" "E5AD797A579842F8" "8FBD2AD70B0F6A6F" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "1/01/2023 0:01" "1/01/2023 0:02" "1/01/2023 0:03" "1/01/2023 0:04" ...
## $ ended_at : chr "1/01/2023 0:02" "1/01/2023 0:29" "1/01/2023 0:07" "1/01/2023 0:13" ...
## $ start_station_name: chr "" "Fairbanks Ct & Grand Ave" "Sheridan Rd & Loyola Ave" "Leavitt St & Lawrence Ave" ...
## $ start_station_id : chr "" "TA1305000003" "RP-009" "TA1309000015" ...
## $ end_station_name : chr "" "New St & Illinois St" "Sheridan Rd & Loyola Ave" "Broadway & Argyle St" ...
## $ end_station_id : chr "" "TA1306000013" "RP-009" "13108" ...
## $ start_lat : num 4.18e+02 4.19e+15 4.20e+10 4.20e+07 4.20e+08 ...
## $ start_lng : num -8.76e+03 -8.76e+15 -8.77e+10 -8.77e+07 -8.77e+15 ...
## $ end_lat : num 4.18e+02 4.19e+16 4.20e+12 4.20e+07 4.20e+07 ...
## $ end_lng : num -8.76e+03 -8.76e+15 -8.77e+11 -8.77e+06 -8.77e+07 ...
## $ member_casual : chr "casual" "member" "casual" "casual" ...
## $ ride_length : chr "0:00:43" "0:27:40" "0:03:57" "0:09:49" ...
## $ day_of_week : int 1 1 1 1 1 1 1 1 1 1 ...
str(q5_202302)
## 'data.frame': 190445 obs. of 15 variables:
## $ ride_id : chr "4617F5134B0A2BA2" "F7CC28370FE0D243" "3D5C67AFD3CF6E87" "0D3CFCF951AD63B2" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "classic_bike" "electric_bike" ...
## $ started_at : chr "1/02/2023 0:01" "1/02/2023 0:03" "1/02/2023 0:05" "1/02/2023 0:11" ...
## $ ended_at : chr "1/02/2023 0:25" "1/02/2023 0:08" "1/02/2023 0:12" "1/02/2023 0:17" ...
## $ start_station_name: chr "" "University Ave & 57th St" "Broadway & Ridge Ave" "" ...
## $ start_station_id : chr "" "KA1503000071" "15578" "" ...
## $ end_station_name : chr "" "Ellis Ave & 60th St" "Broadway & Berwyn Ave" "Marine Dr & Ainslie St" ...
## $ end_station_id : chr "" "KA1503000014" "13109" "KA1504000171" ...
## $ start_lat : num 4.19e+03 4.18e+07 4.20e+11 4.20e+03 4.19e+02 ...
## $ start_lng : num -8.77e+03 -8.76e+07 -8.77e+11 -8.77e+03 -8.77e+02 ...
## $ end_lat : num 4.19e+03 4.18e+12 4.20e+07 4.20e+05 4.19e+03 ...
## $ end_lng : num -8.77e+03 -8.76e+11 -8.77e+07 -8.77e+07 -8.77e+03 ...
## $ member_casual : chr "member" "casual" "member" "member" ...
## $ ride_length : chr "0:23:51" "0:05:19" "0:06:43" "0:06:33" ...
## $ day_of_week : int 4 4 4 4 4 4 4 4 4 4 ...
str(q6_202303)
## 'data.frame': 258678 obs. of 15 variables:
## $ ride_id : chr "3F858645ACA7EB53" "6F13A9FC965782B1" "C15F561F543E67EA" "44563F7CDABE55B6" ...
## $ rideable_type : chr "classic_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/03/2023 0:00" "1/03/2023 0:01" "1/03/2023 0:03" "1/03/2023 0:04" ...
## $ ended_at : chr "1/03/2023 0:22" "1/03/2023 0:04" "1/03/2023 0:07" "1/03/2023 0:07" ...
## $ start_station_name: chr "Racine Ave & Congress Pkwy" "" "Clark St & Lunt Ave" "Public Rack - Prairie Ave & 47th St " ...
## $ start_station_id : chr "TA1306000025" "" "KA1504000162" "814" ...
## $ end_station_name : chr "Wabash Ave & Grand Ave" "Kedzie Ave & George St" "" "Cottage Grove Ave & 47th St" ...
## $ end_station_id : chr "TA1307000117" "436" "" "TA1309000053" ...
## $ start_lat : num 4.19e+06 4.19e+03 4.20e+10 4.18e+03 4.19e+03 ...
## $ start_lng : num -8.77e+06 -8.77e+03 -8.77e+10 -8.76e+03 -8.77e+03 ...
## $ end_lat : num 41891466 4193 420 41809855 4193 ...
## $ end_lng : num -87626761 -8771 -8767 -87606755 -877 ...
## $ member_casual : chr "member" "member" "member" "casual" ...
## $ ride_length : chr "0:21:53" "0:02:29" "0:03:52" "0:03:17" ...
## $ day_of_week : int 4 4 4 4 4 4 4 4 4 4 ...
str(q7_202304)
## 'data.frame': 426590 obs. of 15 variables:
## $ ride_id : chr "563BB19A89F51F15" "AD304476EF192169" "F4490F618609D351" "08848F48F7ACF6C3" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/04/2023 0:00" "1/04/2023 0:00" "1/04/2023 0:00" "1/04/2023 0:00" ...
## $ ended_at : chr "1/04/2023 0:07" "1/04/2023 0:03" "1/04/2023 0:04" "1/04/2023 0:06" ...
## $ start_station_name: chr "Wentworth Ave & 35th St" "Sheffield Ave & Wrightwood Ave" "Stave St & Armitage Ave" "" ...
## $ start_station_id : int NA NA 13266 NA 13033 NA NA 604 NA NA ...
## $ end_station_name : chr "Halsted St & 35th St" "Sheffield Ave & Webster Ave" "" "" ...
## $ end_station_id : chr "TA1308000043" "TA1309000033" "" "" ...
## $ start_lat : num 4.18e+07 4.19e+07 4.19e+16 4.20e+03 4.19e+07 ...
## $ start_lng : num -8.76e+07 -8.77e+07 -8.77e+15 -8.77e+03 -8.76e+07 ...
## $ end_lat : num 41830661 4192154 4191 4196 4191552 ...
## $ end_lng : num -87647172 -87653818 -8769 -8773 -87687022 ...
## $ member_casual : chr "casual" "member" "casual" "casual" ...
## $ ride_length : chr "0:07:02" "0:03:03" "0:03:53" "0:05:27" ...
## $ day_of_week : int 7 7 7 7 7 7 7 7 7 7 ...
str(q8_202305)
## 'data.frame': 604827 obs. of 15 variables:
## $ ride_id : chr "F0664569182AFB20" "E890C841FAF9BA38" "5802807EB4E1B2E7" "CD23D2262D4B2A34" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/05/2023 0:00" "1/05/2023 0:00" "1/05/2023 0:01" "1/05/2023 0:01" ...
## $ ended_at : chr "1/05/2023 0:06" "1/05/2023 0:04" "1/05/2023 0:14" "1/05/2023 0:04" ...
## $ start_station_name: chr "Burling St & Diversey Pkwy" "Wabash Ave & Wacker Pl" "Larrabee St & Kingsbury St" "" ...
## $ start_station_id : chr "TA1309000036" "TA1307000131" "TA1306000009" "" ...
## $ end_station_name : chr "Southport Ave & Belmont Ave" "Wabash Ave & Grand Ave" "Southport Ave & Wellington Ave" "Chicago Ave & Sheridan Rd" ...
## $ end_station_id : chr "13229" "TA1307000117" "TA1307000006" "E008" ...
## $ start_lat : num 4.19e+10 4.19e+07 4.19e+10 4.20e+03 4.20e+03 ...
## $ start_lng : num -8.76e+10 -8.76e+06 -8.76e+10 -8.77e+03 -8.77e+03 ...
## $ end_lat : num 4.19e+15 4.19e+07 4.19e+15 4.21e+07 4.20e+07 ...
## $ end_lng : num -8.77e+14 -8.76e+07 -8.77e+15 -8.77e+07 -8.77e+07 ...
## $ member_casual : chr "member" "member" "member" "member" ...
## $ ride_length : chr "0:05:29" "0:03:33" "0:12:38" "0:02:36" ...
## $ day_of_week : int 2 2 2 2 2 2 2 2 2 2 ...
str(q9_202306)
## 'data.frame': 719618 obs. of 15 variables:
## $ ride_id : chr "1797E27992201ECA" "ECB6F680EC5ADDB7" "76862A2B6074D143" "88AF94DE4C0970F9" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/06/2023 0:00" "1/06/2023 0:00" "1/06/2023 0:00" "1/06/2023 0:00" ...
## $ ended_at : chr "1/06/2023 0:14" "1/06/2023 0:04" "1/06/2023 0:08" "1/06/2023 0:11" ...
## $ start_station_name: chr "Montrose Harbor" "Chicago Ave & Sheridan Rd" "Central Park Ave & Bloomingdale Ave" "" ...
## $ start_station_id : chr "TA1308000012" "E008" "18017" "" ...
## $ end_station_name : chr "Lakefront Trail & Bryn Mawr Ave" "Sheridan Rd & Noyes St (NU)" "California Ave & Francis Pl" "Public Rack - Woodlawn Ave & 63rd St N" ...
## $ end_station_id : int 15576 604 13259 951 13259 13135 13259 NA 13323 13235 ...
## $ start_lat : num 4.20e+07 4.21e+07 4.19e+10 4.18e+03 4.19e+10 ...
## $ start_lng : num -8.76e+07 -8.77e+07 -8.77e+10 -8.76e+03 -8.77e+10 ...
## $ end_lat : num 4.20e+15 4.21e+07 4.19e+15 4.18e+10 4.19e+15 ...
## $ end_lng : num -8.77e+15 -8.77e+07 -8.77e+15 -8.76e+10 -8.77e+15 ...
## $ member_casual : chr "member" "member" "casual" "member" ...
## $ ride_length : chr "0:13:32" "0:03:28" "0:07:28" "0:10:19" ...
## $ day_of_week : int 5 5 5 5 5 5 5 5 5 5 ...
str(q10_202307)
## 'data.frame': 767650 obs. of 15 variables:
## $ ride_id : chr "D5B97E091F7E773D" "CC3C6CE0620492A6" "050060710066CF85" "3FF6C8072BDBACEB" ...
## $ rideable_type : chr "classic_bike" "classic_bike" "classic_bike" "electric_bike" ...
## $ started_at : chr "1/07/2023 0:00" "1/07/2023 0:00" "1/07/2023 0:00" "1/07/2023 0:00" ...
## $ ended_at : chr "1/07/2023 0:10" "1/07/2023 0:25" "1/07/2023 0:04" "1/07/2023 0:01" ...
## $ start_station_name: chr "Sangamon St & Lake St" "Canal St & Monroe St" "Sheffield Ave & Webster Ave" "" ...
## $ start_station_id : chr "TA1306000015" "13056" "TA1309000033" "" ...
## $ end_station_name : chr "Wood St & Chicago Ave" "Wells St & Elm St" "Lincoln Ave & Fullerton Ave" "State St & Harrison St" ...
## $ end_station_id : chr "637" "KA1504000135" "TA1309000058" "SL-007" ...
## $ start_lat : num 4.19e+15 4.19e+06 4.19e+06 4.19e+03 4.19e+06 ...
## $ start_lng : num -8.77e+15 -8.76e+06 -8.77e+07 -8.76e+03 -8.76e+06 ...
## $ end_lat : num 4.19e+07 4.19e+07 4.19e+16 4.19e+07 4.19e+07 ...
## $ end_lng : num -8.77e+07 -8.76e+07 -8.76e+15 -8.76e+07 -8.76e+07 ...
## $ member_casual : chr "casual" "casual" "member" "member" ...
## $ ride_length : chr "0:10:09" "0:25:22" "0:04:28" "0:01:17" ...
## $ day_of_week : int 7 7 7 7 7 7 7 7 7 7 ...
str(q11_202308)
## 'data.frame': 771693 obs. of 15 variables:
## $ ride_id : chr "E1212DA176FD5BAF" "4F9E4987AC3DF9D1" "27441012AD09BF4C" "608E6D41D97C3D3A" ...
## $ rideable_type : chr "electric_bike" "classic_bike" "electric_bike" "classic_bike" ...
## $ started_at : chr "1/08/2023 0:00" "1/08/2023 0:00" "1/08/2023 0:00" "1/08/2023 0:00" ...
## $ ended_at : chr "1/08/2023 0:02" "1/08/2023 0:01" "1/08/2023 0:05" "1/08/2023 0:11" ...
## $ start_station_name: chr "" "Marshfield Ave & 59th St" "St. Clair St & Erie St" "Clark St & Armitage Ave" ...
## $ start_station_id : chr "" "560" "13016" "13146" ...
## $ end_station_name : chr "" "Marshfield Ave & 59th St" "Franklin St & Illinois St" "Broadway & Barry Ave" ...
## $ end_station_id : chr "" "560" "RN-" "13137" ...
## $ start_lat : num 4.19e+03 4.18e+07 4.19e+10 4.19e+07 4.19e+15 ...
## $ start_lng : num -8.77e+03 -8.77e+07 -8.76e+10 -8.76e+07 -8.77e+08 ...
## $ end_lat : num 4.19e+03 4.18e+07 4.19e+15 4.19e+15 4.19e+03 ...
## $ end_lng : num -8.77e+03 -8.77e+07 -8.76e+15 -8.76e+14 -8.77e+03 ...
## $ member_casual : chr "casual" "member" "casual" "member" ...
## $ ride_length : chr "0:02:32" "0:00:56" "0:05:06" "0:10:32" ...
## $ day_of_week : int 3 3 3 3 3 3 3 3 3 3 ...
str(q12_202309)
## 'data.frame': 666371 obs. of 15 variables:
## $ ride_id : chr "D29A30D2F2BB9A5A" "EC24AF189FBAB69E" "046D369E97710037" "AE0BB544FB0D578D" ...
## $ rideable_type : chr "classic_bike" "electric_bike" "classic_bike" "classic_bike" ...
## $ started_at : chr "1/09/2023 0:00" "1/09/2023 0:01" "1/09/2023 0:01" "1/09/2023 0:01" ...
## $ ended_at : chr "1/09/2023 0:07" "1/09/2023 0:12" "1/09/2023 0:17" "1/09/2023 0:11" ...
## $ start_station_name: chr "Ravenswood Ave & Berteau Ave" "" "Wells St & Evergreen Ave" "Elston Ave & Cortland St" ...
## $ start_station_id : chr "TA1309000018" "" "TA1308000049" "TA1305000039" ...
## $ end_station_name : chr "Southport Ave & Clark St" "" "McClurg Ct & Ohio St" "Sheffield Ave & Wrightwood Ave" ...
## $ end_station_id : chr "TA1308000047" "" "TA1306000029" "TA1309000023" ...
## $ start_lat : num 4.20e+07 4.20e+03 4.19e+07 4.19e+15 4.19e+10 ...
## $ start_lng : num -8.77e+07 -8.77e+03 -8.76e+06 -8.77e+15 -8.76e+11 ...
## $ end_lat : num 4.20e+07 4.20e+03 4.19e+16 4.19e+07 4.19e+07 ...
## $ end_lng : num -8.77e+07 -8.77e+03 -8.76e+15 -8.77e+07 -8.76e+07 ...
## $ member_casual : chr "casual" "member" "casual" "casual" ...
## $ ride_length : chr "0:07:10" "0:11:17" "0:15:58" "0:09:37" ...
## $ day_of_week : int 6 6 6 6 6 6 6 6 6 6 ...
q7_202304 <- mutate(q7_202304, start_station_id = as.character(start_station_id))
q9_202306 <- mutate(q9_202306, end_station_id = as.character(end_station_id))
all_trips <- bind_rows(q1_202210, q2_202211, q3_202212, q4_202301, q5_202302,
q6_202303, q7_202304, q8_202305, q9_202306, q10_202307,
q11_202308, q12_202309)
glimpse(all_trips)
## Rows: 5,674,399
## Columns: 15
## $ ride_id <chr> "2073591EA92F95EA", "F3BD1CC541D1D526", "8BD7CD28EB…
## $ rideable_type <chr> "electric_bike", "electric_bike", "electric_bike", …
## $ started_at <chr> "1/10/2022 0:00", "1/10/2022 0:00", "1/10/2022 0:00…
## $ ended_at <chr> "1/10/2022 0:18", "1/10/2022 0:32", "1/10/2022 0:06…
## $ start_station_name <chr> "Sheffield Ave & Wrightwood Ave", "Clark St & Bryn …
## $ start_station_id <chr> "TA1309000023", "KA1504000151", "TA1306000025", "",…
## $ end_station_name <chr> "Franklin St & Illinois St", "", "Wolcott Ave & Pol…
## $ end_station_id <chr> "RN-", "", "TA1309000064", "", "13071", "TA13050000…
## $ start_lat <dbl> 4.192870e+15, 4.198360e+16, 4.187460e+16, 4.197000e…
## $ start_lng <dbl> -8.765380e+15, -8.766915e+07, -8.765698e+07, -8.776…
## $ end_lat <dbl> 4.189100e+15, 4.196000e+03, 4.187126e+07, 4.197000e…
## $ end_lng <dbl> -8.763550e+15, -8.765000e+03, -8.767369e+07, -8.776…
## $ member_casual <chr> "member", "casual", "member", "member", "member", "…
## $ ride_length <chr> "0:18:00", "0:32:02", "0:06:04", "0:00:28", "0:09:1…
## $ day_of_week <int> 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, …
all_trips <- all_trips %>%
select(-c(start_lat, start_lng, end_lat, end_lng))
Paso 3: Limpiar y agregar datos para prepararlos para el análisis * Inspeccionar la tabla nueva que ha sido creada
colnames(all_trips) #List of column names
## [1] "ride_id" "rideable_type" "started_at"
## [4] "ended_at" "start_station_name" "start_station_id"
## [7] "end_station_name" "end_station_id" "member_casual"
## [10] "ride_length" "day_of_week"
nrow(all_trips) #How many rows are in data frame?
## [1] 5674399
dim(all_trips) #Dimensions of the data frame?
## [1] 5674399 11
head(all_trips) #See the first 6 rows of data frame. Also tail(all_trips)
## ride_id rideable_type started_at ended_at
## 1 2073591EA92F95EA electric_bike 1/10/2022 0:00 1/10/2022 0:18
## 2 F3BD1CC541D1D526 electric_bike 1/10/2022 0:00 1/10/2022 0:32
## 3 8BD7CD28EB78C3F0 electric_bike 1/10/2022 0:00 1/10/2022 0:06
## 4 DCBCCF540A0AE3CB electric_bike 1/10/2022 0:00 1/10/2022 0:01
## 5 D6A773C844DE6EEE electric_bike 1/10/2022 0:00 1/10/2022 0:10
## 6 A80C0CFF35C90615 classic_bike 1/10/2022 0:01 1/10/2022 0:05
## start_station_name start_station_id
## 1 Sheffield Ave & Wrightwood Ave TA1309000023
## 2 Clark St & Bryn Mawr Ave KA1504000151
## 3 Racine Ave & Congress Pkwy TA1306000025
## 4
## 5
## 6 Dearborn St & Erie St 13045
## end_station_name end_station_id member_casual ride_length
## 1 Franklin St & Illinois St RN- member 0:18:00
## 2 casual 0:32:02
## 3 Wolcott Ave & Polk St TA1309000064 member 0:06:04
## 4 member 0:00:28
## 5 Southport Ave & Roscoe St 13071 member 0:09:14
## 6 Orleans St & Merchandise Mart Plaza TA1305000022 member 0:04:33
## day_of_week
## 1 7
## 2 7
## 3 7
## 4 7
## 5 7
## 6 7
str(all_trips) #See list of columns and data types (numeric, character, etc)
## 'data.frame': 5674399 obs. of 11 variables:
## $ ride_id : chr "2073591EA92F95EA" "F3BD1CC541D1D526" "8BD7CD28EB78C3F0" "DCBCCF540A0AE3CB" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" ...
## $ ended_at : chr "1/10/2022 0:18" "1/10/2022 0:32" "1/10/2022 0:06" "1/10/2022 0:01" ...
## $ start_station_name: chr "Sheffield Ave & Wrightwood Ave" "Clark St & Bryn Mawr Ave" "Racine Ave & Congress Pkwy" "" ...
## $ start_station_id : chr "TA1309000023" "KA1504000151" "TA1306000025" "" ...
## $ end_station_name : chr "Franklin St & Illinois St" "" "Wolcott Ave & Polk St" "" ...
## $ end_station_id : chr "RN-" "" "TA1309000064" "" ...
## $ member_casual : chr "member" "casual" "member" "member" ...
## $ ride_length : chr "0:18:00" "0:32:02" "0:06:04" "0:00:28" ...
## $ day_of_week : int 7 7 7 7 7 7 7 7 7 7 ...
summary(all_trips) #Statistical summary of data. Mainly for numerics
## ride_id rideable_type started_at ended_at
## Length:5674399 Length:5674399 Length:5674399 Length:5674399
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## start_station_name start_station_id end_station_name end_station_id
## Length:5674399 Length:5674399 Length:5674399 Length:5674399
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## member_casual ride_length day_of_week
## Length:5674399 Length:5674399 Min. :1.000
## Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Median :4.000
## Mean :4.138
## 3rd Qu.:6.000
## Max. :7.000
table(all_trips$member_casual)
##
## casual member
## 2086699 3587700
all_trips$date <- as.Date(all_trips$started_at) #The default format is yyyy-mm-dd
all_trips$month <- format(as.Date(all_trips$date), "%m")
all_trips$day <- format(as.Date(all_trips$date), "%d")
all_trips$year <- format(as.Date(all_trips$date), "%Y")
all_trips$day_of_week <- format(as.Date(all_trips$date), "%A")
all_trips$start_hour <- format(as.POSIXct(all_trips$started_at), format = "%H")
*Agregar el calculo “ride_length” a all_trips (en segundos)
all_trips$ride_length <- difftime(all_trips$ended_at,all_trips$started_at)
glimpse(all_trips)
## Rows: 5,674,399
## Columns: 16
## $ ride_id <chr> "2073591EA92F95EA", "F3BD1CC541D1D526", "8BD7CD28EB…
## $ rideable_type <chr> "electric_bike", "electric_bike", "electric_bike", …
## $ started_at <chr> "1/10/2022 0:00", "1/10/2022 0:00", "1/10/2022 0:00…
## $ ended_at <chr> "1/10/2022 0:18", "1/10/2022 0:32", "1/10/2022 0:06…
## $ start_station_name <chr> "Sheffield Ave & Wrightwood Ave", "Clark St & Bryn …
## $ start_station_id <chr> "TA1309000023", "KA1504000151", "TA1306000025", "",…
## $ end_station_name <chr> "Franklin St & Illinois St", "", "Wolcott Ave & Pol…
## $ end_station_id <chr> "RN-", "", "TA1309000064", "", "13071", "TA13050000…
## $ member_casual <chr> "member", "casual", "member", "member", "member", "…
## $ ride_length <drtn> 0 secs, 0 secs, 0 secs, 0 secs, 0 secs, 0 secs, 0 …
## $ day_of_week <chr> "sábado", "sábado", "sábado", "sábado", "sábado", "…
## $ date <date> 0001-10-20, 0001-10-20, 0001-10-20, 0001-10-20, 00…
## $ month <chr> "10", "10", "10", "10", "10", "10", "10", "10", "10…
## $ day <chr> "20", "20", "20", "20", "20", "20", "20", "20", "20…
## $ year <chr> "0001", "0001", "0001", "0001", "0001", "0001", "00…
## $ start_hour <chr> "00", "00", "00", "00", "00", "00", "00", "00", "00…
str(all_trips)
## 'data.frame': 5674399 obs. of 16 variables:
## $ ride_id : chr "2073591EA92F95EA" "F3BD1CC541D1D526" "8BD7CD28EB78C3F0" "DCBCCF540A0AE3CB" ...
## $ rideable_type : chr "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
## $ started_at : chr "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" "1/10/2022 0:00" ...
## $ ended_at : chr "1/10/2022 0:18" "1/10/2022 0:32" "1/10/2022 0:06" "1/10/2022 0:01" ...
## $ start_station_name: chr "Sheffield Ave & Wrightwood Ave" "Clark St & Bryn Mawr Ave" "Racine Ave & Congress Pkwy" "" ...
## $ start_station_id : chr "TA1309000023" "KA1504000151" "TA1306000025" "" ...
## $ end_station_name : chr "Franklin St & Illinois St" "" "Wolcott Ave & Polk St" "" ...
## $ end_station_id : chr "RN-" "" "TA1309000064" "" ...
## $ member_casual : chr "member" "casual" "member" "member" ...
## $ ride_length : 'difftime' num 0 0 0 0 ...
## ..- attr(*, "units")= chr "secs"
## $ day_of_week : chr "sábado" "sábado" "sábado" "sábado" ...
## $ date : Date, format: "0001-10-20" "0001-10-20" ...
## $ month : chr "10" "10" "10" "10" ...
## $ day : chr "20" "20" "20" "20" ...
## $ year : chr "0001" "0001" "0001" "0001" ...
## $ start_hour : chr "00" "00" "00" "00" ...
is.factor(all_trips$ride_length)
## [1] FALSE
all_trips$ride_length <- as.numeric(as.character(all_trips$ride_length))
is.numeric(all_trips$ride_length)
## [1] TRUE
all_trips_v2 <- all_trips[!(all_trips$ride_length<0),]
PASO 4: REALIZAR ANÁLISIS DESCRIPTIVO * Análisis descriptivo sobre la duración de los viajes (todos los valores en segundos)
mean(all_trips_v2$ride_length) #straight average (total ride length / rides)
## [1] 186842
median(all_trips_v2$ride_length) #midpoint number in the ascending array of ride lengths
## [1] 0
max(all_trips_v2$ride_length) #longest ride
## [1] 946684800
min(all_trips_v2$ride_length) #shortest ride
## [1] 0
summary(all_trips_v2$ride_length)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 186842 0 946684800
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 353470.8
## 2 member 89954.7
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = median)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 0
## 2 member 0
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = max)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 946684800
## 2 member 63158400
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual, FUN = min)
## all_trips_v2$member_casual all_trips_v2$ride_length
## 1 casual 0
## 2 member 0
aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
## all_trips_v2$member_casual all_trips_v2$day_of_week all_trips_v2$ride_length
## 1 casual domingo 352221.23
## 2 member domingo 89883.51
## 3 casual jueves 321550.45
## 4 member jueves 83332.52
## 5 casual lunes 372593.31
## 6 member lunes 90873.86
## 7 casual martes 385331.95
## 8 member martes 96258.94
## 9 casual miércoles 320521.77
## 10 member miércoles 82249.74
## 11 casual sábado 362253.85
## 12 member sábado 92997.52
## 13 casual viernes 358491.38
## 14 member viernes 93980.73
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>% #creates weekday field using wday()
group_by(member_casual, weekday) %>% #groups by usertype and weekday
summarise(number_of_rides = n() #calculates the number of rides and average duration
,average_duration = mean(ride_length)) %>% # calculates the average duration
arrange(member_casual, weekday) # sorts
## # A tibble: 14 × 4
## # Groups: member_casual [2]
## member_casual weekday number_of_rides average_duration
## <chr> <ord> <int> <dbl>
## 1 casual "dom\\." 300791 352221.
## 2 casual "lun\\." 301774 372593.
## 3 casual "mar\\." 296242 385332.
## 4 casual "mié\\." 276782 320522.
## 5 casual "jue\\." 308064 321550.
## 6 casual "vie\\." 310819 358491.
## 7 casual "sáb\\." 291399 362254.
## 8 member "dom\\." 501310 89884.
## 9 member "lun\\." 515365 90874.
## 10 member "mar\\." 517974 96259.
## 11 member "mié\\." 503774 82250.
## 12 member "jue\\." 521811 83333.
## 13 member "vie\\." 517085 93981.
## 14 member "sáb\\." 510004 92998.
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") + labs(title="Number of Rides by Rider Type",
y="Number of Rides", x="Weekday", fill="Member Status")
Ambos usuarios tienen una tendencia de uso que se mantiene todos los días de la semana.
Los usuarios miembros tienen un uso mayor de las bicicletas.
Visualizar el número de viajes por mes
all_trips_v2 %>%
mutate(month = format(as.Date(all_trips_v2$started_at), "%m")) %>%
group_by(member_casual, month) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, month) %>%
ggplot(aes(x = month, y = number_of_rides, fill = member_casual)) +
geom_col(position = "dodge") + labs(title="Number of Rides per Month",
y="Number of Rides", x="Month", fill="Member Status")
La frecuencia de los viajes y el uso de las bicicletas aumenta entre los meses de Julio y Agosto para ambos usuarios.
Los usuarios miembros realizan mayor cantidad de viajes respecto a los usuarios ocasionales.
Crear una visualización para la duración promedio
all_trips_v2 %>%
mutate(weekday = wday(started_at, label = TRUE)) %>%
group_by(member_casual, weekday) %>%
summarise(number_of_rides = n()
,average_duration = mean(ride_length)) %>%
arrange(member_casual, weekday) %>%
ggplot(aes(x = weekday, y = average_duration, fill = member_casual)) +
geom_col(position = "dodge") + labs(title="Average Ride Duration Casual vs Member",
y="Average Duration in Seconds", x="Weekday", fill="Member Status")
Los usuarios ocasionales realizan trayectos más largos que los miembros.
Crear una visualización para el tipo de bicicletas
all_trips_v2 %>%
group_by(member_casual, rideable_type) %>%
arrange(member_casual, rideable_type) %>%
ggplot(aes(x = rideable_type, fill = member_casual)) +
geom_bar(position = "dodge") + labs(title="Type of Bikes Casual vs Member",
y="Count", x="Type of Bikes", fill="Member Status")
Se evidencia la preferencia por el uso de las bicicletas electricas.
Los usuarios miembros usan en mayor cantidad bicicletas electricas.
Paso 5: Exportar archivo de resumen para un análisis adicional * Crear un archivo CSV que visualizaremos en Excel, Tableau o mi software de presentación
counts <- aggregate(all_trips_v2$ride_length ~ all_trips_v2$member_casual + all_trips_v2$day_of_week, FUN = mean)
write.csv(counts, file = 'avg_ride_length.csv')
1. Los resultados del uso de bicicletas basado en la cantidad de viajes cada día de la semana enseñan que ambos usuarios hacen un uso regular todos los días, pero los usuarios miembros en mayor cantidad.
2. Analizamos la duración de los viajes que realizan los usuarios mostrando como los usuarios ocasionales realizan trayectos más largos, mientras que los miembros realizan viajes mas cortos.
3. Vemos como se comportan los usuarios en el uso de las bicicletas a lo largo del año, obteniendo como resultado que la frecuencia de los viajes y el uso de las bicicletas aumenta entre los meses de Julio y Agosto para ambos usuarios, en usuarios miembros con mayor cantidad comparado a los usuarios ocasionales.
4. Respecto al uso de los diferentes tipos de bicicletas se evidencia la preferencia por el uso de las bicicletas electricas, siendo mayor para los usuarios miembros.