Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.
Select the path:
setwd("C:/ANALISTA DE DATOS/PROYECTOS FINAL/Negocio de bicicletas compartidas")
Select the file to analyze:
archivo <- "DATOS_COMPLETOS_BICICLETAS_COMPARTIDAS.xlsx"
Load required libraries
install.packages("readxl", dependencies = TRUE)
library(readxl)
We have to load dplyr to collect all data in one data table
install.packages("dplyr")
library(dplyr)
We iterate all solapas from excel to integrate in one
# Obtener los nombres de las hojas (solapas)
hojas <- excel_sheets(archivo)
# Leer cada hoja, aƱadirle el nombre de la hoja como columna opcional, y unir todas
datos_unificados <- lapply(hojas, function(hoja) {
read_excel(archivo, sheet = hoja) %>%
mutate(origen_hoja = hoja) # AƱade una columna con el nombre de la hoja (opcional)
}) %>%
bind_rows()
Now we have all data in one tabla, lets go to analyze
head(datos_unificados)
[38;5;246m# A tibble: 6 x 16[39m
ride_id rideable_type started_at weekday ended_at ride_length start_station_name
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<dttm>[39m[23m [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<dttm>[39m[23m [3m[38;5;246m<dttm>[39m[23m [3m[38;5;246m<chr>[39m[23m
[38;5;250m1[39m CDE6023BE~ electric_bike 2024-06-11 [38;5;246m17:20:06[39m lunes 2024-06-11 [38;5;246m17:21:39[39m 1899-12-31 [38;5;246m00:01:33[39m [31mNA[39m
[38;5;250m2[39m 462B48CD2~ electric_bike 2024-06-11 [38;5;246m17:19:21[39m lunes 2024-06-11 [38;5;246m17:19:36[39m 1899-12-31 [38;5;246m00:00:14[39m [31mNA[39m
[38;5;250m3[39m 9CFB6A858~ electric_bike 2024-06-11 [38;5;246m17:25:27[39m lunes 2024-06-11 [38;5;246m17:30:13[39m 1899-12-31 [38;5;246m00:04:45[39m [31mNA[39m
[38;5;250m4[39m 6365EFEB6~ electric_bike 2024-06-11 [38;5;246m11:53:50[39m lunes 2024-06-11 [38;5;246m12:08:13[39m 1899-12-31 [38;5;246m00:14:22[39m [31mNA[39m
[38;5;250m5[39m BA0323C33~ electric_bike 2024-06-11 [38;5;246m00:11:08[39m lunes 2024-06-11 [38;5;246m00:11:22[39m 1899-12-31 [38;5;246m00:00:14[39m [31mNA[39m
[38;5;250m6[39m DE26F0D72~ electric_bike 2024-06-11 [38;5;246m00:12:38[39m lunes 2024-06-11 [38;5;246m00:12:57[39m 1899-12-31 [38;5;246m00:00:19[39m [31mNA[39m
[38;5;246m# i 9 more variables: start_station_id <chr>, end_station_name <chr>, end_station_id <chr>, start_lat <dbl>,[39m
[38;5;246m# start_lng <dbl>, end_lat <dbl>, end_lng <dbl>, member_casual <chr>, origen_hoja <chr>[39m
Check the structure of ādatos_unificadosā
str(datos_unificados)
tibble [5,126,391 x 16] (S3: tbl_df/tbl/data.frame)
$ ride_id : chr [1:5126391] "CDE6023BE6B11D2F" "462B48CD292B6A18" "9CFB6A858D23ABF7" "6365EFEB64231153" ...
$ rideable_type : chr [1:5126391] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:5126391], format: "2024-06-11 17:20:06" "2024-06-11 17:19:21" "2024-06-11 17:25:27" "2024-06-11 11:53:50" ...
$ weekday : chr [1:5126391] "lunes" "lunes" "lunes" "lunes" ...
$ ended_at : POSIXct[1:5126391], format: "2024-06-11 17:21:39" "2024-06-11 17:19:36" "2024-06-11 17:30:13" "2024-06-11 12:08:13" ...
$ ride_length : POSIXct[1:5126391], format: "1899-12-31 00:01:33" "1899-12-31 00:00:14" "1899-12-31 00:04:45" "1899-12-31 00:14:22" ...
$ start_station_name: chr [1:5126391] NA NA NA NA ...
$ start_station_id : chr [1:5126391] NA NA NA NA ...
$ end_station_name : chr [1:5126391] NA NA NA NA ...
$ end_station_id : chr [1:5126391] NA NA NA NA ...
$ start_lat : num [1:5126391] 41.9 41.9 41.9 41.9 41.9 ...
$ start_lng : num [1:5126391] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ end_lat : num [1:5126391] 41.9 41.9 41.9 41.9 41.9 ...
$ end_lng : num [1:5126391] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ member_casual : chr [1:5126391] "casual" "casual" "casual" "casual" ...
$ origen_hoja : chr [1:5126391] "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" ...
glimpse(datos_unificados)
Rows: 5,126,391
Columns: 16
$ ride_id [3m[38;5;246m<chr>[39m[23m "CDE6023BE6B11D2F", "462B48CD292B6A18", "9CFB6A858D23ABF7", "6365EFEB64231153", "BA0323C33134CBA8", "DE26F0~
$ rideable_type [3m[38;5;246m<chr>[39m[23m "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "elec~
$ started_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:20:06, 2024-06-11 17:19:21, 2024-06-11 17:25:27, 2024-06-11 11:53:50, 2024-06-11 00:11:08, 2~
$ weekday [3m[38;5;246m<chr>[39m[23m "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "domingo~
$ ended_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:21:39, 2024-06-11 17:19:36, 2024-06-11 17:30:13, 2024-06-11 12:08:13, 2024-06-11 00:11:22, 2~
$ ride_length [3m[38;5;246m<dttm>[39m[23m 1899-12-31 00:01:33, 1899-12-31 00:00:14, 1899-12-31 00:04:45, 1899-12-31 00:14:22, 1899-12-31 00:00:14, 1~
$ start_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m,~
$ start_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m,~
$ end_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "Wood St & Chicago Ave", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m~
$ end_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "637", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, ~
$ start_lat [3m[38;5;246m<dbl>[39m[23m 41.89, 41.89, 41.93, 41.88, 41.94, 41.94, 41.94, 41.91, 41.91, 41.91, 41.89, 41.80, 41.80, 41.88, 41.89, 41~
$ start_lng [3m[38;5;246m<dbl>[39m[23m -87.65, -87.65, -87.65, -87.64, -87.64, -87.64, -87.64, -87.63, -87.74, -87.74, -87.62, -87.70, -87.70, -87~
$ end_lat [3m[38;5;246m<dbl>[39m[23m 41.89000, 41.89000, 41.94000, 41.88000, 41.94000, 41.94000, 41.93000, 41.87000, 41.91000, 41.91000, 41.8800~
$ end_lng [3m[38;5;246m<dbl>[39m[23m -87.65000, -87.65000, -87.65000, -87.64000, -87.64000, -87.64000, -87.64000, -87.61000, -87.74000, -87.7400~
$ member_casual [3m[38;5;246m<chr>[39m[23m "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual~
$ origen_hoja [3m[38;5;246m<chr>[39m[23m "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "202406~
Change of āride_lengthā to suitable data type
datos_unificados <- datos_unificados %>%
mutate(ride_length_min = as.numeric(difftime(ended_at, started_at, units = "mins")))
Clean of data that have null values:
datos_unificados_clean <- datos_unificados %>%
filter(!is.na(started_at), # sin fechas nulas
!is.na(ended_at),
ride_length_min > 0, # sin viajes negativos o nulos
ride_length_min < 1440 # menos de 24 horas (fuera outliers extremos)
)
Now we have to check the data again:
str(datos_unificados_clean)
tibble [5,120,526 x 17] (S3: tbl_df/tbl/data.frame)
$ ride_id : chr [1:5120526] "CDE6023BE6B11D2F" "462B48CD292B6A18" "9CFB6A858D23ABF7" "6365EFEB64231153" ...
$ rideable_type : chr [1:5120526] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:5120526], format: "2024-06-11 17:20:06" "2024-06-11 17:19:21" "2024-06-11 17:25:27" "2024-06-11 11:53:50" ...
$ weekday : chr [1:5120526] "lunes" "lunes" "lunes" "lunes" ...
$ ended_at : POSIXct[1:5120526], format: "2024-06-11 17:21:39" "2024-06-11 17:19:36" "2024-06-11 17:30:13" "2024-06-11 12:08:13" ...
$ ride_length : POSIXct[1:5120526], format: "1899-12-31 00:01:33" "1899-12-31 00:00:14" "1899-12-31 00:04:45" "1899-12-31 00:14:22" ...
$ start_station_name: chr [1:5120526] NA NA NA NA ...
$ start_station_id : chr [1:5120526] NA NA NA NA ...
$ end_station_name : chr [1:5120526] NA NA NA NA ...
$ end_station_id : chr [1:5120526] NA NA NA NA ...
$ start_lat : num [1:5120526] 41.9 41.9 41.9 41.9 41.9 ...
$ start_lng : num [1:5120526] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ end_lat : num [1:5120526] 41.9 41.9 41.9 41.9 41.9 ...
$ end_lng : num [1:5120526] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ member_casual : chr [1:5120526] "casual" "casual" "casual" "casual" ...
$ origen_hoja : chr [1:5120526] "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" ...
$ ride_length_min : num [1:5120526] 1.553 0.247 4.766 14.377 0.246 ...
glimpse(datos_unificados_clean)
Rows: 5,120,526
Columns: 17
$ ride_id [3m[38;5;246m<chr>[39m[23m "CDE6023BE6B11D2F", "462B48CD292B6A18", "9CFB6A858D23ABF7", "6365EFEB64231153", "BA0323C33134CBA8", "DE26F~
$ rideable_type [3m[38;5;246m<chr>[39m[23m "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "ele~
$ started_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:20:06, 2024-06-11 17:19:21, 2024-06-11 17:25:27, 2024-06-11 11:53:50, 2024-06-11 00:11:08, ~
$ weekday [3m[38;5;246m<chr>[39m[23m "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "doming~
$ ended_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:21:39, 2024-06-11 17:19:36, 2024-06-11 17:30:13, 2024-06-11 12:08:13, 2024-06-11 00:11:22, ~
$ ride_length [3m[38;5;246m<dttm>[39m[23m 1899-12-31 00:01:33, 1899-12-31 00:00:14, 1899-12-31 00:04:45, 1899-12-31 00:14:22, 1899-12-31 00:00:14, ~
$ start_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m~
$ start_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m~
$ end_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "Wood St & Chicago Ave", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mN[39m~
$ end_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "637", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m,~
$ start_lat [3m[38;5;246m<dbl>[39m[23m 41.89, 41.89, 41.93, 41.88, 41.94, 41.94, 41.94, 41.91, 41.91, 41.91, 41.89, 41.80, 41.80, 41.88, 41.89, 4~
$ start_lng [3m[38;5;246m<dbl>[39m[23m -87.65, -87.65, -87.65, -87.64, -87.64, -87.64, -87.64, -87.63, -87.74, -87.74, -87.62, -87.70, -87.70, -8~
$ end_lat [3m[38;5;246m<dbl>[39m[23m 41.89000, 41.89000, 41.94000, 41.88000, 41.94000, 41.94000, 41.93000, 41.87000, 41.91000, 41.91000, 41.880~
$ end_lng [3m[38;5;246m<dbl>[39m[23m -87.65000, -87.65000, -87.65000, -87.64000, -87.64000, -87.64000, -87.64000, -87.61000, -87.74000, -87.740~
$ member_casual [3m[38;5;246m<chr>[39m[23m "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casua~
$ origen_hoja [3m[38;5;246m<chr>[39m[23m "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "20240~
$ ride_length_min [3m[38;5;246m<dbl>[39m[23m 1.5529167, 0.2468333, 4.7657667, 14.3768833, 0.2460167, 0.3236167, 6.5761833, 64.3043500, 0.2990667, 0.355~
we eliminate ride_length because is in a wrong data type:
datos_unificados_clean <- datos_unificados_clean %>%
select(-ride_length) %>%
mutate(ride_length_min = as.numeric(difftime(ended_at, started_at, units = "mins")))
Rename ride_length_min to ride_length:
datos_unificados_clean <- datos_unificados_clean %>%
rename(ride_length = ride_length_min)
Check the data again:
str(datos_unificados_clean)
tibble [5,120,526 x 16] (S3: tbl_df/tbl/data.frame)
$ ride_id : chr [1:5120526] "CDE6023BE6B11D2F" "462B48CD292B6A18" "9CFB6A858D23ABF7" "6365EFEB64231153" ...
$ rideable_type : chr [1:5120526] "electric_bike" "electric_bike" "electric_bike" "electric_bike" ...
$ started_at : POSIXct[1:5120526], format: "2024-06-11 17:20:06" "2024-06-11 17:19:21" "2024-06-11 17:25:27" "2024-06-11 11:53:50" ...
$ weekday : chr [1:5120526] "lunes" "lunes" "lunes" "lunes" ...
$ ended_at : POSIXct[1:5120526], format: "2024-06-11 17:21:39" "2024-06-11 17:19:36" "2024-06-11 17:30:13" "2024-06-11 12:08:13" ...
$ start_station_name: chr [1:5120526] NA NA NA NA ...
$ start_station_id : chr [1:5120526] NA NA NA NA ...
$ end_station_name : chr [1:5120526] NA NA NA NA ...
$ end_station_id : chr [1:5120526] NA NA NA NA ...
$ start_lat : num [1:5120526] 41.9 41.9 41.9 41.9 41.9 ...
$ start_lng : num [1:5120526] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ end_lat : num [1:5120526] 41.9 41.9 41.9 41.9 41.9 ...
$ end_lng : num [1:5120526] -87.7 -87.7 -87.7 -87.6 -87.6 ...
$ member_casual : chr [1:5120526] "casual" "casual" "casual" "casual" ...
$ origen_hoja : chr [1:5120526] "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" "202406-divvy-tripdata" ...
$ ride_length : num [1:5120526] 1.553 0.247 4.766 14.377 0.246 ...
glimpse(datos_unificados_clean) # Requiere dplyr
Rows: 5,120,526
Columns: 16
$ ride_id [3m[38;5;246m<chr>[39m[23m "CDE6023BE6B11D2F", "462B48CD292B6A18", "9CFB6A858D23ABF7", "6365EFEB64231153", "BA0323C33134CBA8", "DE26F~
$ rideable_type [3m[38;5;246m<chr>[39m[23m "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "electric_bike", "ele~
$ started_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:20:06, 2024-06-11 17:19:21, 2024-06-11 17:25:27, 2024-06-11 11:53:50, 2024-06-11 00:11:08, ~
$ weekday [3m[38;5;246m<chr>[39m[23m "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "lunes", "doming~
$ ended_at [3m[38;5;246m<dttm>[39m[23m 2024-06-11 17:21:39, 2024-06-11 17:19:36, 2024-06-11 17:30:13, 2024-06-11 12:08:13, 2024-06-11 00:11:22, ~
$ start_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m~
$ start_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m~
$ end_station_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "Wood St & Chicago Ave", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mN[39m~
$ end_station_id [3m[38;5;246m<chr>[39m[23m [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, "637", [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m, [31mNA[39m,~
$ start_lat [3m[38;5;246m<dbl>[39m[23m 41.89, 41.89, 41.93, 41.88, 41.94, 41.94, 41.94, 41.91, 41.91, 41.91, 41.89, 41.80, 41.80, 41.88, 41.89, 4~
$ start_lng [3m[38;5;246m<dbl>[39m[23m -87.65, -87.65, -87.65, -87.64, -87.64, -87.64, -87.64, -87.63, -87.74, -87.74, -87.62, -87.70, -87.70, -8~
$ end_lat [3m[38;5;246m<dbl>[39m[23m 41.89000, 41.89000, 41.94000, 41.88000, 41.94000, 41.94000, 41.93000, 41.87000, 41.91000, 41.91000, 41.880~
$ end_lng [3m[38;5;246m<dbl>[39m[23m -87.65000, -87.65000, -87.65000, -87.64000, -87.64000, -87.64000, -87.64000, -87.61000, -87.74000, -87.740~
$ member_casual [3m[38;5;246m<chr>[39m[23m "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casual", "casua~
$ origen_hoja [3m[38;5;246m<chr>[39m[23m "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "202406-divvy-tripdata", "20240~
$ ride_length [3m[38;5;246m<dbl>[39m[23m 1.5529167, 0.2468333, 4.7657667, 14.3768833, 0.2460167, 0.3236167, 6.5761833, 64.3043500, 0.2990667, 0.355~
create a basic descriptive analysis:
summary(datos_unificados_clean$ride_length)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0006 5.4419 9.5069 14.9672 16.7934 1439.9346
datos_unificados_clean %>%
summarise(
total_viajes = n(),
duracion_promedio = mean(ride_length),
duracion_mediana = median(ride_length),
duracion_max = max(ride_length),
duracion_min = min(ride_length)
)
[38;5;246m# A tibble: 1 x 5[39m
total_viajes duracion_promedio duracion_mediana duracion_max duracion_min
[3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m 5[4m1[24m[4m2[24m[4m0[24m526 15.0 9.51 [4m1[24m440. 0.000[4m6[24m[4m5[24m[4m0[24m
we can see how many travels for each user:
datos_unificados_clean %>%
count(member_casual)
[38;5;246m# A tibble: 2 x 2[39m
member_casual n
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m
[38;5;250m1[39m casual 1[4m8[24m[4m7[24m[4m6[24m658
[38;5;250m2[39m member 3[4m2[24m[4m4[24m[4m3[24m868
and the mean of each user:
datos_unificados_clean %>%
group_by(member_casual) %>%
summarise(duracion_media = mean(ride_length), .groups = "drop")
[38;5;246m# A tibble: 2 x 2[39m
member_casual duracion_media
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m casual 20.2
[38;5;250m2[39m member 11.9
We see that casual are using bicycles nearly the double than members Now we are going to see the difference by day:
datos_unificados_clean %>%
group_by(weekday) %>%
summarise(
viajes = n(),
duracion_prom = mean(ride_length),
.groups = "drop"
) %>%
arrange(match(weekday, c("lunes", "martes", "miƩrcoles", "jueves", "viernes", "sƔbado", "domingo")))
[38;5;246m# A tibble: 7 x 3[39m
weekday viajes duracion_prom
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<int>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m lunes [4m7[24m[4m0[24m[4m6[24m134 13.1
[38;5;250m2[39m martes [4m7[24m[4m5[24m[4m3[24m528 13.5
[38;5;250m3[39m miƩrcoles [4m7[24m[4m3[24m[4m0[24m141 13.6
[38;5;250m4[39m jueves [4m7[24m[4m5[24m[4m2[24m254 14.9
[38;5;250m5[39m viernes [4m8[24m[4m0[24m[4m2[24m696 17.7
[38;5;250m6[39m sƔbado [4m6[24m[4m8[24m[4m3[24m626 17.9
[38;5;250m7[39m domingo [4m6[24m[4m9[24m[4m2[24m147 13.9
We can see that people prefer weekends to use bycicles
library(ggplot2)
# Duración media por tipo de usuario
ggplot(datos_unificados_clean, aes(x = member_casual, y = ride_length)) +
geom_boxplot() +
coord_cartesian(ylim = c(0, 60)) + # corta valores extremos
labs(title = "Duración de viajes por tipo de usuario", y = "Minutos")
We can limit the Y-axis to avoid showing extremely long rides (for example, 60 minutes):
ggplot(datos_unificados_clean, aes(x = member_casual, y = ride_length)) +
geom_boxplot() +
ylim(0, 60) +
labs(title = "Duración de viajes por tipo de usuario",
y = "Minutos", x = "member_casual")
We are going to explore atipical behaviours:
outliers <- datos_unificados_clean %>%
group_by(member_casual) %>%
summarise(Q1 = quantile(ride_length, 0.25, na.rm = TRUE),
Q3 = quantile(ride_length, 0.75, na.rm = TRUE)) %>%
mutate(IQR = Q3 - Q1,
lower = Q1 - 1.5 * IQR,
upper = Q3 + 1.5 * IQR)
outliers
[38;5;246m# A tibble: 2 x 6[39m
member_casual Q1 Q3 IQR lower upper
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m casual 6.51 21.9 15.4 -[31m16[39m[31m.[39m[31m5[39m 44.9
[38;5;250m2[39m member 5.01 14.5 9.49 -[31m9[39m[31m.[39m[31m22[39m 28.7
ggplot(datos_unificados_clean, aes(x = member_casual, y = ride_length)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
labs(title = "Viajes por tipo de usuario (con outliers resaltados)",
y = "Minutos",
x = "Tipo de usuario")
There are many extremely long outlier trips.
This could be due to:
Errors in data collection (e.g., the bike was not returned properly).
Users who forgot to end the ride.
Actual prolonged usage cases (less likely).