Chargement des extensions / Loading extensions
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 2.1.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'purrr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
## Warning: package 'lubridate' was built under R version 3.6.3
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Load each monthly dataset into its own data frame, starting with the
# April 2014 file (uber-raw-data-apr14.csv).
uber_data_apr14 <- readr::read_csv(file = "uber-raw-data-apr14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Read the May 2014 file (uber-raw-data-may14.csv) into a tibble.
uber_data_may14 <- readr::read_csv(file = "uber-raw-data-may14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Read the June 2014 file (uber-raw-data-jun14.csv) into a tibble.
uber_data_jun14 <- readr::read_csv(file = "uber-raw-data-jun14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Read the July 2014 file (uber-raw-data-jul14.csv) into a tibble.
uber_data_jul14 <- readr::read_csv(file = "uber-raw-data-jul14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Read the August 2014 file (uber-raw-data-aug14.csv) into a tibble.
uber_data_aug14 <- readr::read_csv(file = "uber-raw-data-aug14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Read the September 2014 file (uber-raw-data-sep14.csv) into a tibble.
uber_data_sep14 <- readr::read_csv(file = "uber-raw-data-sep14.csv")
## Parsed with column specification:
## cols(
## `Date/Time` = col_character(),
## Lat = col_double(),
## Lon = col_double(),
## Base = col_character()
## )
# Stack the six monthly tables row-wise into a single table with bind_rows().
uber_files_concatenated <- bind_rows(
  uber_data_apr14,
  uber_data_may14,
  uber_data_jun14,
  uber_data_jul14,
  uber_data_aug14,
  uber_data_sep14
)
# Preview the first ten rows of the combined table.
head(uber_files_concatenated, n = 10)
## # A tibble: 10 x 4
## `Date/Time` Lat Lon Base
## <chr> <dbl> <dbl> <chr>
## 1 4/1/2014 0:11:00 40.8 -74.0 B02512
## 2 4/1/2014 0:17:00 40.7 -74.0 B02512
## 3 4/1/2014 0:21:00 40.7 -74.0 B02512
## 4 4/1/2014 0:28:00 40.8 -74.0 B02512
## 5 4/1/2014 0:33:00 40.8 -74.0 B02512
## 6 4/1/2014 0:33:00 40.7 -74.0 B02512
## 7 4/1/2014 0:39:00 40.7 -74.0 B02512
## 8 4/1/2014 0:45:00 40.8 -74.0 B02512
## 9 4/1/2014 0:55:00 40.8 -74.0 B02512
## 10 4/1/2014 1:01:00 40.8 -74.0 B02512
# Expected dimensions of the combined table, derived from the six monthly
# tables: the row counts add up, while the columns are the union of the
# column names. Summing the dim() vectors directly would also add the column
# counts (6 files x 4 columns = 24), which cannot be compared with the
# dimensions of the combined table.
monthly_tables <- list(
  uber_data_apr14, uber_data_may14, uber_data_jun14,
  uber_data_jul14, uber_data_aug14, uber_data_sep14
)
dim_combinated_data <- c(
  sum(vapply(monthly_tables, nrow, integer(1))),
  length(unique(unlist(lapply(monthly_tables, names))))
)
dim_combinated_data
## [1] 4534327 4
# Actual dimensions of the combined table, for comparison with the above.
dim(uber_files_concatenated)
## [1] 4534327 4
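Conclusion : le nombre total de lignes des six fichiers (4 534 327) est identique au nombre de lignes de la table concaténée ; nous pouvons donc conclure que nous n’avons pas subi de perte d’information lors de la concaténation des données.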
# Per-column summary of the combined table.
uber_files_concatenated %>% summary()
## Date/Time Lat Lon Base
## Length:4534327 Min. :39.66 Min. :-74.93 Length:4534327
## Class :character 1st Qu.:40.72 1st Qu.:-74.00 Class :character
## Mode :character Median :40.74 Median :-73.98 Mode :character
## Mean :40.74 Mean :-73.97
## 3rd Qu.:40.76 3rd Qu.:-73.97
## Max. :42.12 Max. :-72.07
This step consists of cleaning and rearranging your data so that you can work on it more easily. It’s a good idea to first think about the sparsity of the dataset and check the amount of missing data. You can see that the first column is `Date/Time`. To be able to use these values, you need to separate them into their date and time components. So let’s do that; you can use the lubridate library for this. Lubridate makes it simple to identify the order in which the year, month, and day appear in your dates and to manipulate them.
## Step 1: split the `Date/Time` column into its date and time components
## with lubridate.
## Raw timestamp strings, e.g. "4/1/2014 0:11:00"
## (month/day/year hour:minute:second).
col_date_time <- uber_files_concatenated$`Date/Time`
# Parse the strings once; every component below is derived from this vector.
date_time <- mdy_hms(col_date_time)
# Calendar date of each timestamp, used to derive the day of the week.
Date <- lubridate::as_date(date_time)

## Step 2: build the final table in a single call, directly from plain
## vectors. Wrapping every vector in its own one-column data frame before
## binding them is not needed, and on a table of this size (about 4.5 million
## rows) it only creates extra copies. The result is a plain data frame with
## one row per record of `uber_files_concatenated`, in the same order.
uber_final <- data.frame(
  # Coordinates and base code are carried over unchanged.
  Lat = uber_files_concatenated$Lat,
  Lon = uber_files_concatenated$Lon,
  Base = uber_files_concatenated$Base,
  # Date components.
  Year = year(date_time),
  Month = month(date_time),
  Day = day(date_time),
  # Day of the week as a number; with lubridate's default week start,
  # 1 = Sunday (so Tuesday 2014-04-01 is 3).
  Weekday = lubridate::wday(Date),
  # Time components.
  Hour = hour(date_time),
  Minute = minute(date_time),
  Second = second(date_time),
  # Keep `Base` as character on every R version: before R 4.0 the default
  # was to convert strings to factors.
  stringsAsFactors = FALSE
)
head(uber_final, 20)
## Lat Lon Base Year Month Day Weekday Hour Minute Second
## 1 40.7690 -73.9549 B02512 2014 4 1 3 0 11 0
## 2 40.7267 -74.0345 B02512 2014 4 1 3 0 17 0
## 3 40.7316 -73.9873 B02512 2014 4 1 3 0 21 0
## 4 40.7588 -73.9776 B02512 2014 4 1 3 0 28 0
## 5 40.7594 -73.9722 B02512 2014 4 1 3 0 33 0
## 6 40.7383 -74.0403 B02512 2014 4 1 3 0 33 0
## 7 40.7223 -73.9887 B02512 2014 4 1 3 0 39 0
## 8 40.7620 -73.9790 B02512 2014 4 1 3 0 45 0
## 9 40.7524 -73.9960 B02512 2014 4 1 3 0 55 0
## 10 40.7575 -73.9846 B02512 2014 4 1 3 1 1 0
## 11 40.7256 -73.9869 B02512 2014 4 1 3 1 19 0
## 12 40.7591 -73.9684 B02512 2014 4 1 3 1 48 0
## 13 40.7271 -73.9803 B02512 2014 4 1 3 1 49 0
## 14 40.6463 -73.7896 B02512 2014 4 1 3 2 11 0
## 15 40.7564 -73.9167 B02512 2014 4 1 3 2 25 0
## 16 40.7666 -73.9531 B02512 2014 4 1 3 2 31 0
## 17 40.7580 -73.9761 B02512 2014 4 1 3 2 43 0
## 18 40.7238 -73.9821 B02512 2014 4 1 3 3 22 0
## 19 40.7531 -74.0039 B02512 2014 4 1 3 3 35 0
## 20 40.7389 -74.0393 B02512 2014 4 1 3 3 35 0