R Markdown

Chargement des extensions / Loading extensions

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.6.3

## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  2.1.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.6.3

## Warning: package 'tidyr' was built under R version 3.6.3

## Warning: package 'purrr' was built under R version 3.6.3

## Warning: package 'dplyr' was built under R version 3.6.3

## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## Warning: package 'lubridate' was built under R version 3.6.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Understand The Data

Charger le fichier .csv?

Load the .csv files into R

# Load each monthly dataset into its own data frame.
 uber_data_apr14 <- read_csv("uber-raw-data-apr14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

 # Read uber-raw-data-may14.csv into its own data frame.
 uber_data_may14 <- read_csv("uber-raw-data-may14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

 # Read uber-raw-data-jun14.csv into its own data frame.
 uber_data_jun14 <- read_csv("uber-raw-data-jun14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

 # Read uber-raw-data-jul14.csv into its own data frame.
 uber_data_jul14 <- read_csv("uber-raw-data-jul14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

 # Read uber-raw-data-aug14.csv into its own data frame.
 uber_data_aug14 <- read_csv("uber-raw-data-aug14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

 # Read uber-raw-data-sep14.csv into its own data frame.
 uber_data_sep14 <- read_csv("uber-raw-data-sep14.csv")

## Parsed with column specification:
## cols(
##   `Date/Time` = col_character(),
##   Lat = col_double(),
##   Lon = col_double(),
##   Base = col_character()
## )

Bind all the data files into one. We may use the bind_rows() function under the dplyr library in R.

# Stack the six monthly tables on top of each other with bind_rows(),
# then preview the first 10 rows of the combined table.
uber_files_concatenated <- bind_rows(
  uber_data_apr14,
  uber_data_may14,
  uber_data_jun14,
  uber_data_jul14,
  uber_data_aug14,
  uber_data_sep14
)
head(uber_files_concatenated, n = 10)

## # A tibble: 10 x 4
##    `Date/Time`        Lat   Lon Base  
##    <chr>            <dbl> <dbl> <chr> 
##  1 4/1/2014 0:11:00  40.8 -74.0 B02512
##  2 4/1/2014 0:17:00  40.7 -74.0 B02512
##  3 4/1/2014 0:21:00  40.7 -74.0 B02512
##  4 4/1/2014 0:28:00  40.8 -74.0 B02512
##  5 4/1/2014 0:33:00  40.8 -74.0 B02512
##  6 4/1/2014 0:33:00  40.7 -74.0 B02512
##  7 4/1/2014 0:39:00  40.7 -74.0 B02512
##  8 4/1/2014 0:45:00  40.8 -74.0 B02512
##  9 4/1/2014 0:55:00  40.8 -74.0 B02512
## 10 4/1/2014 1:01:00  40.8 -74.0 B02512

Vérifions la dimension totale des données combinées

# Expected shape of the combined table: the row counts of the six monthly
# tables added together, plus the column count of the monthly tables (they
# all have the same four columns). Adding the dim() vectors directly would
# also add up the column counts (6 x 4 = 24), which is not a meaningful
# number of columns and cannot be compared with dim() of the combined table.
dim_combinated_data <- c(
  nrow(uber_data_apr14) + nrow(uber_data_may14) + nrow(uber_data_jun14) +
    nrow(uber_data_jul14) + nrow(uber_data_aug14) + nrow(uber_data_sep14),
  ncol(uber_data_apr14)
)
dim_combinated_data

## [1] 4534327       4

Vérifions si nous obtenons le même nombre de lignes dans le fichier combiné que dans l’ensemble des fichiers d’origine

# Actual dimensions (rows, columns) of the combined table.
dim(uber_files_concatenated)

## [1] 4534327       4

Conclusion : ici, on peut conclure que nous n’avons pas subi de perte d’information lors de la concaténation des données.

Get a summary of the data to get an idea of what you are dealing with.

# Per-column summary of the combined table.
summary(uber_files_concatenated)

##   Date/Time              Lat             Lon             Base          
##  Length:4534327     Min.   :39.66   Min.   :-74.93   Length:4534327    
##  Class :character   1st Qu.:40.72   1st Qu.:-74.00   Class :character  
##  Mode  :character   Median :40.74   Median :-73.98   Mode  :character  
##                     Mean   :40.74   Mean   :-73.97                     
##                     3rd Qu.:40.76   3rd Qu.:-73.97                     
##                     Max.   :42.12   Max.   :-72.07

DATA PREPARATION

This step consists of cleaning and rearranging your data so that you can work on it more easily. It’s a good idea to first think about the sparsity of the dataset and check the amount of missing data. You can see that the first column is `Date/Time`. To be able to use these values, you need to separate them into their components. You can use the lubridate library for this: lubridate makes it simple to identify the order in which the year, month, and day appear in your dates and to manipulate them.

## Step 1: split the `Date/Time` text column into separate date and time
## components with lubridate.
## Pull the raw timestamp strings out of the combined table.
col_date_time <- uber_files_concatenated[["Date/Time"]]

## Parse the strings once (month/day/year hour:minute:second), then store
## each extracted component as a single-column data frame named after the
## component, so that it carries its column name into the final table.
date_time <- mdy_hms(col_date_time)

# Date components
Month <- data.frame(Month = month(date_time))
Day <- data.frame(Day = day(date_time))
Year <- data.frame(Year = year(date_time))

# Time components
Hour <- data.frame(Hour = hour(date_time))
Minute <- data.frame(Minute = minute(date_time))
Second <- data.frame(Second = second(date_time))

# Keep only the calendar date; it is used to derive the day of the week.
Date <- lubridate::as_date(date_time)

# Day of the week as a number, as returned by wday().
Weekday <- data.frame(Weekday = lubridate::wday(Date))

# Carry over the latitude, longitude and base columns of the combined table,
# again as single-column data frames.
Lat <- data.frame(Lat = uber_files_concatenated[["Lat"]])
Lon <- data.frame(Lon = uber_files_concatenated[["Lon"]])
Base <- data.frame(Base = uber_files_concatenated[["Base"]])


## Assemble the final table by placing the single-column data frames side by
## side with bind_cols(); the column names come from the data frames built
## above. Then preview the first 20 rows.
uber_final <- bind_cols(
  Lat, Lon, Base,
  Year, Month, Day, Weekday,
  Hour, Minute, Second
)
head(uber_final, n = 20)

##        Lat      Lon   Base Year Month Day Weekday Hour Minute Second
## 1  40.7690 -73.9549 B02512 2014     4   1       3    0     11      0
## 2  40.7267 -74.0345 B02512 2014     4   1       3    0     17      0
## 3  40.7316 -73.9873 B02512 2014     4   1       3    0     21      0
## 4  40.7588 -73.9776 B02512 2014     4   1       3    0     28      0
## 5  40.7594 -73.9722 B02512 2014     4   1       3    0     33      0
## 6  40.7383 -74.0403 B02512 2014     4   1       3    0     33      0
## 7  40.7223 -73.9887 B02512 2014     4   1       3    0     39      0
## 8  40.7620 -73.9790 B02512 2014     4   1       3    0     45      0
## 9  40.7524 -73.9960 B02512 2014     4   1       3    0     55      0
## 10 40.7575 -73.9846 B02512 2014     4   1       3    1      1      0
## 11 40.7256 -73.9869 B02512 2014     4   1       3    1     19      0
## 12 40.7591 -73.9684 B02512 2014     4   1       3    1     48      0
## 13 40.7271 -73.9803 B02512 2014     4   1       3    1     49      0
## 14 40.6463 -73.7896 B02512 2014     4   1       3    2     11      0
## 15 40.7564 -73.9167 B02512 2014     4   1       3    2     25      0
## 16 40.7666 -73.9531 B02512 2014     4   1       3    2     31      0
## 17 40.7580 -73.9761 B02512 2014     4   1       3    2     43      0
## 18 40.7238 -73.9821 B02512 2014     4   1       3    3     22      0
## 19 40.7531 -74.0039 B02512 2014     4   1       3    3     35      0
## 20 40.7389 -74.0393 B02512 2014     4   1       3    3     35      0

Uber Pickups in NYC Avec R

Hurgland-Nick KELIET

26 mai 2019