For your assignment you may be using different dataset than what is included here.
Always read carefully the instructions on Sakai.
Tasks/questions to be completed/answered are highlighted in larger bolded fonts and numbered according to their section.
We are going to use tidyverse a collection of R packages designed for data science.
## Loading required package: tidyverse
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.4.2 v dplyr 0.7.4
## v tidyr 0.8.0 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Name your dataset ‘mydata’ so it easy to work with.
Commands: read_csv() head() mean() sub() as.numeric()
mydata <- read_csv("data/Taxi_Trips_Sample.csv")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Trip Seconds` = col_integer(),
## `Trip Miles` = col_double(),
## `Pickup Census Tract` = col_double(),
## `Dropoff Census Tract` = col_double(),
## `Pickup Community Area` = col_integer(),
## `Dropoff Community Area` = col_integer(),
## `Pickup Centroid Latitude` = col_double(),
## `Pickup Centroid Longitude` = col_double(),
## `Dropoff Centroid Latitude` = col_double(),
## `Dropoff Centroid Longitude` = col_double(),
## `Community Areas` = col_integer()
## )
## See spec(...) for full column specifications.
mydata <- na.omit(mydata)
head(mydata)
## # A tibble: 6 x 24
## `Trip ID` `Taxi ID` `Trip Start Tim~ `Trip End Times~ `Trip Seconds`
## <chr> <chr> <chr> <chr> <int>
## 1 3e7d6e5c4~ bc1c0381e3b~ 09/23/2013 05:1~ 09/23/2013 05:1~ 420
## 2 3e7d706fe~ 631de3a4ad3~ 05/05/2014 09:3~ 05/05/2014 09:4~ 900
## 3 3e7d70d4a~ 623c452c15d~ 03/26/2016 01:3~ 03/26/2016 01:3~ 300
## 4 3e7d7115e~ 288345b3a02~ 05/13/2015 10:3~ 05/13/2015 10:4~ 540
## 5 3e7d712c0~ dd9b35afedd~ 06/17/2016 04:0~ 06/17/2016 04:3~ 1140
## 6 3e7d71a6e~ 19a9ad5b3b7~ 03/15/2013 10:1~ 03/15/2013 10:4~ 1200
## # ... with 19 more variables: `Trip Miles` <dbl>, `Pickup Census
## # Tract` <dbl>, `Dropoff Census Tract` <dbl>, `Pickup Community
## # Area` <int>, `Dropoff Community Area` <int>, Fare <chr>, Tips <chr>,
## # Tolls <chr>, Extras <chr>, `Trip Total` <chr>, `Payment Type` <chr>,
## # Company <chr>, `Pickup Centroid Latitude` <dbl>, `Pickup Centroid
## # Longitude` <dbl>, `Pickup Centroid Location` <chr>, `Dropoff Centroid
## # Latitude` <dbl>, `Dropoff Centroid Longitude` <dbl>, `Dropoff Centroid
## # Location` <chr>, `Community Areas` <int>
Fare <- mydata$Fare
Tips <- mydata$Tips
Tolls <- mydata$Tolls
Extras <- mydata$Extras
head(Fare)
## [1] "$6.05" "$10.45" "$6.25" "$7.25" "$13.25" "$35.25"
head(Tips)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$2.00" "$11.15"
head(Tolls)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$0.00" "$0.00"
head(Extras)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$0.00" "$2.00"
** What happens when we try to use the function? **
mean(Fare)
## Warning in mean.default(Fare): argument is not numeric or logical:
## returning NA
## [1] NA
mean(Tips)
## Warning in mean.default(Tips): argument is not numeric or logical:
## returning NA
## [1] NA
mean(Tolls)
## Warning in mean.default(Tolls): argument is not numeric or logical:
## returning NA
## [1] NA
mean(Extras)
## Warning in mean.default(Extras): argument is not numeric or logical:
## returning NA
## [1] NA
To resolve the error, check if the feature data type is correct
Notice that comma ‘1,234’ in some values
Also the data type and dollar sign ‘$’ symbol
To remove the any character in this case the comma from “1,234”. We must substitute it with just an empty space.
** substitute comma with “” **
Fare <- sub(pattern = "," , replacement = "", x = Fare)
Tips <- sub(pattern = "," , replacement = "", x = Tips)
Tolls <- sub(pattern = "," , replacement = "", x = Tolls)
Extras <- sub(pattern = "," , replacement = "", x = Extras)
head(Fare)
## [1] "$6.05" "$10.45" "$6.25" "$7.25" "$13.25" "$35.25"
head(Tips)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$2.00" "$11.15"
head(Tolls)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$0.00" "$0.00"
head(Extras)
## [1] "$0.00" "$0.00" "$0.00" "$0.00" "$0.00" "$2.00"
** substitute dollar sign with “” **
Fare <- sub(pattern = "\\$" , replacement = "", x = Fare)
Tips <- sub(pattern = "\\$" , replacement = "", x = Tips)
Tolls <- sub(pattern = "\\$" , replacement = "", x = Tolls)
Extras <- sub(pattern = "\\$" , replacement = "", x = Extras)
head(Fare)
## [1] "6.05" "10.45" "6.25" "7.25" "13.25" "35.25"
head(Tips)
## [1] "0.00" "0.00" "0.00" "0.00" "2.00" "11.15"
head(Tolls)
## [1] "0.00" "0.00" "0.00" "0.00" "0.00" "0.00"
head(Extras)
## [1] "0.00" "0.00" "0.00" "0.00" "0.00" "2.00"
** character to numeric - as.numeric()**
Fare <- as.numeric(Fare)
Tips <- as.numeric(Tips)
Tolls <- as.numeric(Tolls)
Extras <- as.numeric(Extras)
** mean with NA removed **
# VARIABLE_MEAN <- mean( YOUR_VARIABLE , na.rm = TRUE)
Fare_mean <- mean(Fare , na.rm = TRUE)
Tips_mean <- mean(Tips , na.rm = TRUE)
Tolls_mean <- mean(Tolls , na.rm = TRUE)
Extras_mean <- mean(Extras , na.rm = TRUE)
Fare_mean
## [1] 10.73069
Tips_mean
## [1] 1.105968
Tolls_mean
## [1] 0.0003457773
Extras_mean
## [1] 0.7472891
** Clean variables to dataset **
# mydata$VARIABLE <- VARIABLE
mydata$Fare <- Fare
mydata$Tips <- Tips
mydata$Tolls <- Tolls
mydata$Extras <- Extras
mydata
## # A tibble: 39,939 x 24
## `Trip ID` `Taxi ID` `Trip Start Tim~ `Trip End Times~ `Trip Seconds`
## <chr> <chr> <chr> <chr> <int>
## 1 3e7d6e5c4~ bc1c0381e3~ 09/23/2013 05:1~ 09/23/2013 05:1~ 420
## 2 3e7d706fe~ 631de3a4ad~ 05/05/2014 09:3~ 05/05/2014 09:4~ 900
## 3 3e7d70d4a~ 623c452c15~ 03/26/2016 01:3~ 03/26/2016 01:3~ 300
## 4 3e7d7115e~ 288345b3a0~ 05/13/2015 10:3~ 05/13/2015 10:4~ 540
## 5 3e7d712c0~ dd9b35afed~ 06/17/2016 04:0~ 06/17/2016 04:3~ 1140
## 6 3e7d71a6e~ 19a9ad5b3b~ 03/15/2013 10:1~ 03/15/2013 10:4~ 1200
## 7 3e7d71e3a~ 73680c341d~ 05/20/2015 10:0~ 05/20/2015 10:0~ 0
## 8 3e7d71fb2~ 1cbeb81c01~ 01/29/2015 10:3~ 01/29/2015 10:4~ 1080
## 9 3e7d722c8~ 2e9532b316~ 04/26/2014 03:0~ 04/26/2014 03:0~ 780
## 10 3e7d72410~ 4eec0654a5~ 02/21/2013 05:4~ 02/21/2013 06:0~ 540
## # ... with 39,929 more rows, and 19 more variables: `Trip Miles` <dbl>,
## # `Pickup Census Tract` <dbl>, `Dropoff Census Tract` <dbl>, `Pickup
## # Community Area` <int>, `Dropoff Community Area` <int>, Fare <dbl>,
## # Tips <dbl>, Tolls <dbl>, Extras <dbl>, `Trip Total` <chr>, `Payment
## # Type` <chr>, Company <chr>, `Pickup Centroid Latitude` <dbl>, `Pickup
## # Centroid Longitude` <dbl>, `Pickup Centroid Location` <chr>, `Dropoff
## # Centroid Latitude` <dbl>, `Dropoff Centroid Longitude` <dbl>, `Dropoff
## # Centroid Location` <chr>, `Community Areas` <int>
** Save clean data **
# write_csv(mydata, path = "data/mydata_clean.csv")
write_csv(mydata, path = "data/Taxi_Trips_Sample_clean.csv")
In this task we must calculate the mean, standard deviation, maximum, and minimum for the given feature.
mydata <- read_csv(file = "data/Taxi_Trips_Sample_clean.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## `Trip ID` = col_character(),
## `Taxi ID` = col_character(),
## `Trip Start Timestamp` = col_character(),
## `Trip End Timestamp` = col_character(),
## `Trip Seconds` = col_integer(),
## `Pickup Community Area` = col_integer(),
## `Dropoff Community Area` = col_integer(),
## `Trip Total` = col_character(),
## `Payment Type` = col_character(),
## Company = col_character(),
## `Pickup Centroid Location` = col_character(),
## `Dropoff Centroid Location` = col_character(),
## `Community Areas` = col_integer()
## )
## See spec(...) for full column specifications.
Trip_Seconds <- mydata$'Trip Seconds'
** calculate the average **
mean(Trip_Seconds)
## [1] 709.3322
** calculate the standard deviation **
sd(Trip_Seconds)
## [1] 659.8013
** calculate the min **
min(Trip_Seconds, na.rm = TRUE)
## [1] 0
** calculate the max **
max(Trip_Seconds, na.rm = TRUE)
## [1] 8940
To find the outliers we are going to look at the upper and lower limits
An outlier is value that “lies outside” most of the other values in a set of data.
A method to find upper and lower thresholds involves finding the interquartile range.
** quantile calculation for the give feature**
quantile(Trip_Seconds, na.rm = TRUE)
## 0% 25% 50% 75% 100%
## 0 360 540 840 8940
** Lower and upper quantile calculation **
# lowerq = quantile(VARIABLE)[2]
# upperq = quantile(VARIABLE)[4]
lowerq = quantile(Trip_Seconds, na.rm = TRUE)[2]
upperq = quantile(Trip_Seconds, na.rm = TRUE)[4]
Interquantile calculation
iqr = upperq - lowerq
iqr
## 75%
## 480
The threshold is the boundaries that determine if a value is an outlier.
If the value falls above the upper threshold or below the lower threshold, it is an outlier.
** Calculation the upper threshold **
# upper_threshold = (iqr * 1.5) + upperq
upper_threshold = (iqr * 1.5) + upperq
upper_threshold
## 75%
## 1560
** Calculation the lower threshold **
# lower_threshold = lowerq - (iqr * 1.5)
lower_threshold = lowerq - (iqr * 1.5)
lower_threshold
## 25%
## -360
** Identify outliers **
# VARIABLE[ VARIABLE > upper_threshold][1:10]
# VARIABLE[ VARIABLE > lower_threshold][1:10]
Trip_Seconds[ Trip_Seconds > upper_threshold][1:10]
## [1] 1680 1860 1980 3360 4500 2760 1860 2700 2940 3300
Trip_Seconds[ Trip_Seconds < lower_threshold][1:10]
## [1] NA NA NA NA NA NA NA NA NA NA
** Finding outliers records **
# mydata[ VARIABLE > upper_threshold, ][1:10]
# mydata[ VARIABLE > lower_threshold, ][1:10]
mydata[ Trip_Seconds > upper_threshold, ][1:10]
## # A tibble: 3,385 x 10
## `Trip ID` `Taxi ID` `Trip Start Tim~ `Trip End Times~ `Trip Seconds`
## <chr> <chr> <chr> <chr> <int>
## 1 3e7d78b99~ b80ae25954~ 06/06/2014 03:1~ 06/06/2014 03:4~ 1680
## 2 3e7d851f8~ 259ec57ee4~ 09/18/2014 12:0~ 09/18/2014 12:3~ 1860
## 3 3e7d87103~ 78fb99d332~ 03/24/2013 04:1~ 03/24/2013 04:4~ 1980
## 4 3e7d8b799~ 74605d6aa0~ 07/23/2015 04:0~ 07/23/2015 04:4~ 3360
## 5 3e7d8f714~ 97c6416993~ 09/30/2015 04:4~ 09/30/2015 06:0~ 4500
## 6 3e7d91de6~ 0de60e5618~ 10/30/2015 08:0~ 10/30/2015 08:4~ 2760
## 7 3e7d96aa7~ b918186326~ 02/05/2014 09:3~ 02/05/2014 10:0~ 1860
## 8 3e7d99a42~ 5cfc2dce21~ 08/19/2015 03:4~ 08/19/2015 04:3~ 2700
## 9 3e7d9d2f5~ 24b87270f3~ 06/08/2016 04:4~ 06/08/2016 05:3~ 2940
## 10 3e7da54c9~ 8d13012158~ 02/14/2014 07:4~ 02/14/2014 08:4~ 3300
## # ... with 3,375 more rows, and 5 more variables: `Trip Miles` <dbl>,
## # `Pickup Census Tract` <dbl>, `Dropoff Census Tract` <dbl>, `Pickup
## # Community Area` <int>, `Dropoff Community Area` <int>
mydata[ Trip_Seconds < lower_threshold, ][1:10]
## # A tibble: 0 x 10
## # ... with 10 variables: `Trip ID` <chr>, `Taxi ID` <chr>, `Trip Start
## # Timestamp` <chr>, `Trip End Timestamp` <chr>, `Trip Seconds` <int>,
## # `Trip Miles` <dbl>, `Pickup Census Tract` <dbl>, `Dropoff Census
## # Tract` <dbl>, `Pickup Community Area` <int>, `Dropoff Community
## # Area` <int>
count(mydata[ Trip_Seconds > upper_threshold, ])
## # A tibble: 1 x 1
## n
## <int>
## 1 3385
count(mydata[ Trip_Seconds < lower_threshold, ])
## # A tibble: 1 x 1
## n
## <int>
## 1 0
It can also be useful to visualize the data using a box and whisker plot.
The boxplot supports the IQR also shows the upper and lower thresholds
# p <- ggplot(data = Scoring, aes(x = "", y = Finrat)) + geom_boxplot() + coord_flip()
# p
p <- ggplot(data = mydata, aes(x = "", y = Trip_Seconds)) + geom_boxplot() + coord_flip()
p
Chicago Taxi Dashboard: https://data.cityofchicago.org/Transportation/Taxi-Trips-Dashboard/spcw-brbq
Chicago Taxi Data Description: http://digital.cityofchicago.org/index.php/chicago-taxi-data-released
knitr::include_graphics('imgs/tripmiles.png')
summary(mydata)
## Trip ID Taxi ID Trip Start Timestamp
## Length:39939 Length:39939 Length:39939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Trip End Timestamp Trip Seconds Trip Miles Pickup Census Tract
## Length:39939 Min. : 0.0 Min. : 0.00 Min. :1.703e+10
## Class :character 1st Qu.: 360.0 1st Qu.: 0.00 1st Qu.:1.703e+10
## Mode :character Median : 540.0 Median : 0.50 Median :1.703e+10
## Mean : 709.3 Mean : 1.95 Mean :1.703e+10
## 3rd Qu.: 840.0 3rd Qu.: 1.50 3rd Qu.:1.703e+10
## Max. :8940.0 Max. :1115.30 Max. :1.703e+10
## Dropoff Census Tract Pickup Community Area Dropoff Community Area
## Min. :1.703e+10 Min. : 1.00 Min. : 1.00
## 1st Qu.:1.703e+10 1st Qu.: 8.00 1st Qu.: 8.00
## Median :1.703e+10 Median : 8.00 Median : 8.00
## Mean :1.703e+10 Mean :21.58 Mean :21.19
## 3rd Qu.:1.703e+10 3rd Qu.:32.00 3rd Qu.:32.00
## Max. :1.703e+10 Max. :77.00 Max. :77.00
## Fare Tips Tolls Extras
## Min. : 0.00 Min. : 0.000 Min. :0.0000000 Min. : 0.0000
## 1st Qu.: 5.65 1st Qu.: 0.000 1st Qu.:0.0000000 1st Qu.: 0.0000
## Median : 7.25 Median : 0.000 Median :0.0000000 Median : 0.0000
## Mean : 10.73 Mean : 1.106 Mean :0.0003458 Mean : 0.7473
## 3rd Qu.: 10.45 3rd Qu.: 2.000 3rd Qu.:0.0000000 3rd Qu.: 1.0000
## Max. :106.25 Max. :50.000 Max. :3.0000000 Max. :213.0000
## Trip Total Payment Type Company
## Length:39939 Length:39939 Length:39939
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## Pickup Centroid Latitude Pickup Centroid Longitude
## Min. :41.79 Min. :-87.90
## 1st Qu.:41.88 1st Qu.:-87.64
## Median :41.89 Median :-87.63
## Mean :41.90 Mean :-87.65
## 3rd Qu.:41.90 3rd Qu.:-87.62
## Max. :42.02 Max. :-87.58
## Pickup Centroid Location Dropoff Centroid Latitude
## Length:39939 Min. :41.78
## Class :character 1st Qu.:41.88
## Mode :character Median :41.89
## Mean :41.90
## 3rd Qu.:41.90
## Max. :42.02
## Dropoff Centroid Longitude Dropoff Centroid Location Community Areas
## Min. :-87.90 Length:39939 Min. : 1.00
## 1st Qu.:-87.64 Class :character 1st Qu.:37.00
## Median :-87.63 Mode :character Median :37.00
## Mean :-87.65 Mean :40.63
## 3rd Qu.:-87.62 3rd Qu.:38.00
## Max. :-87.58 Max. :76.00
knitr::include_graphics('imgs/taxierdplus-diagram.png')