Data Wrangling by R

Dinesh

5/6/2021


R Markdown

Data wrangling is the process of cleaning, organizing and enriching raw complex data sets for easy access and analysis. It is a important step in between data acquisition and exploratory data analysis (EDA) in Data Science Process.

http://rmarkdown.rstudio.com.

To prepare your data for analysis, as part of data wrangling, there are six (6) basic steps that need to be followed.

  1. Data Discovery
  2. Data Structuring
  3. Data Cleaning
  4. Data Enriching
  5. Data Validation
  6. Data Publishing

##Lets jump to answer some of the questions

library(dplyr)
library(hflights)
library(nycflights13)
library(leaflet)
library(plotly)
library(ggmap)
library(mapdata)
library(maps)
library(kableExtra)

Data Exploration

You can also embed plots, for example:

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, ~
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1~
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,~
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1~
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "~
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4~
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394~
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",~
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",~
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1~
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, ~
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6~
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0~
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

df_airlines <- airlines
head(airlines)
## # A tibble: 6 x 2
##   carrier name                    
##   <chr>   <chr>                   
## 1 9E      Endeavor Air Inc.       
## 2 AA      American Airlines Inc.  
## 3 AS      Alaska Airlines Inc.    
## 4 B6      JetBlue Airways         
## 5 DL      Delta Air Lines Inc.    
## 6 EV      ExpressJet Airlines Inc.
df_airports <- airports
head(airports)
## # A tibble: 6 x 8
##   faa   name                          lat   lon   alt    tz dst   tzone         
##   <chr> <chr>                       <dbl> <dbl> <dbl> <dbl> <chr> <chr>         
## 1 04G   Lansdowne Airport            41.1 -80.6  1044    -5 A     America/New_Y~
## 2 06A   Moton Field Municipal Airp~  32.5 -85.7   264    -6 A     America/Chica~
## 3 06C   Schaumburg Regional          42.0 -88.1   801    -6 A     America/Chica~
## 4 06N   Randall Airport              41.4 -74.4   523    -5 A     America/New_Y~
## 5 09J   Jekyll Island Airport        31.1 -81.4    11    -5 A     America/New_Y~
## 6 0A9   Elizabethton Municipal Air~  36.4 -82.2  1593    -5 A     America/New_Y~
df_weather <- weather
head(weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>
glimpse(weather)
## Rows: 26,115
## Columns: 15
## $ origin     <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW~
## $ year       <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,~
## $ month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ hour       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, ~
## $ temp       <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.~
## $ dewp       <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.~
## $ humid      <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.~
## $ wind_dir   <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,~
## $ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, ~
## $ wind_gust  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.~
## $ precip     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ pressure   <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101~
## $ visib      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,~
## $ time_hour  <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00~
df_planes <- planes
head(planes)
## # A tibble: 6 x 9
##   tailnum  year type           manufacturer   model  engines seats speed engine 
##   <chr>   <int> <chr>          <chr>          <chr>    <int> <int> <int> <chr>  
## 1 N10156   2004 Fixed wing mu~ EMBRAER        EMB-1~       2    55    NA Turbo-~
## 2 N102UW   1998 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 3 N103US   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 4 N104UW   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 5 N10575   2002 Fixed wing mu~ EMBRAER        EMB-1~       2    55    NA Turbo-~
## 6 N105UW   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~

Before we jump into answering the questions, lets clean the data

duplicates <- df_flights %>% 
  #filter(origin %in% c("EWR", "LGA", "JFK") & dest %in% c("IAH", "IWS")) %>% 
  group_by(origin, dest, year, month, day, dep_time, flight) %>% 
  tally() %>% 
  filter(n > 1) %>% 
  pull()

print(duplicates)
## integer(0)

Good News!!! There are no duplicates in the dataset. Now, lets identify how many rows has atleast one null value and what are they?

rows_null <- df_flights[rowSums(is.na(df_flights)) > 0, ]
col_null_count <- colSums(is.na(rows_null))
df_nullcount <- as.data.frame(col_null_count) %>% 
  rename(nullcount = col_null_count) %>%
  filter(nullcount > 0)
         
print(df_nullcount)
##           nullcount
## dep_time       8255
## dep_delay      8255
## arr_time       8713
## arr_delay      9430
## tailnum        2512
## air_time       9430

From the above table, its clear that 6 rows in the dataset has NULL values.

The questions that we want to ask ourself:

Q1) How many different destinations can you fly to from Newyork airport in year 2013 ( EWR, LGA, JFK) ?

df_dest <- df_flights %>%
  filter(!dest %in% c("EWR", "LGA", "JFK")) %>%
  distinct(dest) %>% 
    left_join(., df_airports, c("dest" =  "faa")) 

dest_count <- df_dest %>%
  nrow() 
print(paste0("Total Destinations offered from New York Airports in year 2013: ", dest_count))
## [1] "Total Destinations offered from New York Airports in year 2013: 104"

Lets point out the destinations on a map.

#Excluding destinations with out Airport Name, latitude and longitude
df_sp <- df_dest %>% 
  filter(!((is.na(name) | is.na(lat) | is.na(lon))))

#leaflet(df_sp) %>% 
#  addTiles() %>% 
#  addMarkers(lng = ~lon, lat = ~lat, popup = ~as.character(name),label = ~as.character(name))

fig <- df_sp %>%
  plot_ly(
    lat = ~lat,
    lon = ~lon,
    marker = list(color = "lightcoral"),
    type = 'scattermapbox',
      size = 5,
    hovertext = df_sp$name) 
fig <- fig %>%
  layout(
    mapbox = list(
      style = 'open-street-map',
      zoom =2.5,
      center = list(lon = -88, lat = 34))) 

fig
## No scattermapbox mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

creating static map

states <- map_data("state")
ggplot() +
  theme_nothing() + 
  geom_polygon(data = states, aes(x = long , y = lat, group =  group), fill = NA, color = "gray" )   +
  geom_point(data = df_sp, aes(x = lon, y = lat)  ,show.legend = F) + 
  coord_fixed(1.3) +
  guides(fill=FALSE)

Q2) How many flights flew to Dallas in an year from NewYork Airports ?

flights_dallas <- df_flights %>% 
  filter(origin %in% c("EWR", "LGA", "JFK") & dest %in% c("DAL", "DFW", "RBD")) %>% 
  group_by(year) %>% 
  tally() %>% 
  select(n) %>% pull()

print(paste0("Total number of flight trips from New York Airports to Dallas in an year:", flights_dallas))
## [1] "Total number of flight trips from New York Airports to Dallas in an year:8738"

**Q3) Which flights (carrier + flight number) happen everyday and where do they fly?

df_everyday <- df_flights %>% 
  group_by(carrier, flight, origin, dest) %>% 
  summarise(totaltrips = n()) %>% 
  ungroup(.) %>% filter(totaltrips == 365)   #As 2013 is not a leap year
## `summarise()` has grouped output by 'carrier', 'flight', 'origin'. You can override using the `.groups` argument.
#print(df_everyday)
knitr::kable(df_everyday)
carrier flight origin dest totaltrips
AA 59 JFK SFO 365
AA 119 EWR LAX 365
AA 181 JFK LAX 365
AA 1357 JFK SJU 365
AA 1611 LGA MIA 365
B6 219 JFK CLT 365
B6 359 JFK BUR 365
B6 371 LGA FLL 365
B6 431 LGA SRQ 365
B6 703 JFK SJU 365
B6 1783 JFK MCO 365
DL 2159 JFK MCO 365
DL 2391 JFK TPA 365
EV 5712 JFK IAD 365
UA 15 EWR HNL 365
VX 251 JFK LAS 365
VX 407 JFK LAX 365
VX 413 JFK LAX 365