Data Wrangling by R

Dinesh

5/6/2021

R Markdown

Data wrangling is the process of cleaning, organizing and enriching raw complex data sets for easy access and analysis. It is a important step in between data acquisition and exploratory data analysis (EDA) in Data Science Process.

http://rmarkdown.rstudio.com.

To prepare your data for analysis, as part of data wrangling, there are six (6) basic steps that need to be followed.

Data Discovery
Data Structuring
Data Cleaning
Data Enriching
Data Validation
Data Publishing

##Lets jump to answer some of the questions

library(dplyr)
library(hflights)
library(nycflights13)
library(leaflet)
library(plotly)
library(ggmap)
library(mapdata)
library(maps)
library(kableExtra)

Data Exploration

You can also embed plots, for example:

## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

glimpse(flights)

## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2~
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, ~
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, ~
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1~
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,~
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,~
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1~
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "~
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4~
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394~
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",~
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",~
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1~
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, ~
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6~
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0~
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0~

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

df_airlines <- airlines
head(airlines)

## # A tibble: 6 x 2
##   carrier name                    
##   <chr>   <chr>                   
## 1 9E      Endeavor Air Inc.       
## 2 AA      American Airlines Inc.  
## 3 AS      Alaska Airlines Inc.    
## 4 B6      JetBlue Airways         
## 5 DL      Delta Air Lines Inc.    
## 6 EV      ExpressJet Airlines Inc.

df_airports <- airports
head(airports)

## # A tibble: 6 x 8
##   faa   name                          lat   lon   alt    tz dst   tzone         
##   <chr> <chr>                       <dbl> <dbl> <dbl> <dbl> <chr> <chr>         
## 1 04G   Lansdowne Airport            41.1 -80.6  1044    -5 A     America/New_Y~
## 2 06A   Moton Field Municipal Airp~  32.5 -85.7   264    -6 A     America/Chica~
## 3 06C   Schaumburg Regional          42.0 -88.1   801    -6 A     America/Chica~
## 4 06N   Randall Airport              41.4 -74.4   523    -5 A     America/New_Y~
## 5 09J   Jekyll Island Airport        31.1 -81.4    11    -5 A     America/New_Y~
## 6 0A9   Elizabethton Municipal Air~  36.4 -82.2  1593    -5 A     America/New_Y~

df_weather <- weather
head(weather)

## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # ... with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

glimpse(weather)

## Rows: 26,115
## Columns: 15
## $ origin     <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW~
## $ year       <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,~
## $ month      <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ day        <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,~
## $ hour       <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, ~
## $ temp       <dbl> 39.02, 39.02, 39.02, 39.92, 39.02, 37.94, 39.02, 39.92, 39.~
## $ dewp       <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.~
## $ humid      <dbl> 59.37, 61.63, 64.43, 62.21, 64.43, 67.21, 64.43, 62.21, 62.~
## $ wind_dir   <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,~
## $ wind_speed <dbl> 10.35702, 8.05546, 11.50780, 12.65858, 12.65858, 11.50780, ~
## $ wind_gust  <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.~
## $ precip     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ pressure   <dbl> 1012.0, 1012.3, 1012.5, 1012.2, 1011.9, 1012.4, 1012.2, 101~
## $ visib      <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,~
## $ time_hour  <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00~

df_planes <- planes
head(planes)

## # A tibble: 6 x 9
##   tailnum  year type           manufacturer   model  engines seats speed engine 
##   <chr>   <int> <chr>          <chr>          <chr>    <int> <int> <int> <chr>  
## 1 N10156   2004 Fixed wing mu~ EMBRAER        EMB-1~       2    55    NA Turbo-~
## 2 N102UW   1998 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 3 N103US   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 4 N104UW   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~
## 5 N10575   2002 Fixed wing mu~ EMBRAER        EMB-1~       2    55    NA Turbo-~
## 6 N105UW   1999 Fixed wing mu~ AIRBUS INDUST~ A320-~       2   182    NA Turbo-~

Before we jump into answering the questions, lets clean the data

duplicates <- df_flights %>% 
  #filter(origin %in% c("EWR", "LGA", "JFK") & dest %in% c("IAH", "IWS")) %>% 
  group_by(origin, dest, year, month, day, dep_time, flight) %>% 
  tally() %>% 
  filter(n > 1) %>% 
  pull()

print(duplicates)

## integer(0)

Good News!!! There are no duplicates in the dataset. Now, lets identify how many rows has atleast one null value and what are they?

rows_null <- df_flights[rowSums(is.na(df_flights)) > 0, ]
col_null_count <- colSums(is.na(rows_null))
df_nullcount <- as.data.frame(col_null_count) %>% 
  rename(nullcount = col_null_count) %>%
  filter(nullcount > 0)
         
print(df_nullcount)

##           nullcount
## dep_time       8255
## dep_delay      8255
## arr_time       8713
## arr_delay      9430
## tailnum        2512
## air_time       9430

From the above table, its clear that 6 rows in the dataset has NULL values.

The questions that we want to ask ourself:

Q1) How many different destinations can you fly to from Newyork airport in year 2013 ( EWR, LGA, JFK) ?

df_dest <- df_flights %>%
  filter(!dest %in% c("EWR", "LGA", "JFK")) %>%
  distinct(dest) %>% 
    left_join(., df_airports, c("dest" =  "faa")) 

dest_count <- df_dest %>%
  nrow() 
print(paste0("Total Destinations offered from New York Airports in year 2013: ", dest_count))

## [1] "Total Destinations offered from New York Airports in year 2013: 104"

Lets point out the destinations on a map.

#Excluding destinations with out Airport Name, latitude and longitude
df_sp <- df_dest %>% 
  filter(!((is.na(name) | is.na(lat) | is.na(lon))))

#leaflet(df_sp) %>% 
#  addTiles() %>% 
#  addMarkers(lng = ~lon, lat = ~lat, popup = ~as.character(name),label = ~as.character(name))

fig <- df_sp %>%
  plot_ly(
    lat = ~lat,
    lon = ~lon,
    marker = list(color = "lightcoral"),
    type = 'scattermapbox',
      size = 5,
    hovertext = df_sp$name) 
fig <- fig %>%
  layout(
    mapbox = list(
      style = 'open-street-map',
      zoom =2.5,
      center = list(lon = -88, lat = 34))) 

fig

## No scattermapbox mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

creating static map

states <- map_data("state")
ggplot() +
  theme_nothing() + 
  geom_polygon(data = states, aes(x = long , y = lat, group =  group), fill = NA, color = "gray" )   +
  geom_point(data = df_sp, aes(x = lon, y = lat)  ,show.legend = F) + 
  coord_fixed(1.3) +
  guides(fill=FALSE)

Q2) How many flights flew to Dallas in an year from NewYork Airports ?

flights_dallas <- df_flights %>% 
  filter(origin %in% c("EWR", "LGA", "JFK") & dest %in% c("DAL", "DFW", "RBD")) %>% 
  group_by(year) %>% 
  tally() %>% 
  select(n) %>% pull()

print(paste0("Total number of flight trips from New York Airports to Dallas in an year:", flights_dallas))

## [1] "Total number of flight trips from New York Airports to Dallas in an year:8738"

**Q3) Which flights (carrier + flight number) happen everyday and where do they fly?

df_everyday <- df_flights %>% 
  group_by(carrier, flight, origin, dest) %>% 
  summarise(totaltrips = n()) %>% 
  ungroup(.) %>% filter(totaltrips == 365)   #As 2013 is not a leap year

## `summarise()` has grouped output by 'carrier', 'flight', 'origin'. You can override using the `.groups` argument.

#print(df_everyday)
knitr::kable(df_everyday)

carrier	flight	origin	dest	totaltrips
AA	59	JFK	SFO	365
AA	119	EWR	LAX	365
AA	181	JFK	LAX	365
AA	1357	JFK	SJU	365
AA	1611	LGA	MIA	365
B6	219	JFK	CLT	365
B6	359	JFK	BUR	365
B6	371	LGA	FLL	365
B6	431	LGA	SRQ	365
B6	703	JFK	SJU	365
B6	1783	JFK	MCO	365
DL	2159	JFK	MCO	365
DL	2391	JFK	TPA	365
EV	5712	JFK	IAD	365
UA	15	EWR	HNL	365
VX	251	JFK	LAS	365
VX	407	JFK	LAX	365
VX	413	JFK	LAX	365