library(readxl)
url <- "http://jamessuleiman.com/mba676/assets/units/unit07/fruit_prices_july_2016.xlsx"
destfile <- "fruit_prices_july_2016.xlsx"
download.file(url, destfile, mode= "wb")
fruit_prices_july_2016_ <- read_excel(destfile)
head(fruit_prices_july_2016_)
## # A tibble: 6 × 4
##             Item        Country  Unit UnitPrice
##            <chr>          <chr> <chr>     <chr>
## 1   Blackberries United Kingdom    kg     <U+0080>4.33
## 2  Blackcurrants United Kingdom    kg     £6.69
## 3       Cherries             UK    kg     £4.35
## 4 Cooking Apples             UK    kg     £1.03
## 5        Damsons        England    lb      0.86
## 6 Dessert Apples United Kingdom    lb        £1
fruit <- fruit_prices_july_2016_
str(fruit)
## Classes 'tbl_df', 'tbl' and 'data.frame':    30 obs. of  4 variables:
##  $ Item     : chr  "Blackberries" "Blackcurrants" "Cherries" "Cooking Apples" ...
##  $ Country  : chr  "United Kingdom" "United Kingdom" "UK" "UK" ...
##  $ Unit     : chr  "kg" "kg" "kg" "kg" ...
##  $ UnitPrice: chr  "<U+0080>4.33""| __truncated__ "£6.69" "£4.35" "£1.03" ...
summary(fruit)
##      Item             Country              Unit          
##  Length:30          Length:30          Length:30         
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##   UnitPrice        
##  Length:30         
##  Class :character  
##  Mode  :character
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
fruit %>% filter(!complete.cases(.))
## # A tibble: 2 × 4
##              Item Country  Unit UnitPrice
##             <chr>   <chr> <chr>     <chr>
## 1  Dessert Apples     USA  <NA>      1.31
## 2 Plums and Gages     USA  <NA>     $2.52
fruit %>% group_by(Country) %>% summarise(n=n())
## # A tibble: 7 × 2
##                    Country     n
##                      <chr> <int>
## 1              Deutschland     1
## 2                  England     1
## 3                  Germany     9
## 4                       UK     2
## 5           United Kingdom     7
## 6 United States of America     1
## 7                      USA     9
fruit$Country <- ifelse(fruit$Country %in% c("Deutschland", "Germany"), "DE", fruit$Country)

fruit$Country <- ifelse(fruit$Country %in% c("England", "United Kingdom"), "UK", fruit$Country)

fruit$Country <- ifelse(fruit$Country %in% c("United States of America","USA"), "US", fruit$Country)
fruit %>%  group_by(Country, Unit) %>% summarise(n=n())
## Source: local data frame [6 x 3]
## Groups: Country [?]
## 
##   Country  Unit     n
##     <chr> <chr> <int>
## 1      DE    kg    10
## 2      UK    kg     8
## 3      UK    lb     2
## 4      US    kg     7
## 5      US    lb     1
## 6      US  <NA>     2
names(fruit)
## [1] "Item"      "Country"   "Unit"      "UnitPrice"
fruit %>%  group_by(Country) %>% summarise(n=n())
## # A tibble: 3 × 2
##   Country     n
##     <chr> <int>
## 1      DE    10
## 2      UK    10
## 3      US    10
fruit %>% mutate(Currency = substr(UnitPrice,1,1)) %>% group_by (Country, Currency) %>% summarise(n = n())
## Source: local data frame [8 x 3]
## Groups: Country [?]
## 
##   Country Currency     n
##     <chr>    <chr> <int>
## 1      DE        <U+0080>    10
## 2      UK        £     6
## 3      UK        <U+0080>     1
## 4      UK        0     1
## 5      UK        1     1
## 6      UK        6     1
## 7      US        $     9
## 8      US        1     1
fruit$UnitPrice
##  [1] "<U+0080>4.33" "£6.69" "£4.35" "£1.03" "0.86"  "£1"    "£1.93" "6.91" 
##  [9] "£6.32" "1.49"  "<U+0080>4,99" "<U+0080>7,99" "<U+0080>5,00" "<U+0080>1,22" "<U+0080>0,99" "<U+0080>1,15"
## [17] "<U+0080>2,28" "<U+0080>7,94" "<U+0080>7,29" "<U+0080>1,79" "$5.67" "$8.77" "$5.70" "$1.35"
## [25] "$1.13" "1.31"  "$2.52" "$9.05" "$8.27" "$1.95"
fruit$UnitPrice <- gsub(",",".",fruit$UnitPrice)
fruit$UnitPrice <- as.numeric(ifelse(is.na(as.numeric(fruit$UnitPrice)), substr(fruit$UnitPrice, 2,nchar(fruit$UnitPrice)), fruit$UnitPrice))
## Warning in ifelse(is.na(as.numeric(fruit$UnitPrice)), substr(fruit
## $UnitPrice, : NAs introduced by coercion
head(fruit)
## # A tibble: 6 × 4
##             Item Country  Unit UnitPrice
##            <chr>   <chr> <chr>     <dbl>
## 1   Blackberries      UK    kg      4.33
## 2  Blackcurrants      UK    kg      6.69
## 3       Cherries      UK    kg      4.35
## 4 Cooking Apples      UK    kg      1.03
## 5        Damsons      UK    lb      0.86
## 6 Dessert Apples      UK    lb      1.00
#1 British Pound = 1.31 Dollars, 1 #Euro = 1.13 Dollars
library(ggvis)
fruit <- fruit %>% mutate(c_rate = ifelse(Country == "UK", 1.31, ifelse(Country == "DE", 1.13,1))) %>% mutate(DollarPrice = UnitPrice * c_rate)
head(fruit)
## # A tibble: 6 × 6
##             Item Country  Unit UnitPrice c_rate DollarPrice
##            <chr>   <chr> <chr>     <dbl>  <dbl>       <dbl>
## 1   Blackberries      UK    kg      4.33   1.31      5.6723
## 2  Blackcurrants      UK    kg      6.69   1.31      8.7639
## 3       Cherries      UK    kg      4.35   1.31      5.6985
## 4 Cooking Apples      UK    kg      1.03   1.31      1.3493
## 5        Damsons      UK    lb      0.86   1.31      1.1266
## 6 Dessert Apples      UK    lb      1.00   1.31      1.3100
fruit %>% ggvis(~Item, ~DollarPrice) %>% layer_points(fill = ~Country)
fruit %>% group_by(Country) %>% summarise(mean_usd = mean(DollarPrice))
## # A tibble: 3 × 2
##   Country mean_usd
##     <chr>    <dbl>
## 1      DE  4.59232
## 2      UK  4.57321
## 3      US  4.57200
library(validate)
library(tidyr)
fruit_wide <- fruit %>% select(Country, Item, DollarPrice) %>% spread(Country, DollarPrice)
fruit
## # A tibble: 30 × 6
##               Item Country  Unit UnitPrice c_rate DollarPrice
##              <chr>   <chr> <chr>     <dbl>  <dbl>       <dbl>
## 1     Blackberries      UK    kg      4.33   1.31      5.6723
## 2    Blackcurrants      UK    kg      6.69   1.31      8.7639
## 3         Cherries      UK    kg      4.35   1.31      5.6985
## 4   Cooking Apples      UK    kg      1.03   1.31      1.3493
## 5          Damsons      UK    lb      0.86   1.31      1.1266
## 6   Dessert Apples      UK    lb      1.00   1.31      1.3100
## 7  Plums and Gages      UK    kg      1.93   1.31      2.5283
## 8      Raspberries      UK    kg      6.91   1.31      9.0521
## 9      Redcurrants      UK    kg      6.32   1.31      8.2792
## 10    Strawberries      UK    kg      1.49   1.31      1.9519
## # ... with 20 more rows
fruit_wide %>% check_that(DE <=1.2 * UK, DE<=1.3 * US) %>% summary()
##   rule items passes fails nNA error warning     expression
## 1   V1    10     10     0   0 FALSE   FALSE DE <= 1.2 * UK
## 2   V2    10     10     0   0 FALSE   FALSE DE <= 1.3 * US

Q. 1

library(knitr)
library(pander)
url <- "http://jamessuleiman.com/mba676/assets/units/unit07/me_flights2.xlsx"

destfile <- "me_flights.xlsx"
download.file(url, destfile, mode = "wb")
flights <- read_excel(destfile)
pander(head(flights))
Table continues below
FlightDate Carrier TailNum FlightNum Origin Dest DestCityName
2016-01-05 AA N745VJ 1815 PWM CLT Charlotte, NC
2016-01-01 AA N748UW 1815 PWM CLT Charlotte, NC
2016-01-06 AA N829AW 1815 PWM CLT Charlotte, NC
2016-01-04 AA N755US 1815 PWM CLT Charlotte, NC
2016-01-07 AA N818AW 1815 PWM CLT Charlotte, NC
2016-01-08 AA N765US 1815 PWM CLT Charlotte, NC
Table continues below
DestState DestStateName CRSDepTime DepTime WheelsOff WheelsOn
NC North Carolina 800 906 923 1111
NC North Carolina 805 802 814 1034
NC North Carolina 800 755 815 1022
NC North Carolina 805 803 817 1037
NC North Carolina 800 758 821 1034
NC North Carolina 800 801 842 1048
Table continues below
CRSArrTime ArrTime Cancelled Diverted CRSElapsedTime
1047 1127 0 0 167
1050 1040 0 0 165
1047 1027 0 0 167
1050 1044 0 0 165
1047 1039 0 0 167
1047 1053 0 0 167
Table continues below
ActualElapsedTime Distance new_CRSDepTime new_CRSArrTime
141 813 2016-01-05 08:00:00 2016-01-05 10:47:00
158 813 2016-01-01 08:05:00 2016-01-01 10:50:00
152 813 2016-01-06 08:00:00 2016-01-06 10:47:00
161 813 2016-01-04 08:05:00 2016-01-04 10:50:00
161 813 2016-01-07 08:00:00 2016-01-07 10:47:00
172 813 2016-01-08 08:00:00 2016-01-08 10:47:00
Table continues below
new_DepTime new_WheelsOff new_WheelsOn new_ArrTime
2016-01-05 09:06:00 2016-01-05 09:23:00 2016-01-05 11:10:59 2016-01-05 11:26:59
2016-01-01 08:02:00 2016-01-01 08:13:59 2016-01-01 10:34:00 2016-01-01 10:40:00
2016-01-06 07:54:59 2016-01-06 08:15:00 2016-01-06 10:21:59 2016-01-06 10:27:00
2016-01-04 08:03:00 2016-01-04 08:16:59 2016-01-04 10:37:00 2016-01-04 10:44:00
2016-01-07 07:57:59 2016-01-07 08:21:00 2016-01-07 10:34:00 2016-01-07 10:38:59
2016-01-08 08:00:59 2016-01-08 08:42:00 2016-01-08 10:47:59 2016-01-08 10:53:00
Table continues below
DepDelay DepDelayMinutes DepDel15 TaxiOut TaxiIn ArrDelay
66 66 1 17 16 40
-3 0 0 12 6 -10
-5 0 0 20 5 -20
-2 0 0 14 7 -6
-2 0 0 23 5 -8
1 1 0 41 5 6
ArrDelayMinutes ArrDel15 FlightTimeBuffer AirTime AirSpeed
40 1 26 108 451.6667
0 0 7 140 348.4286
0 0 15 127 384.0945
0 0 4 140 348.4286
0 0 6 133 366.7669
6 0 -5 126 387.1429

Q.2 Write a rule that causes flights with an airtime of less than 30 minutes to fail

kable(flights %>% filter(Cancelled == 0) %>% check_that(AirTime > 30) %>% summary())
rule items passes fails nNA error warning expression
V1 361 361 0 0 FALSE FALSE AirTime > 30

Q.3 Using validate, write a rule that checks for flights that arrive before they leave.

flights %>% check_that(Diff = new_ArrTime > new_DepTime) %>% summary()
##   rule items passes fails nNA error warning                expression
## 1 Diff   361    361     0   0 FALSE   FALSE new_ArrTime > new_DepTime

Q.4 Using validate write a rule that checks for flightswithout a FlightDate in January

kable(flights %>% check_that(months(FlightDate) == "March") %>% summary())
rule items passes fails nNA error warning expression
V1 361 0 361 0 FALSE FALSE months(FlightDate) == “March”

Q.5 How many unique TailNum’s are there?

Use the **distinct() and n = n()

kable(flights %>% distinct(TailNum) %>% summarise(n = n()))

n

263

Q.6 What airlines have average departure delays >= 3 minutes?

flights %>% group_by(Carrier) %>% summarise(meanDepDelay = mean(DepDelay)) %>% filter (meanDepDelay >= 3)
## # A tibble: 4 × 2
##   Carrier meanDepDelay
##     <chr>        <dbl>
## 1      AA    13.000000
## 2      DL    19.333333
## 3      EV     4.591195
## 4      WN     8.203125

Q.7 Graph the average delay (DepDelayMinutes) by carrier.

flights %>% filter(DepDelay >=0) %>% group_by(Carrier) %>% summarise(MeanDelay = mean(DepDelay)) %>% ggvis(~Carrier, ~MeanDelay, fill := "blue") %>% layer_bars()

Q.8 Most important calculation. What percent of the reported flights in Maine originated in the Bangor Airport

pander(flights %>% group_by(Origin) %>% summarise(n = n()) %>% mutate(freq = n/sum(n)))
Origin n freq
BGR 6 0.0166205
PWM 355 0.9833795

Q.9 Were there any flights reported from Bangor to Portland or vice versa?

flights %>%  filter(Origin == "Portland" & Origin == "Bangor" , Dest == "Portland" & Dest == "Bangor") 
## # A tibble: 0 × 37
## # ... with 37 variables: FlightDate <dttm>, Carrier <chr>, TailNum <chr>,
## #   FlightNum <dbl>, Origin <chr>, Dest <chr>, DestCityName <chr>,
## #   DestState <chr>, DestStateName <chr>, CRSDepTime <dbl>, DepTime <dbl>,
## #   WheelsOff <dbl>, WheelsOn <dbl>, CRSArrTime <dbl>, ArrTime <dbl>,
## #   Cancelled <dbl>, Diverted <dbl>, CRSElapsedTime <dbl>,
## #   ActualElapsedTime <dbl>, Distance <dbl>, new_CRSDepTime <dttm>,
## #   new_CRSArrTime <dttm>, new_DepTime <dttm>, new_WheelsOff <dttm>,
## #   new_WheelsOn <dttm>, new_ArrTime <dttm>, DepDelay <dbl>,
## #   DepDelayMinutes <dbl>, DepDel15 <dbl>, TaxiOut <dbl>, TaxiIn <dbl>,
## #   ArrDelay <dbl>, ArrDelayMinutes <dbl>, ArrDel15 <dbl>,
## #   FlightTimeBuffer <dbl>, AirTime <dbl>, AirSpeed <dbl>
flights %>% filter(Dest %in% c('PWM', 'BGR'))
## # A tibble: 0 × 37
## # ... with 37 variables: FlightDate <dttm>, Carrier <chr>, TailNum <chr>,
## #   FlightNum <dbl>, Origin <chr>, Dest <chr>, DestCityName <chr>,
## #   DestState <chr>, DestStateName <chr>, CRSDepTime <dbl>, DepTime <dbl>,
## #   WheelsOff <dbl>, WheelsOn <dbl>, CRSArrTime <dbl>, ArrTime <dbl>,
## #   Cancelled <dbl>, Diverted <dbl>, CRSElapsedTime <dbl>,
## #   ActualElapsedTime <dbl>, Distance <dbl>, new_CRSDepTime <dttm>,
## #   new_CRSArrTime <dttm>, new_DepTime <dttm>, new_WheelsOff <dttm>,
## #   new_WheelsOn <dttm>, new_ArrTime <dttm>, DepDelay <dbl>,
## #   DepDelayMinutes <dbl>, DepDel15 <dbl>, TaxiOut <dbl>, TaxiIn <dbl>,
## #   ArrDelay <dbl>, ArrDelayMinutes <dbl>, ArrDel15 <dbl>,
## #   FlightTimeBuffer <dbl>, AirTime <dbl>, AirSpeed <dbl>