library(readxl)
url <- "http://jamessuleiman.com/mba676/assets/units/unit07/fruit_prices_july_2016.xlsx"
destfile <- "fruit_prices_july_2016.xlsx"
download.file(url, destfile, mode= "wb")
fruit_prices_july_2016_ <- read_excel(destfile)
head(fruit_prices_july_2016_)
## # A tibble: 6 × 4
## Item Country Unit UnitPrice
## <chr> <chr> <chr> <chr>
## 1 Blackberries United Kingdom kg <U+0080>4.33
## 2 Blackcurrants United Kingdom kg £6.69
## 3 Cherries UK kg £4.35
## 4 Cooking Apples UK kg £1.03
## 5 Damsons England lb 0.86
## 6 Dessert Apples United Kingdom lb £1
fruit <- fruit_prices_july_2016_
str(fruit)
## Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 4 variables:
## $ Item : chr "Blackberries" "Blackcurrants" "Cherries" "Cooking Apples" ...
## $ Country : chr "United Kingdom" "United Kingdom" "UK" "UK" ...
## $ Unit : chr "kg" "kg" "kg" "kg" ...
## $ UnitPrice: chr "<U+0080>4.33""| __truncated__ "£6.69" "£4.35" "£1.03" ...
summary(fruit)
## Item Country Unit
## Length:30 Length:30 Length:30
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
## UnitPrice
## Length:30
## Class :character
## Mode :character
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
fruit %>% filter(!complete.cases(.))
## # A tibble: 2 × 4
## Item Country Unit UnitPrice
## <chr> <chr> <chr> <chr>
## 1 Dessert Apples USA <NA> 1.31
## 2 Plums and Gages USA <NA> $2.52
fruit %>% group_by(Country) %>% summarise(n=n())
## # A tibble: 7 × 2
## Country n
## <chr> <int>
## 1 Deutschland 1
## 2 England 1
## 3 Germany 9
## 4 UK 2
## 5 United Kingdom 7
## 6 United States of America 1
## 7 USA 9
fruit$Country <- ifelse(fruit$Country %in% c("Deutschland", "Germany"), "DE", fruit$Country)
fruit$Country <- ifelse(fruit$Country %in% c("England", "United Kingdom"), "UK", fruit$Country)
fruit$Country <- ifelse(fruit$Country %in% c("United States of America","USA"), "US", fruit$Country)
fruit %>% group_by(Country, Unit) %>% summarise(n=n())
## Source: local data frame [6 x 3]
## Groups: Country [?]
##
## Country Unit n
## <chr> <chr> <int>
## 1 DE kg 10
## 2 UK kg 8
## 3 UK lb 2
## 4 US kg 7
## 5 US lb 1
## 6 US <NA> 2
names(fruit)
## [1] "Item" "Country" "Unit" "UnitPrice"
fruit %>% group_by(Country) %>% summarise(n=n())
## # A tibble: 3 × 2
## Country n
## <chr> <int>
## 1 DE 10
## 2 UK 10
## 3 US 10
fruit %>% mutate(Currency = substr(UnitPrice,1,1)) %>% group_by (Country, Currency) %>% summarise(n = n())
## Source: local data frame [8 x 3]
## Groups: Country [?]
##
## Country Currency n
## <chr> <chr> <int>
## 1 DE <U+0080> 10
## 2 UK £ 6
## 3 UK <U+0080> 1
## 4 UK 0 1
## 5 UK 1 1
## 6 UK 6 1
## 7 US $ 9
## 8 US 1 1
fruit$UnitPrice
## [1] "<U+0080>4.33" "£6.69" "£4.35" "£1.03" "0.86" "£1" "£1.93" "6.91"
## [9] "£6.32" "1.49" "<U+0080>4,99" "<U+0080>7,99" "<U+0080>5,00" "<U+0080>1,22" "<U+0080>0,99" "<U+0080>1,15"
## [17] "<U+0080>2,28" "<U+0080>7,94" "<U+0080>7,29" "<U+0080>1,79" "$5.67" "$8.77" "$5.70" "$1.35"
## [25] "$1.13" "1.31" "$2.52" "$9.05" "$8.27" "$1.95"
fruit$UnitPrice <- gsub(",",".",fruit$UnitPrice)
fruit$UnitPrice <- as.numeric(ifelse(is.na(as.numeric(fruit$UnitPrice)), substr(fruit$UnitPrice, 2,nchar(fruit$UnitPrice)), fruit$UnitPrice))
## Warning in ifelse(is.na(as.numeric(fruit$UnitPrice)), substr(fruit
## $UnitPrice, : NAs introduced by coercion
head(fruit)
## # A tibble: 6 × 4
## Item Country Unit UnitPrice
## <chr> <chr> <chr> <dbl>
## 1 Blackberries UK kg 4.33
## 2 Blackcurrants UK kg 6.69
## 3 Cherries UK kg 4.35
## 4 Cooking Apples UK kg 1.03
## 5 Damsons UK lb 0.86
## 6 Dessert Apples UK lb 1.00
#1 British Pound = 1.31 Dollars, 1 #Euro = 1.13 Dollars
library(ggvis)
fruit <- fruit %>% mutate(c_rate = ifelse(Country == "UK", 1.31, ifelse(Country == "DE", 1.13,1))) %>% mutate(DollarPrice = UnitPrice * c_rate)
head(fruit)
## # A tibble: 6 × 6
## Item Country Unit UnitPrice c_rate DollarPrice
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Blackberries UK kg 4.33 1.31 5.6723
## 2 Blackcurrants UK kg 6.69 1.31 8.7639
## 3 Cherries UK kg 4.35 1.31 5.6985
## 4 Cooking Apples UK kg 1.03 1.31 1.3493
## 5 Damsons UK lb 0.86 1.31 1.1266
## 6 Dessert Apples UK lb 1.00 1.31 1.3100
fruit %>% ggvis(~Item, ~DollarPrice) %>% layer_points(fill = ~Country)
fruit %>% group_by(Country) %>% summarise(mean_usd = mean(DollarPrice))
## # A tibble: 3 × 2
## Country mean_usd
## <chr> <dbl>
## 1 DE 4.59232
## 2 UK 4.57321
## 3 US 4.57200
library(validate)
library(tidyr)
fruit_wide <- fruit %>% select(Country, Item, DollarPrice) %>% spread(Country, DollarPrice)
fruit
## # A tibble: 30 × 6
## Item Country Unit UnitPrice c_rate DollarPrice
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Blackberries UK kg 4.33 1.31 5.6723
## 2 Blackcurrants UK kg 6.69 1.31 8.7639
## 3 Cherries UK kg 4.35 1.31 5.6985
## 4 Cooking Apples UK kg 1.03 1.31 1.3493
## 5 Damsons UK lb 0.86 1.31 1.1266
## 6 Dessert Apples UK lb 1.00 1.31 1.3100
## 7 Plums and Gages UK kg 1.93 1.31 2.5283
## 8 Raspberries UK kg 6.91 1.31 9.0521
## 9 Redcurrants UK kg 6.32 1.31 8.2792
## 10 Strawberries UK kg 1.49 1.31 1.9519
## # ... with 20 more rows
fruit_wide %>% check_that(DE <=1.2 * UK, DE<=1.3 * US) %>% summary()
## rule items passes fails nNA error warning expression
## 1 V1 10 10 0 0 FALSE FALSE DE <= 1.2 * UK
## 2 V2 10 10 0 0 FALSE FALSE DE <= 1.3 * US
Q. 1
library(knitr)
library(pander)
url <- "http://jamessuleiman.com/mba676/assets/units/unit07/me_flights2.xlsx"
destfile <- "me_flights.xlsx"
download.file(url, destfile, mode = "wb")
flights <- read_excel(destfile)
pander(head(flights))
Table continues below
2016-01-05 |
AA |
N745VJ |
1815 |
PWM |
CLT |
Charlotte, NC |
2016-01-01 |
AA |
N748UW |
1815 |
PWM |
CLT |
Charlotte, NC |
2016-01-06 |
AA |
N829AW |
1815 |
PWM |
CLT |
Charlotte, NC |
2016-01-04 |
AA |
N755US |
1815 |
PWM |
CLT |
Charlotte, NC |
2016-01-07 |
AA |
N818AW |
1815 |
PWM |
CLT |
Charlotte, NC |
2016-01-08 |
AA |
N765US |
1815 |
PWM |
CLT |
Charlotte, NC |
Table continues below
NC |
North Carolina |
800 |
906 |
923 |
1111 |
NC |
North Carolina |
805 |
802 |
814 |
1034 |
NC |
North Carolina |
800 |
755 |
815 |
1022 |
NC |
North Carolina |
805 |
803 |
817 |
1037 |
NC |
North Carolina |
800 |
758 |
821 |
1034 |
NC |
North Carolina |
800 |
801 |
842 |
1048 |
Table continues below
1047 |
1127 |
0 |
0 |
167 |
1050 |
1040 |
0 |
0 |
165 |
1047 |
1027 |
0 |
0 |
167 |
1050 |
1044 |
0 |
0 |
165 |
1047 |
1039 |
0 |
0 |
167 |
1047 |
1053 |
0 |
0 |
167 |
Table continues below
141 |
813 |
2016-01-05 08:00:00 |
2016-01-05 10:47:00 |
158 |
813 |
2016-01-01 08:05:00 |
2016-01-01 10:50:00 |
152 |
813 |
2016-01-06 08:00:00 |
2016-01-06 10:47:00 |
161 |
813 |
2016-01-04 08:05:00 |
2016-01-04 10:50:00 |
161 |
813 |
2016-01-07 08:00:00 |
2016-01-07 10:47:00 |
172 |
813 |
2016-01-08 08:00:00 |
2016-01-08 10:47:00 |
Table continues below
2016-01-05 09:06:00 |
2016-01-05 09:23:00 |
2016-01-05 11:10:59 |
2016-01-05 11:26:59 |
2016-01-01 08:02:00 |
2016-01-01 08:13:59 |
2016-01-01 10:34:00 |
2016-01-01 10:40:00 |
2016-01-06 07:54:59 |
2016-01-06 08:15:00 |
2016-01-06 10:21:59 |
2016-01-06 10:27:00 |
2016-01-04 08:03:00 |
2016-01-04 08:16:59 |
2016-01-04 10:37:00 |
2016-01-04 10:44:00 |
2016-01-07 07:57:59 |
2016-01-07 08:21:00 |
2016-01-07 10:34:00 |
2016-01-07 10:38:59 |
2016-01-08 08:00:59 |
2016-01-08 08:42:00 |
2016-01-08 10:47:59 |
2016-01-08 10:53:00 |
Table continues below
66 |
66 |
1 |
17 |
16 |
40 |
-3 |
0 |
0 |
12 |
6 |
-10 |
-5 |
0 |
0 |
20 |
5 |
-20 |
-2 |
0 |
0 |
14 |
7 |
-6 |
-2 |
0 |
0 |
23 |
5 |
-8 |
1 |
1 |
0 |
41 |
5 |
6 |
40 |
1 |
26 |
108 |
451.6667 |
0 |
0 |
7 |
140 |
348.4286 |
0 |
0 |
15 |
127 |
384.0945 |
0 |
0 |
4 |
140 |
348.4286 |
0 |
0 |
6 |
133 |
366.7669 |
6 |
0 |
-5 |
126 |
387.1429 |
Q.2 Write a rule that causes flights with an airtime of less than 30 minutes to fail
kable(flights %>% filter(Cancelled == 0) %>% check_that(AirTime > 30) %>% summary())
V1 |
361 |
361 |
0 |
0 |
FALSE |
FALSE |
AirTime > 30 |
Q.3 Using validate, write a rule that checks for flights that arrive before they leave.
flights %>% check_that(Diff = new_ArrTime > new_DepTime) %>% summary()
## rule items passes fails nNA error warning expression
## 1 Diff 361 361 0 0 FALSE FALSE new_ArrTime > new_DepTime
Q.4 Using validate write a rule that checks for flightswithout a FlightDate in January
kable(flights %>% check_that(months(FlightDate) == "March") %>% summary())
V1 |
361 |
0 |
361 |
0 |
FALSE |
FALSE |
months(FlightDate) == “March” |
Q.5 How many unique TailNum’s are there?
Use the **distinct() and n = n()
kable(flights %>% distinct(TailNum) %>% summarise(n = n()))
Q.6 What airlines have average departure delays >= 3 minutes?
flights %>% group_by(Carrier) %>% summarise(meanDepDelay = mean(DepDelay)) %>% filter (meanDepDelay >= 3)
## # A tibble: 4 × 2
## Carrier meanDepDelay
## <chr> <dbl>
## 1 AA 13.000000
## 2 DL 19.333333
## 3 EV 4.591195
## 4 WN 8.203125
Q.7 Graph the average delay (DepDelayMinutes) by carrier.
flights %>% filter(DepDelay >=0) %>% group_by(Carrier) %>% summarise(MeanDelay = mean(DepDelay)) %>% ggvis(~Carrier, ~MeanDelay, fill := "blue") %>% layer_bars()
Q.8 Most important calculation. What percent of the reported flights in Maine originated in the Bangor Airport
pander(flights %>% group_by(Origin) %>% summarise(n = n()) %>% mutate(freq = n/sum(n)))
BGR |
6 |
0.0166205 |
PWM |
355 |
0.9833795 |
Q.9 Were there any flights reported from Bangor to Portland or vice versa?
flights %>% filter(Origin == "Portland" & Origin == "Bangor" , Dest == "Portland" & Dest == "Bangor")
## # A tibble: 0 × 37
## # ... with 37 variables: FlightDate <dttm>, Carrier <chr>, TailNum <chr>,
## # FlightNum <dbl>, Origin <chr>, Dest <chr>, DestCityName <chr>,
## # DestState <chr>, DestStateName <chr>, CRSDepTime <dbl>, DepTime <dbl>,
## # WheelsOff <dbl>, WheelsOn <dbl>, CRSArrTime <dbl>, ArrTime <dbl>,
## # Cancelled <dbl>, Diverted <dbl>, CRSElapsedTime <dbl>,
## # ActualElapsedTime <dbl>, Distance <dbl>, new_CRSDepTime <dttm>,
## # new_CRSArrTime <dttm>, new_DepTime <dttm>, new_WheelsOff <dttm>,
## # new_WheelsOn <dttm>, new_ArrTime <dttm>, DepDelay <dbl>,
## # DepDelayMinutes <dbl>, DepDel15 <dbl>, TaxiOut <dbl>, TaxiIn <dbl>,
## # ArrDelay <dbl>, ArrDelayMinutes <dbl>, ArrDel15 <dbl>,
## # FlightTimeBuffer <dbl>, AirTime <dbl>, AirSpeed <dbl>
flights %>% filter(Dest %in% c('PWM', 'BGR'))
## # A tibble: 0 × 37
## # ... with 37 variables: FlightDate <dttm>, Carrier <chr>, TailNum <chr>,
## # FlightNum <dbl>, Origin <chr>, Dest <chr>, DestCityName <chr>,
## # DestState <chr>, DestStateName <chr>, CRSDepTime <dbl>, DepTime <dbl>,
## # WheelsOff <dbl>, WheelsOn <dbl>, CRSArrTime <dbl>, ArrTime <dbl>,
## # Cancelled <dbl>, Diverted <dbl>, CRSElapsedTime <dbl>,
## # ActualElapsedTime <dbl>, Distance <dbl>, new_CRSDepTime <dttm>,
## # new_CRSArrTime <dttm>, new_DepTime <dttm>, new_WheelsOff <dttm>,
## # new_WheelsOn <dttm>, new_ArrTime <dttm>, DepDelay <dbl>,
## # DepDelayMinutes <dbl>, DepDel15 <dbl>, TaxiOut <dbl>, TaxiIn <dbl>,
## # ArrDelay <dbl>, ArrDelayMinutes <dbl>, ArrDel15 <dbl>,
## # FlightTimeBuffer <dbl>, AirTime <dbl>, AirSpeed <dbl>