Modern Dive 2

library(dplyr)


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

library(ggplot2)
library(nycflights13)

Learning Check 3.1

# Keep every flight whose destination is neither BTV nor SEA.
# Conditions separated by commas inside filter() are combined with AND.
not_BTV_SEA <- filter(flights, dest != "BTV", dest != "SEA")

Learning Check 3.2

Her data will be biased because the patients with missing values could have had lung cancer, so leaving them out skews the results.

Learning Check 3.3

# Count the rows of `weather`; the result is a one-row tibble with a
# single column named `count`.
summary_temp <- summarize(weather, count = n())
summary_temp

# A tibble: 1 x 1
  count
  <int>
1 26115

It corresponds to the total number of rows (observations) in the weather data.

Learning Check 3.4

# Mean of `temp` over all rows of `weather`, skipping missing values.
summary_temp <- weather %>%
  summarize(mean = mean(temp, na.rm = TRUE))

You have to run it as two separate lines.

# Print the overall mean of `temp`, skipping missing values.
summarize(weather, mean = mean(temp, na.rm = TRUE))

# A tibble: 1 x 1
   mean
  <dbl>
1  55.3

Learning Check 3.5

# Mean and standard deviation of `temp` for each month, skipping
# missing values. One output row per month.
summary_monthly_temp <- summarize(
  group_by(weather, month),
  mean = mean(temp, na.rm = TRUE),
  std_dev = sd(temp, na.rm = TRUE)
)
summary_monthly_temp

# A tibble: 12 x 3
   month  mean std_dev
   <int> <dbl>   <dbl>
 1     1  35.6   10.2 
 2     2  34.3    6.98
 3     3  39.9    6.25
 4     4  51.7    8.79
 5     5  61.8    9.68
 6     6  72.2    7.55
 7     7  80.1    7.12
 8     8  74.5    5.19
 9     9  67.4    8.47
10    10  60.1    8.85
11    11  45.0   10.4 
12    12  38.4    9.98

The standard deviation is lower than the mean.

Learning Check 3.6

# Mean and standard deviation of `temp` for each (year, month, day)
# combination, skipping missing values. One output row per day.
summary_monthly_temp <- summarize(
  group_by(weather, year, month, day),
  mean = mean(temp, na.rm = TRUE),
  std_dev = sd(temp, na.rm = TRUE)
)
summary_monthly_temp

# A tibble: 364 x 5
# Groups:   year, month [12]
    year month   day  mean std_dev
   <int> <int> <int> <dbl>   <dbl>
 1  2013     1     1  37.0    4.00
 2  2013     1     2  28.7    3.45
 3  2013     1     3  30.0    2.58
 4  2013     1     4  34.9    2.45
 5  2013     1     5  37.2    4.01
 6  2013     1     6  40.1    4.40
 7  2013     1     7  40.6    3.68
 8  2013     1     8  40.1    5.77
 9  2013     1     9  43.2    5.40
10  2013     1    10  43.8    2.95
# ... with 354 more rows

Learning Check 3.7

# Attach grouping by origin airport and month to `flights`.
# No rows or columns change; only the grouping metadata is added.
by_monthly_origin <- group_by(flights, origin, month)
by_monthly_origin

# A tibble: 336,776 x 19
# Groups:   origin, month [36]
    year month   day dep_time sched_dep_time dep_delay arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1  2013     1     1      517            515         2      830
 2  2013     1     1      533            529         4      850
 3  2013     1     1      542            540         2      923
 4  2013     1     1      544            545        -1     1004
 5  2013     1     1      554            600        -6      812
 6  2013     1     1      554            558        -4      740
 7  2013     1     1      555            600        -5      913
 8  2013     1     1      557            600        -3      709
 9  2013     1     1      557            600        -3      838
10  2013     1     1      558            600        -2      753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
#   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>

The values are presented differently.

Learning Check 3.8

# Group `flights` by the value of n(). This adds a column named `n()`
# and puts every row into one single group.
# NOTE(review): despite the variable name, nothing here groups by
# `carrier` -- confirm whether group_by(carrier) followed by
# summarize(count = n()) was intended.
by_carrier <- group_by(flights, n())
by_carrier

# A tibble: 336,776 x 20
# Groups:   n() [1]
    year month   day dep_time sched_dep_time dep_delay arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1  2013     1     1      517            515         2      830
 2  2013     1     1      533            529         4      850
 3  2013     1     1      542            540         2      923
 4  2013     1     1      544            545        -1     1004
 5  2013     1     1      554            600        -6      812
 6  2013     1     1      554            558        -4      740
 7  2013     1     1      555            600        -5      913
 8  2013     1     1      557            600        -3      709
 9  2013     1     1      557            600        -3      838
10  2013     1     1      558            600        -2      753
# ... with 336,766 more rows, and 13 more variables: sched_arr_time <int>,
#   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
#   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
#   minute <dbl>, time_hour <dttm>, `n()` <int>

Learning Check 3.9

filter() returns the rows that match a condition, while group_by() groups the rows so that a summary of the numerical data can be shown for each group.

Learning Check 3.10

Positive values are flights that made up time and flew faster than normal. Negative values mean the flight took longer than normal. A value of 0 means the flight took the normal amount of time.

Learning Check 3.11

No, because it would not be able to handle times that go past 12 and recognize that the time is continuing.

Learning Check 3.12

The gain is usually right under zero. It can be higher or lower.

Learning Check 3.13

Hour is just a number and not a category.

Learning Check 3.14

Most of the destinations are close.

Learning Check 3.15

Data in normal form are easier to join.

Learning Check 3.16

# Keep only the destination, air time and distance columns.
flight_arr_times <- select(flights, dest, air_time, distance)
flight_arr_times

# A tibble: 336,776 x 3
   dest  air_time distance
   <chr>    <dbl>    <dbl>
 1 IAH        227     1400
 2 IAH        227     1416
 3 MIA        160     1089
 4 BQN        183     1576
 5 ATL        116      762
 6 ORD        150      719
 7 FLL        158     1065
 8 IAD         53      229
 9 MCO        140      944
10 ORD        138      733
# ... with 336,766 more rows

Learning Check 3.17

# Keep the columns whose names start with "dest".
flights_begin_dest <- select(flights, starts_with("dest"))
flights_begin_dest

# A tibble: 336,776 x 1
   dest 
   <chr>
 1 IAH  
 2 IAH  
 3 MIA  
 4 BQN  
 5 ATL  
 6 ORD  
 7 FLL  
 8 IAD  
 9 MCO  
10 ORD  
# ... with 336,766 more rows

# Keep the columns whose names end with "time".
flights_ends_time <- select(flights, ends_with("time"))
flights_ends_time

# A tibble: 336,776 x 5
   dep_time sched_dep_time arr_time sched_arr_time air_time
      <int>          <int>    <int>          <int>    <dbl>
 1      517            515      830            819      227
 2      533            529      850            830      227
 3      542            540      923            850      160
 4      544            545     1004           1022      183
 5      554            600      812            837      116
 6      554            558      740            728      150
 7      555            600      913            854      158
 8      557            600      709            723       53
 9      557            600      838            846      140
10      558            600      753            745      138
# ... with 336,766 more rows

# Keep the columns whose names contain "miles". No column of `flights`
# matches, so the result has all the rows and zero columns.
flights_miles <- select(flights, contains("miles"))
flights_miles

# A tibble: 336,776 x 0

Learning Check 3.18

You can pick out individual data.

Learning Check 3.19

# Take the flights with the five largest arrival delays (ties kept by
# top_n), then sort them from the longest delay to the shortest.
named_airports <- flights %>%
  top_n(5, arr_delay) %>%
  arrange(-arr_delay)

Storytelling with Data: A Data Visualization Guide for Business Professionals, Chapter 3

Chapter 3 tells us how to arrange information so that the person viewing the data will be able to understand it better. The key points the chapter touches on are limiting clutter, not showing the reader too much at one time (so much that they are unable to understand it fully), and how you should arrange data and main topics so the viewer will find them easiest to comprehend.