Problem Set # 3

Jenine Massat

date()

## [1] "Tue Oct 03 19:56:59 2017"

Due Date: September 27, 2017 Total Points: 45

1 The dataset hflights from the hflights package contains all 227,496 flights that departed Houston in 2011. Using the functions in the dplyr package

Create a new data frame from hflights containing only those flights that departed on September 11th of that year. (5)

library(hflights)
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
hflights.df = tbl_df(hflights)

sep11.df = hflights %>%
  filter(Month == "9" & DayofMonth == "11")
head(sep11.df)

##   Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011     9         11         7    1546    1651            AA       458
## 2 2011     9         11         7     551     904            AA       466
## 3 2011     9         11         7    1936    2036            AA       657
## 4 2011     9         11         7    1438    1544            AA       742
## 5 2011     9         11         7    1720    2030            AA      1294
## 6 2011     9         11         7    1142    1258            AA      1848
##   TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1  N559AA                65      40      -14       -4    IAH  DFW      224
## 2  N3EGAA               133     115      -16       -9    IAH  MIA      964
## 3  N498AA                60      40      -19       -4    IAH  DFW      224
## 4  N470AA                66      43        9       18    IAH  DFW      224
## 5  N3BVAA               130     118      -20       -5    IAH  MIA      964
## 6  N598AA                76      40       -2       -3    IAH  DFW      224
##   TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1     12      13         0                         0
## 2      5      13         0                         0
## 3      8      12         0                         0
## 4      6      17         0                         0
## 5      5       7         0                         0
## 6     22      14         0                         0

How many flights departed on that day? (5)

taxiout.df = sep11.df %>%
  select(TaxiOut)
head(taxiout.df)

##   TaxiOut
## 1      13
## 2      13
## 3      12
## 4      17
## 5       7
## 6      14

sum(taxiout.df)

## [1] 8011

8011 were taxied out or departed.

Create a data frame with the first column being the tail number and the second being the number of departures from Houston the plane made that year sorted by most to least number of flights. (5)

tailanddepart.df = hflights %>% 
  select(TailNum, TaxiOut) %>%
  arrange(desc(TaxiOut))
head(tailanddepart.df)

##   TailNum TaxiOut
## 1  N14998     163
## 2  N27190     152
## 3  N29917     151
## 4  N14558     151
## 5  N510SW     150
## 6  N31412     146

2 Answer the following questions using the tornado data set (http://myweb.fsu.edu/jelsner/temp/data/Tornadoes.txt).

L = "http://myweb.fsu.edu/jelsner/temp/data/Tornadoes.txt"
Torn.df = read.table(L, header = TRUE)
head(Torn.df)

##   OM YEAR MONTH DAY       DATE TIME TIMEZONE STATE FIPS STATENUMBE FSCALE
## 1  1 1950     1   3 1950-01-03 1100        3    MO   29          1      3
## 2  2 1950     1   3 1950-01-03 1155        3    IL   17          2      3
## 3  3 1950     1   3 1950-01-03 1600        3    OH   39          1      1
## 4  4 1950     1  13 1950-01-13  525        3    AR    5          1      3
## 5  5 1950     1  25 1950-01-25 1930        3    MO   29          2      2
## 6  6 1950     1  25 1950-01-25 2100        3    IL   17          3      2
##   INJURIES FATALITIES LOSS CROPLOSS  SLAT   SLON  ELAT   ELON LENGTH WIDTH
## 1        3          0    6        0 38.77 -90.22 38.83 -90.03    9.5   150
## 2        3          0    5        0 39.10 -89.30 39.12 -89.23    3.6   130
## 3        1          0    4        0 40.88 -84.58  0.00   0.00    0.1    10
## 4        1          1    3        0 34.40 -94.37  0.00   0.00    0.6    17
## 5        5          0    5        0 37.60 -90.68 37.63 -90.65    2.3   300
## 6        0          0    5        0 41.17 -87.33  0.00   0.00    0.1   100
##   NS SN SG  F1 F2 F3 F4
## 1  2  0  1   0  0  0  0
## 2  1  1  1 135  0  0  0
## 3  1  1  1 161  0  0  0
## 4  1  1  1 113  0  0  0
## 5  1  1  1  93  0  0  0
## 6  1  1  1  91  0  0  0

Torn.df = tbl_df(Torn.df)
Torn.df

## # A tibble: 56,221 x 28
##       OM  YEAR MONTH   DAY       DATE  TIME TIMEZONE  STATE  FIPS
##  * <int> <int> <int> <int>     <fctr> <int>    <int> <fctr> <int>
##  1     1  1950     1     3 1950-01-03  1100        3     MO    29
##  2     2  1950     1     3 1950-01-03  1155        3     IL    17
##  3     3  1950     1     3 1950-01-03  1600        3     OH    39
##  4     4  1950     1    13 1950-01-13   525        3     AR     5
##  5     5  1950     1    25 1950-01-25  1930        3     MO    29
##  6     6  1950     1    25 1950-01-25  2100        3     IL    17
##  7     7  1950     1    26 1950-01-26  1800        3     TX    48
##  8     8  1950     2    11 1950-02-11  1310        3     TX    48
##  9     9  1950     2    11 1950-02-11  1350        3     TX    48
## 10    10  1950     2    11 1950-02-11  2100        3     TX    48
## # ... with 56,211 more rows, and 19 more variables: STATENUMBE <int>,
## #   FSCALE <int>, INJURIES <int>, FATALITIES <int>, LOSS <int>,
## #   CROPLOSS <int>, SLAT <dbl>, SLON <dbl>, ELAT <dbl>, ELON <dbl>,
## #   LENGTH <dbl>, WIDTH <int>, NS <int>, SN <int>, SG <int>, F1 <int>,
## #   F2 <int>, F3 <int>, F4 <int>

Use the functions group_by() and summarize() to determine the number of tornadoes by each state. (5)

Torn.df %>%
group_by(STATE) %>%
  summarize(STATENUMBE = max(STATENUMBE, na.rm = TRUE))

## # A tibble: 52 x 2
##     STATE STATENUMBE
##    <fctr>      <dbl>
##  1     AK          2
##  2     AL        145
##  3     AR        105
##  4     AZ         17
##  5     CA         30
##  6     CO         98
##  7     CT          8
##  8     DC          1
##  9     DE          6
## 10     FL        115
## # ... with 42 more rows

Create a new column with path length in meters. Create a new data frame by removing rows with EF damage rating below 3. Group by year and summarize the average path length. Make a graph with year on the x axis and average path length on the y axis. (10)

EFless3.df = Torn.df %>%
  mutate(Length = LENGTH * 1609) %>%
  filter(FSCALE > 3) %>%
  select(OM:SG,F3:F4) 
EFless3.df

## # A tibble: 594 x 26
##       OM  YEAR MONTH   DAY       DATE  TIME TIMEZONE  STATE  FIPS
##    <int> <int> <int> <int>     <fctr> <int>    <int> <fctr> <int>
##  1    20  1950     2    12 1950-02-12  1300        3     LA    22
##  2    59  1950     4    28 1950-04-28  1800        3     TX    48
##  3    60  1950     4    28 1950-04-28  1905        3     OK    40
##  4    79  1950     5     4 1950-05-04  2310        3     KS    20
##  5   132  1950     6     8 1950-06-08  2010        3     KS    20
##  6   150  1950     6    25 1950-06-25  2100        3     WI    55
##  7   164  1950     7    15 1950-07-15  1730        3     NE    31
##  8    66  1951     5    18 1951-05-18  1545        3     TX    48
##  9   149  1951     6    19 1951-06-19  1730        3     MN    27
## 10   171  1951     6    27 1951-06-27    10        3     KS    20
## # ... with 584 more rows, and 17 more variables: STATENUMBE <int>,
## #   FSCALE <int>, INJURIES <int>, FATALITIES <int>, LOSS <int>,
## #   CROPLOSS <int>, SLAT <dbl>, SLON <dbl>, ELAT <dbl>, ELON <dbl>,
## #   LENGTH <dbl>, WIDTH <int>, NS <int>, SN <int>, SG <int>, F3 <int>,
## #   F4 <int>

mlyear.df = EFless3.df %>%
  group_by(YEAR) %>%
  summarise(mL = mean(LENGTH))
mlyear.df

## # A tibble: 62 x 2
##     YEAR       mL
##    <int>    <dbl>
##  1  1950 23.27143
##  2  1951 18.16000
##  3  1952 16.79444
##  4  1953 27.43182
##  5  1954 38.05714
##  6  1955 36.57000
##  7  1956 42.01538
##  8  1957 28.09231
##  9  1958 22.62000
## 10  1959 18.14286
## # ... with 52 more rows

mlyear.df %>%
ggplot(aes(YEAR, mL))+
  geom_point()

c. Compute the number of tornadoes with EF damage rating 1 or higher by year and graph the results. (10)

Torn.df %>%
  filter(FSCALE >= 1)%>%
select(YEAR,FSCALE) %>%
  ggplot(aes(YEAR, FSCALE))+
  geom_bar(stat = "identity")

Create a data frame with the year in the first column and the total number of tornadoes in Kansas by year in the second column. List the first six rows of this new data frame. (5)

KSTor.df = Torn.df %>%
  select(YEAR,STATE,STATENUMBE) %>%
  group_by(STATE) %>%
  filter(STATE == "KS")
head(KSTor.df)

## # A tibble: 6 x 3
## # Groups:   STATE [1]
##    YEAR  STATE STATENUMBE
##   <int> <fctr>      <int>
## 1  1950     KS          1
## 2  1950     KS          2
## 3  1950     KS          3
## 4  1950     KS          4
## 5  1950     KS          5
## 6  1950     KS          6