# Print the current system date and time as a timestamp for this run.
date()
## [1] "Tue Oct 03 19:56:59 2017"
Due Date: September 27, 2017 Total Points: 45
1 The dataset hflights from the hflights package contains all 227,496 flights that departed Houston in 2011. Using the functions in the dplyr package
library(hflights)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Tibble version of the flights data. as_tibble() replaces the deprecated
# tbl_df() wrapper.
hflights.df <- as_tibble(hflights)

# Flights on September 11. Month and DayofMonth are integer columns, so they
# are compared against numbers rather than the strings "9" and "11" (which only
# matched through implicit coercion).
# NOTE(review): the pipeline starts from the plain `hflights` data frame, not
# from hflights.df, which is why head() prints a classic data frame below.
sep11.df <- hflights %>%
  filter(Month == 9, DayofMonth == 11)
head(sep11.df)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 9 11 7 1546 1651 AA 458
## 2 2011 9 11 7 551 904 AA 466
## 3 2011 9 11 7 1936 2036 AA 657
## 4 2011 9 11 7 1438 1544 AA 742
## 5 2011 9 11 7 1720 2030 AA 1294
## 6 2011 9 11 7 1142 1258 AA 1848
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N559AA 65 40 -14 -4 IAH DFW 224
## 2 N3EGAA 133 115 -16 -9 IAH MIA 964
## 3 N498AA 60 40 -19 -4 IAH DFW 224
## 4 N470AA 66 43 9 18 IAH DFW 224
## 5 N3BVAA 130 118 -20 -5 IAH MIA 964
## 6 N598AA 76 40 -2 -3 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 12 13 0 0
## 2 5 13 0 0
## 3 8 12 0 0
## 4 6 17 0 0
## 5 5 7 0 0
## 6 22 14 0 0
# Keep only the TaxiOut column of the September 11 flights (still a data frame).
taxiout.df <- select(sep11.df, TaxiOut)
head(taxiout.df)
## TaxiOut
## 1 13
## 2 13
## 3 12
## 4 17
## 5 7
## 6 14
# Add up every TaxiOut value for the selected flights.
sum(taxiout.df[["TaxiOut"]])
## [1] 8011
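The TaxiOut values of the flights on September 11 sum to 8011, i.e. the total taxi-out time for that day (in minutes, per the hflights documentation).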
# Tail number and TaxiOut value of every flight, sorted so the largest TaxiOut
# values come first.
tailanddepart.df <- arrange(
  select(hflights, TailNum, TaxiOut),
  desc(TaxiOut)
)
head(tailanddepart.df)
## TailNum TaxiOut
## 1 N14998 163
## 2 N27190 152
## 3 N29917 151
## 4 N14558 151
## 5 N510SW 150
## 6 N31412 146
2 Answer the following questions using the tornado data set (http://myweb.fsu.edu/jelsner/temp/data/Tornadoes.txt).
# Location of the tornado data set.
L <- "http://myweb.fsu.edu/jelsner/temp/data/Tornadoes.txt"

# Download and parse the table; the first line of the file holds column names.
Torn.df <- read.table(file = L, header = TRUE)
head(Torn.df)
## OM YEAR MONTH DAY DATE TIME TIMEZONE STATE FIPS STATENUMBE FSCALE
## 1 1 1950 1 3 1950-01-03 1100 3 MO 29 1 3
## 2 2 1950 1 3 1950-01-03 1155 3 IL 17 2 3
## 3 3 1950 1 3 1950-01-03 1600 3 OH 39 1 1
## 4 4 1950 1 13 1950-01-13 525 3 AR 5 1 3
## 5 5 1950 1 25 1950-01-25 1930 3 MO 29 2 2
## 6 6 1950 1 25 1950-01-25 2100 3 IL 17 3 2
## INJURIES FATALITIES LOSS CROPLOSS SLAT SLON ELAT ELON LENGTH WIDTH
## 1 3 0 6 0 38.77 -90.22 38.83 -90.03 9.5 150
## 2 3 0 5 0 39.10 -89.30 39.12 -89.23 3.6 130
## 3 1 0 4 0 40.88 -84.58 0.00 0.00 0.1 10
## 4 1 1 3 0 34.40 -94.37 0.00 0.00 0.6 17
## 5 5 0 5 0 37.60 -90.68 37.63 -90.65 2.3 300
## 6 0 0 5 0 41.17 -87.33 0.00 0.00 0.1 100
## NS SN SG F1 F2 F3 F4
## 1 2 0 1 0 0 0 0
## 2 1 1 1 135 0 0 0
## 3 1 1 1 161 0 0 0
## 4 1 1 1 113 0 0 0
## 5 1 1 1 93 0 0 0
## 6 1 1 1 91 0 0 0
# Convert the tornado data frame to a tibble for compact printing.
# as_tibble() replaces the deprecated tbl_df() wrapper.
Torn.df <- as_tibble(Torn.df)
Torn.df
## # A tibble: 56,221 x 28
## OM YEAR MONTH DAY DATE TIME TIMEZONE STATE FIPS
## * <int> <int> <int> <int> <fctr> <int> <int> <fctr> <int>
## 1 1 1950 1 3 1950-01-03 1100 3 MO 29
## 2 2 1950 1 3 1950-01-03 1155 3 IL 17
## 3 3 1950 1 3 1950-01-03 1600 3 OH 39
## 4 4 1950 1 13 1950-01-13 525 3 AR 5
## 5 5 1950 1 25 1950-01-25 1930 3 MO 29
## 6 6 1950 1 25 1950-01-25 2100 3 IL 17
## 7 7 1950 1 26 1950-01-26 1800 3 TX 48
## 8 8 1950 2 11 1950-02-11 1310 3 TX 48
## 9 9 1950 2 11 1950-02-11 1350 3 TX 48
## 10 10 1950 2 11 1950-02-11 2100 3 TX 48
## # ... with 56,211 more rows, and 19 more variables: STATENUMBE <int>,
## # FSCALE <int>, INJURIES <int>, FATALITIES <int>, LOSS <int>,
## # CROPLOSS <int>, SLAT <dbl>, SLON <dbl>, ELAT <dbl>, ELON <dbl>,
## # LENGTH <dbl>, WIDTH <int>, NS <int>, SN <int>, SG <int>, F1 <int>,
## # F2 <int>, F3 <int>, F4 <int>
group_by() and summarize() to determine the number of tornadoes by each state. (5)
# Count the tornado records in each state: group the rows by STATE and use n()
# to count the rows in every group.
# The previous version took max(STATENUMBE); STATENUMBE appears to be a running
# number within a state (it restarts at 1 in the rows shown), so its maximum is
# not the total number of tornadoes.
# NOTE: the printed table that follows was produced by the earlier
# max(STATENUMBE) version -- re-knit the document to refresh it.
Torn.df %>%
  group_by(STATE) %>%
  summarize(nT = n())
## # A tibble: 52 x 2
## STATE STATENUMBE
## <fctr> <dbl>
## 1 AK 2
## 2 AL 145
## 3 AR 105
## 4 AZ 17
## 5 CA 30
## 6 CO 98
## 7 CT 8
## 8 DC 1
## 9 DE 6
## 10 FL 115
## # ... with 42 more rows
# Tornado records whose FSCALE rating is above 3, keeping columns OM through SG
# plus F3 and F4 (F1 and F2 are dropped).
# The earlier mutate(Length = LENGTH * 1609) step was removed: Length was
# appended after F4 and therefore discarded by this select(), so it never
# reached the result. Add Length to select() if the converted value is needed.
# NOTE(review): the object name suggests "EF less than 3" while the filter keeps
# ratings greater than 3 -- confirm which one the assignment asks for.
EFless3.df <- Torn.df %>%
  filter(FSCALE > 3) %>%
  select(OM:SG, F3:F4)
EFless3.df
## # A tibble: 594 x 26
## OM YEAR MONTH DAY DATE TIME TIMEZONE STATE FIPS
## <int> <int> <int> <int> <fctr> <int> <int> <fctr> <int>
## 1 20 1950 2 12 1950-02-12 1300 3 LA 22
## 2 59 1950 4 28 1950-04-28 1800 3 TX 48
## 3 60 1950 4 28 1950-04-28 1905 3 OK 40
## 4 79 1950 5 4 1950-05-04 2310 3 KS 20
## 5 132 1950 6 8 1950-06-08 2010 3 KS 20
## 6 150 1950 6 25 1950-06-25 2100 3 WI 55
## 7 164 1950 7 15 1950-07-15 1730 3 NE 31
## 8 66 1951 5 18 1951-05-18 1545 3 TX 48
## 9 149 1951 6 19 1951-06-19 1730 3 MN 27
## 10 171 1951 6 27 1951-06-27 10 3 KS 20
## # ... with 584 more rows, and 17 more variables: STATENUMBE <int>,
## # FSCALE <int>, INJURIES <int>, FATALITIES <int>, LOSS <int>,
## # CROPLOSS <int>, SLAT <dbl>, SLON <dbl>, ELAT <dbl>, ELON <dbl>,
## # LENGTH <dbl>, WIDTH <int>, NS <int>, SN <int>, SG <int>, F3 <int>,
## # F4 <int>
# Mean of the LENGTH column for each YEAR of the filtered tornado records.
mlyear.df <- summarise(
  group_by(EFless3.df, YEAR),
  mL = mean(LENGTH)
)
mlyear.df
## # A tibble: 62 x 2
## YEAR mL
## <int> <dbl>
## 1 1950 23.27143
## 2 1951 18.16000
## 3 1952 16.79444
## 4 1953 27.43182
## 5 1954 38.05714
## 6 1955 36.57000
## 7 1956 42.01538
## 8 1957 28.09231
## 9 1958 22.62000
## 10 1959 18.14286
## # ... with 52 more rows
# Scatter plot of the yearly mean LENGTH (mL) against YEAR.
ggplot(mlyear.df, aes(x = YEAR, y = mL)) +
  geom_point()
c. Compute the number of tornadoes with EF damage rating 1 or higher by year and graph the results. (10)
# Number of tornado records rated 1 or higher in each year, drawn as bars.
# The rows are counted per YEAR with n() before plotting; the previous version
# mapped FSCALE itself to y with stat = "identity", which stacks the ratings and
# so plotted the yearly sum of FSCALE values instead of the number of tornadoes.
Torn.df %>%
  filter(FSCALE >= 1) %>%
  group_by(YEAR) %>%
  summarize(nT = n()) %>%
  ggplot(aes(YEAR, nT)) +
  geom_bar(stat = "identity")
# Year, state and STATENUMBE of the Kansas records. The data stay grouped by
# STATE, which is what produces the "Groups:" line in the printed tibble.
KSTor.df <- filter(
  group_by(select(Torn.df, YEAR, STATE, STATENUMBE), STATE),
  STATE == "KS"
)
head(KSTor.df)
## # A tibble: 6 x 3
## # Groups: STATE [1]
## YEAR STATE STATENUMBE
## <int> <fctr> <int>
## 1 1950 KS 1
## 2 1950 KS 2
## 3 1950 KS 3
## 4 1950 KS 4
## 5 1950 KS 5
## 6 1950 KS 6