library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Lahman)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.1 ✔ forcats 0.5.2
## ✔ readr 2.1.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(datasets)
library(gapminder)
library(nycflights13)
library(gt)
library(gtExtras)
#1 I believe the longer a flight is delayed, the more likely it is to be cancelled. According to the graph, this assumption is held to be true. As the average delayed flights in a day increased, the proportion of cancelled flights within that day also increased. The two outliers could be explained due to bad weather in the winter months, since the graph shows us it was taken during the winter.
cancel_delayed <- flights %>%
mutate(cancel = ifelse(is.na(dep_delay) & is.na(arr_delay),1,0)) %>%
group_by(month,day) %>%
summarize(
count = n(),
avg_delay = mean(arr_delay[arr_delay >0], na.rm=TRUE),
prop_cancel = (sum(cancel)/count)
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.
cancel_delayed
## # A tibble: 365 × 5
## # Groups: month [12]
## month day count avg_delay prop_cancel
## <int> <int> <int> <dbl> <dbl>
## 1 1 1 842 32.5 0.00475
## 2 1 2 943 32.0 0.00848
## 3 1 3 914 27.7 0.0109
## 4 1 4 915 28.3 0.00656
## 5 1 5 720 22.6 0.00417
## 6 1 6 832 24.4 0.00120
## 7 1 7 933 27.8 0.00322
## 8 1 8 899 20.8 0.00445
## 9 1 9 902 25.6 0.00554
## 10 1 10 932 27.3 0.00322
## # … with 355 more rows
ggplot(data = cancel_delayed) +
geom_point(aes(x = avg_delay, y = prop_cancel, color = month))
#2A The hour of the day with the longest average depature delay is at the 19th hour, or 7 pm.
group_by(flights, hour)%>%
summarise(
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
) %>%
arrange(desc(avg_dep_delay))
## # A tibble: 20 × 2
## hour avg_dep_delay
## <dbl> <dbl>
## 1 19 24.8
## 2 20 24.3
## 3 21 24.2
## 4 18 21.1
## 5 17 21.1
## 6 22 18.8
## 7 16 18.8
## 8 15 16.9
## 9 23 14.0
## 10 14 13.8
## 11 13 11.4
## 12 12 8.61
## 13 11 7.19
## 14 10 6.50
## 15 9 4.58
## 16 8 4.13
## 17 7 1.91
## 18 6 1.64
## 19 5 0.688
## 20 1 NaN
#2B You are most likely to find a flight that leaves or on time at the first hour, or one am.
results <- flights %>%
mutate(early_ot = ifelse(!is.na(dep_time) & !is.na(arr_time), 0, 1))%>%
group_by(hour)%>%
summarise(
count = n(),
number_of_eot = sum(early_ot),
percentage_eot = (number_of_eot/count)
)%>%
arrange(desc(percentage_eot))
results
## # A tibble: 20 × 4
## hour count number_of_eot percentage_eot
## <dbl> <int> <dbl> <dbl>
## 1 1 1 1 1
## 2 19 21441 898 0.0419
## 3 20 16739 656 0.0392
## 4 16 23002 891 0.0387
## 5 21 10933 418 0.0382
## 6 22 2639 81 0.0307
## 7 15 23888 733 0.0307
## 8 18 21783 667 0.0306
## 9 17 24426 691 0.0283
## 10 14 21706 611 0.0281
## 11 13 19956 456 0.0229
## 12 12 18181 404 0.0222
## 13 11 16033 314 0.0196
## 14 10 16708 303 0.0181
## 15 6 25951 454 0.0175
## 16 8 27242 465 0.0171
## 17 9 20312 343 0.0169
## 18 7 22821 305 0.0134
## 19 23 1061 13 0.0123
## 20 5 1953 9 0.00461
#3 The ones the “worst” batting average are those who have a total of
0 hits, therefore causing their average to be zero as well. However, due
to the lack of chances at bat, this cannot be a true representation of
the players skills. Similar to the ones with the batting average of
100%, because they hit one while having only one at bat. Therefore these
findings cannot be taken literally due to a lack of accuracy.
# Worst player is “aardsda01” in 2006 and the best player is “abernte02”
in 1960.
library(Lahman)
batting <- as_tibble(Lahman::Batting)
## compute batting averagre by player
## arrange
## find best/worst using head() and tail()
findings_2 <- Batting %>%
group_by(playerID, yearID) %>%
summarise(
total_hits = sum(H),
total_bats = sum(AB),
BA = (sum(H)/sum(AB))
)%>%
arrange((BA))
## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.
findings_2
## # A tibble: 102,198 × 5
## # Groups: playerID [20,166]
## playerID yearID total_hits total_bats BA
## <chr> <int> <int> <int> <dbl>
## 1 aardsda01 2006 0 2 0
## 2 aardsda01 2008 0 1 0
## 3 aardsda01 2015 0 1 0
## 4 aasedo01 1989 0 5 0
## 5 abadan01 2001 0 1 0
## 6 abadan01 2006 0 3 0
## 7 abadfe01 2010 0 1 0
## 8 abadfe01 2016 0 1 0
## 9 aberal01 1950 0 2 0
## 10 abercda01 1871 0 4 0
## # … with 102,188 more rows
findings_3 <- Batting%>%
group_by(playerID, yearID) %>%
summarise(
total_hits = sum(H),
total_bats = sum(AB),
BA = (sum(H)/sum(AB))
)%>%
arrange(desc(BA))
## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.
findings_3
## # A tibble: 102,198 × 5
## # Groups: playerID [20,166]
## playerID yearID total_hits total_bats BA
## <chr> <int> <int> <int> <dbl>
## 1 abernte02 1960 1 1 1
## 2 abramge01 1923 1 1 1
## 3 acklefr01 1964 1 1 1
## 4 alanirj01 2019 1 1 1
## 5 alberan01 2017 1 1 1
## 6 alberma01 2016 1 1 1
## 7 allarko01 2018 1 1 1
## 8 allismi01 1914 1 1 1
## 9 altroni01 1924 1 1 1
## 10 altroni01 1929 1 1 1
## # … with 102,188 more rows
#3B The worst player, based on those with at least 100 at bats, is jonesra01 of 1976 with a batting average of 5.83 percent
batting <- as_tibble(Lahman::Batting)
findings_4 <- Batting%>%
filter(AB > 99)%>%
group_by(playerID, yearID) %>%
summarise(
total_hits = sum(H),
total_bats = sum(AB),
BA = (sum(H)/sum(AB))
)%>%
arrange((BA))
## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.
findings_4
## # A tibble: 39,450 × 5
## # Groups: playerID [7,158]
## playerID yearID total_hits total_bats BA
## <chr> <int> <int> <int> <dbl>
## 1 jonesra01 1976 6 103 0.0583
## 2 koufasa01 1963 7 110 0.0636
## 3 wilheka01 1904 7 100 0.07
## 4 garvine01 1903 8 106 0.0755
## 5 koufasa01 1966 9 118 0.0763
## 6 keefeti01 1892 10 117 0.0855
## 7 grovele01 1933 9 105 0.0857
## 8 leonadu01 1917 9 104 0.0865
## 9 viaule01 1888 13 149 0.0872
## 10 frommar01 1912 9 103 0.0874
## # … with 39,440 more rows
#3B The best player, based on those with at least 100 at bats, is meyerle01 of 1871 with a batting average of 49.2 percent.
batting <- as_tibble(Lahman::Batting)
findings_5 <- Batting%>%
filter(AB > 99)%>%
group_by(playerID, yearID) %>%
summarise(
total_hits = sum(H),
total_bats = sum(AB),
BA = (sum(H)/sum(AB))
)%>%
arrange(desc(BA))
## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.
findings_5
## # A tibble: 39,450 × 5
## # Groups: playerID [7,158]
## playerID yearID total_hits total_bats BA
## <chr> <int> <int> <int> <dbl>
## 1 meyerle01 1871 64 130 0.492
## 2 duffyhu01 1894 237 539 0.440
## 3 oneilti01 1887 225 517 0.435
## 4 mcveyca01 1871 66 153 0.431
## 5 barnero01 1873 138 320 0.431
## 6 barnero01 1872 99 230 0.430
## 7 barnero01 1876 138 322 0.429
## 8 lajoina01 1901 232 544 0.426
## 9 keelewi01 1897 239 564 0.424
## 10 hornsro01 1924 227 536 0.424
## # … with 39,440 more rows
#4 Compared to the beginning, the batting averages seem to overall decrease as time progresses. This can be seen where the max batting average per year decreases over time while generating a wider gap between the average batting average as well.
batting <- as_tibble(Lahman::Batting)
findings_6 <- Batting%>%
filter(AB > 149)%>%
group_by(yearID) %>%
summarise(
avg_BA = mean(H/AB),
max_BA = max(H/AB)
)
findings_6
## # A tibble: 151 × 3
## yearID avg_BA max_BA
## <int> <dbl> <dbl>
## 1 1871 0.297 0.431
## 2 1872 0.294 0.430
## 3 1873 0.300 0.431
## 4 1874 0.279 0.394
## 5 1875 0.263 0.367
## 6 1876 0.268 0.429
## 7 1877 0.275 0.387
## 8 1878 0.258 0.358
## 9 1879 0.253 0.357
## 10 1880 0.249 0.360
## # … with 141 more rows
ggplot(findings_6, x = yearID) + geom_line(aes(x = yearID, y = avg_BA, color = "red")) + geom_line(aes(x=yearID,y = max_BA))
#5 It seems as though this finding is true until about 1980, then standard deviation for batting averages begins to increase
batting <- as_tibble(Lahman::Batting)
findings_7 <- batting %>%
filter(AB > 149)%>%
group_by(yearID)%>%
summarise(
standard_deviation = sd(H/AB)
)%>%
arrange(desc(standard_deviation))
findings_7
## # A tibble: 151 × 2
## yearID standard_deviation
## <int> <dbl>
## 1 1871 0.0652
## 2 1878 0.0521
## 3 1876 0.0506
## 4 1877 0.0500
## 5 1872 0.0498
## 6 1873 0.0498
## 7 1884 0.0480
## 8 1879 0.0475
## 9 1885 0.0473
## 10 1886 0.0468
## # … with 141 more rows
ggplot(data = findings_7, aes(x = yearID, y = standard_deviation)) + geom_point() + geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Final project: I believe that I would collect the total number of people who walk through Locust between the hours of 9:00 am and 5:00 pm for my x variable. This is because the number of people that walk through Locust Walk is an independent variable, meaning the number of people is not affected by the y variable. The y, dependent, variable would be the number of people that walked through Locust Walk between the hours of 9:00 am and 5:00 pm that registered to vote. I choose Locust Walk specifically because this is where most students walk to get around campus. It is considered the midpoint of the campus, therefore the population pool would be more diverse. The diversity allows the results to be more accurate because the population is more representative of the group being studied which is college students at UPenn. Lastly, the hours were chosen purposefully because that is the typical workday hours, therefore at some point in time students will be walking through Locust to get to their classes, etc. It is the perfect sweet spot to collect data.