Homework #2 Data Wrangling

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(Lahman)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ ggplot2 3.3.6     ✔ purrr   0.3.5
## ✔ tibble  3.1.8     ✔ stringr 1.4.1
## ✔ tidyr   1.2.1     ✔ forcats 0.5.2
## ✔ readr   2.1.3     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(datasets)
library(gapminder)
library(nycflights13)
library(gt) 
library(gtExtras)

#1 I believe the longer a flight is delayed, the more likely it is to be cancelled. According to the graph, this assumption is held to be true. As the average delayed flights in a day increased, the proportion of cancelled flights within that day also increased. The two outliers could be explained due to bad weather in the winter months, since the graph shows us it was taken during the winter.

cancel_delayed <- flights %>%
mutate(cancel = ifelse(is.na(dep_delay) & is.na(arr_delay),1,0)) %>%
  group_by(month,day) %>%
  summarize(
    count = n(),
    avg_delay = mean(arr_delay[arr_delay >0], na.rm=TRUE),
    prop_cancel = (sum(cancel)/count)
    )

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

cancel_delayed

## # A tibble: 365 × 5
## # Groups:   month [12]
##    month   day count avg_delay prop_cancel
##    <int> <int> <int>     <dbl>       <dbl>
##  1     1     1   842      32.5     0.00475
##  2     1     2   943      32.0     0.00848
##  3     1     3   914      27.7     0.0109 
##  4     1     4   915      28.3     0.00656
##  5     1     5   720      22.6     0.00417
##  6     1     6   832      24.4     0.00120
##  7     1     7   933      27.8     0.00322
##  8     1     8   899      20.8     0.00445
##  9     1     9   902      25.6     0.00554
## 10     1    10   932      27.3     0.00322
## # … with 355 more rows

ggplot(data = cancel_delayed) +
    geom_point(aes(x = avg_delay, y = prop_cancel, color = month))

#2A The hour of the day with the longest average depature delay is at the 19th hour, or 7 pm.

group_by(flights, hour)%>%
summarise( 
avg_dep_delay = mean(dep_delay, na.rm = TRUE),
) %>% 
arrange(desc(avg_dep_delay))

## # A tibble: 20 × 2
##     hour avg_dep_delay
##    <dbl>         <dbl>
##  1    19        24.8  
##  2    20        24.3  
##  3    21        24.2  
##  4    18        21.1  
##  5    17        21.1  
##  6    22        18.8  
##  7    16        18.8  
##  8    15        16.9  
##  9    23        14.0  
## 10    14        13.8  
## 11    13        11.4  
## 12    12         8.61 
## 13    11         7.19 
## 14    10         6.50 
## 15     9         4.58 
## 16     8         4.13 
## 17     7         1.91 
## 18     6         1.64 
## 19     5         0.688
## 20     1       NaN

#2B You are most likely to find a flight that leaves or on time at the first hour, or one am.

results <- flights %>%
mutate(early_ot = ifelse(!is.na(dep_time) & !is.na(arr_time), 0, 1))%>%
group_by(hour)%>%
summarise( 
count = n(),
number_of_eot = sum(early_ot),
percentage_eot = (number_of_eot/count)
)%>%
arrange(desc(percentage_eot))
results

## # A tibble: 20 × 4
##     hour count number_of_eot percentage_eot
##    <dbl> <int>         <dbl>          <dbl>
##  1     1     1             1        1      
##  2    19 21441           898        0.0419 
##  3    20 16739           656        0.0392 
##  4    16 23002           891        0.0387 
##  5    21 10933           418        0.0382 
##  6    22  2639            81        0.0307 
##  7    15 23888           733        0.0307 
##  8    18 21783           667        0.0306 
##  9    17 24426           691        0.0283 
## 10    14 21706           611        0.0281 
## 11    13 19956           456        0.0229 
## 12    12 18181           404        0.0222 
## 13    11 16033           314        0.0196 
## 14    10 16708           303        0.0181 
## 15     6 25951           454        0.0175 
## 16     8 27242           465        0.0171 
## 17     9 20312           343        0.0169 
## 18     7 22821           305        0.0134 
## 19    23  1061            13        0.0123 
## 20     5  1953             9        0.00461

#3 The ones the “worst” batting average are those who have a total of 0 hits, therefore causing their average to be zero as well. However, due to the lack of chances at bat, this cannot be a true representation of the players skills. Similar to the ones with the batting average of 100%, because they hit one while having only one at bat. Therefore these findings cannot be taken literally due to a lack of accuracy.
# Worst player is “aardsda01” in 2006 and the best player is “abernte02” in 1960.

library(Lahman)
batting  <- as_tibble(Lahman::Batting)

## compute batting averagre by player 
## arrange 
## find best/worst using head() and tail()

findings_2 <- Batting %>%
group_by(playerID, yearID) %>% 
  summarise(
    total_hits = sum(H),
    total_bats = sum(AB),
    BA = (sum(H)/sum(AB))

)%>%
arrange((BA))

## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.

findings_2

## # A tibble: 102,198 × 5
## # Groups:   playerID [20,166]
##    playerID  yearID total_hits total_bats    BA
##    <chr>      <int>      <int>      <int> <dbl>
##  1 aardsda01   2006          0          2     0
##  2 aardsda01   2008          0          1     0
##  3 aardsda01   2015          0          1     0
##  4 aasedo01    1989          0          5     0
##  5 abadan01    2001          0          1     0
##  6 abadan01    2006          0          3     0
##  7 abadfe01    2010          0          1     0
##  8 abadfe01    2016          0          1     0
##  9 aberal01    1950          0          2     0
## 10 abercda01   1871          0          4     0
## # … with 102,188 more rows

findings_3 <- Batting%>%
group_by(playerID, yearID) %>% 
  summarise(
    total_hits = sum(H),
    total_bats = sum(AB),
    BA = (sum(H)/sum(AB))

)%>%
arrange(desc(BA))

## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.

findings_3

## # A tibble: 102,198 × 5
## # Groups:   playerID [20,166]
##    playerID  yearID total_hits total_bats    BA
##    <chr>      <int>      <int>      <int> <dbl>
##  1 abernte02   1960          1          1     1
##  2 abramge01   1923          1          1     1
##  3 acklefr01   1964          1          1     1
##  4 alanirj01   2019          1          1     1
##  5 alberan01   2017          1          1     1
##  6 alberma01   2016          1          1     1
##  7 allarko01   2018          1          1     1
##  8 allismi01   1914          1          1     1
##  9 altroni01   1924          1          1     1
## 10 altroni01   1929          1          1     1
## # … with 102,188 more rows

#3B The worst player, based on those with at least 100 at bats, is jonesra01 of 1976 with a batting average of 5.83 percent

batting <- as_tibble(Lahman::Batting)
findings_4 <- Batting%>%
filter(AB > 99)%>%
group_by(playerID, yearID) %>% 
  summarise(
    total_hits = sum(H),
    total_bats = sum(AB),
    BA = (sum(H)/sum(AB))

)%>%
arrange((BA))

## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.

findings_4

## # A tibble: 39,450 × 5
## # Groups:   playerID [7,158]
##    playerID  yearID total_hits total_bats     BA
##    <chr>      <int>      <int>      <int>  <dbl>
##  1 jonesra01   1976          6        103 0.0583
##  2 koufasa01   1963          7        110 0.0636
##  3 wilheka01   1904          7        100 0.07  
##  4 garvine01   1903          8        106 0.0755
##  5 koufasa01   1966          9        118 0.0763
##  6 keefeti01   1892         10        117 0.0855
##  7 grovele01   1933          9        105 0.0857
##  8 leonadu01   1917          9        104 0.0865
##  9 viaule01    1888         13        149 0.0872
## 10 frommar01   1912          9        103 0.0874
## # … with 39,440 more rows

#3B The best player, based on those with at least 100 at bats, is meyerle01 of 1871 with a batting average of 49.2 percent.

batting <- as_tibble(Lahman::Batting)
findings_5 <- Batting%>%
filter(AB > 99)%>%
group_by(playerID, yearID) %>% 
  summarise(
    total_hits = sum(H),
    total_bats = sum(AB),
    BA = (sum(H)/sum(AB))

)%>%
arrange(desc(BA))

## `summarise()` has grouped output by 'playerID'. You can override using the
## `.groups` argument.

findings_5

## # A tibble: 39,450 × 5
## # Groups:   playerID [7,158]
##    playerID  yearID total_hits total_bats    BA
##    <chr>      <int>      <int>      <int> <dbl>
##  1 meyerle01   1871         64        130 0.492
##  2 duffyhu01   1894        237        539 0.440
##  3 oneilti01   1887        225        517 0.435
##  4 mcveyca01   1871         66        153 0.431
##  5 barnero01   1873        138        320 0.431
##  6 barnero01   1872         99        230 0.430
##  7 barnero01   1876        138        322 0.429
##  8 lajoina01   1901        232        544 0.426
##  9 keelewi01   1897        239        564 0.424
## 10 hornsro01   1924        227        536 0.424
## # … with 39,440 more rows

#4 Compared to the beginning, the batting averages seem to overall decrease as time progresses. This can be seen where the max batting average per year decreases over time while generating a wider gap between the average batting average as well.

batting <- as_tibble(Lahman::Batting)
findings_6 <- Batting%>%
filter(AB > 149)%>%
group_by(yearID) %>% 
  summarise(
    avg_BA = mean(H/AB),
    max_BA = max(H/AB)
)
findings_6

## # A tibble: 151 × 3
##    yearID avg_BA max_BA
##     <int>  <dbl>  <dbl>
##  1   1871  0.297  0.431
##  2   1872  0.294  0.430
##  3   1873  0.300  0.431
##  4   1874  0.279  0.394
##  5   1875  0.263  0.367
##  6   1876  0.268  0.429
##  7   1877  0.275  0.387
##  8   1878  0.258  0.358
##  9   1879  0.253  0.357
## 10   1880  0.249  0.360
## # … with 141 more rows

ggplot(findings_6, x = yearID) + geom_line(aes(x = yearID, y = avg_BA, color = "red")) + geom_line(aes(x=yearID,y = max_BA))

#5 It seems as though this finding is true until about 1980, then standard deviation for batting averages begins to increase

batting <- as_tibble(Lahman::Batting)
findings_7 <- batting %>%
filter(AB > 149)%>%
group_by(yearID)%>%
summarise(
standard_deviation = sd(H/AB)
)%>%
arrange(desc(standard_deviation))
findings_7

## # A tibble: 151 × 2
##    yearID standard_deviation
##     <int>              <dbl>
##  1   1871             0.0652
##  2   1878             0.0521
##  3   1876             0.0506
##  4   1877             0.0500
##  5   1872             0.0498
##  6   1873             0.0498
##  7   1884             0.0480
##  8   1879             0.0475
##  9   1885             0.0473
## 10   1886             0.0468
## # … with 141 more rows

ggplot(data = findings_7, aes(x = yearID, y = standard_deviation)) + geom_point() + geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Final project: I believe that I would collect the total number of people who walk through Locust between the hours of 9:00 am and 5:00 pm for my x variable. This is because the number of people that walk through Locust Walk is an independent variable, meaning the number of people is not affected by the y variable. The y, dependent, variable would be the number of people that walked through Locust Walk between the hours of 9:00 am and 5:00 pm that registered to vote. I choose Locust Walk specifically because this is where most students walk to get around campus. It is considered the midpoint of the campus, therefore the population pool would be more diverse. The diversity allows the results to be more accurate because the population is more representative of the group being studied which is college students at UPenn. Lastly, the hours were chosen purposefully because that is the typical workday hours, therefore at some point in time students will be walking through Locust to get to their classes, etc. It is the perfect sweet spot to collect data.

Homework #2 Data Wrangling

85015583

2022-10-09