library session

library(tidyverse)
## Warning: 패키지 'tidyverse'는 R 버전 4.3.2에서 작성되었습니다
## Warning: 패키지 'ggplot2'는 R 버전 4.3.2에서 작성되었습니다
## Warning: 패키지 'dplyr'는 R 버전 4.3.2에서 작성되었습니다
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(haven)
## Warning: 패키지 'haven'는 R 버전 4.3.2에서 작성되었습니다
library(ggplot2)
library(GGally)
## Warning: 패키지 'GGally'는 R 버전 4.3.2에서 작성되었습니다
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(epitools)
library(BSDA)
## 필요한 패키지를 로딩중입니다: lattice
## 
## 다음의 패키지를 부착합니다: 'BSDA'
## 
## The following object is masked from 'package:datasets':
## 
##     Orange
library(lme4)
## Warning: 패키지 'lme4'는 R 버전 4.3.2에서 작성되었습니다
## 필요한 패키지를 로딩중입니다: Matrix
## 
## 다음의 패키지를 부착합니다: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
library(daewr)
## Warning: 패키지 'daewr'는 R 버전 4.3.2에서 작성되었습니다
library(rsm)
## Warning: 패키지 'rsm'는 R 버전 4.3.2에서 작성되었습니다
library(openxlsx)
## Warning: 패키지 'openxlsx'는 R 버전 4.3.2에서 작성되었습니다
library(swirl)
## Warning: 패키지 'swirl'는 R 버전 4.3.2에서 작성되었습니다
## 
## | Hi! Type swirl() when you are ready to begin.
library(dplyr)

3.0. Reading Data File & 3.1. Table Screenshot of Data

# Import the raw running-back statistics from the second worksheet of the
# workbook, then preview the first rows to confirm the import.
rb_stats <- readxl::read_excel(
  path = "D:/OneDrive - 인하대학교/문서/Biostatistics/RB_stats_raw.xlsx",
  sheet = 2
)
rb_stats %>% head()
## # A tibble: 6 × 15
##   `Player Name`     `Height(ft-in)` `Height(cm)` `Weight(lb)` `Weight(kg)`   Age
##   <chr>             <chr>                  <dbl>        <dbl>        <dbl> <dbl>
## 1 Christian McCaff… 5-11                    180.          210         95.3    25
## 2 Raheem Mostert    5-10                    178.          205         93.0    29
## 3 Derrick Henry     6-3                     190.          247        112.     27
## 4 Travis Etienne    5-10                    178.          215         97.5    22
## 5 Josh Jacobs       5-10                    178.          223        101.     23
## 6 Saquon Barkely    6-0                     183.          232        105.     24
## # ℹ 9 more variables: Exp <dbl>, Pick <chr>, Games <dbl>, Att <dbl>, Yds <dbl>,
## #   YBC <dbl>, YAC <dbl>, BTK <dbl>, Year <dbl>
# Column-wise summary of the imported table.
rb_stats %>% summary()
##  Player Name        Height(ft-in)        Height(cm)      Weight(lb)   
##  Length:60          Length:60          Min.   :170.2   Min.   :200.0  
##  Class :character   Class :character   1st Qu.:177.8   1st Qu.:210.0  
##  Mode  :character   Mode  :character   Median :179.1   Median :221.5  
##                                        Mean   :180.0   Mean   :219.8  
##                                        3rd Qu.:182.9   3rd Qu.:226.2  
##                                        Max.   :190.5   Max.   :247.0  
##    Weight(kg)          Age          Exp           Pick          
##  Min.   : 90.72   Min.   :22   Min.   :1.00   Length:60         
##  1st Qu.: 95.25   1st Qu.:25   1st Qu.:4.00   Class :character  
##  Median :100.47   Median :26   Median :5.00   Mode  :character  
##  Mean   : 99.70   Mean   :26   Mean   :5.15                     
##  3rd Qu.:102.63   3rd Qu.:27   3rd Qu.:6.25                     
##  Max.   :112.04   Max.   :31   Max.   :9.00                     
##      Games            Att             Yds              YBC       
##  Min.   : 0.00   Min.   :  0.0   Min.   :   0.0   Min.   :  0.0  
##  1st Qu.:12.00   1st Qu.:151.0   1st Qu.: 599.0   1st Qu.:379.5  
##  Median :14.00   Median :202.5   Median : 855.0   Median :478.0  
##  Mean   :13.05   Mean   :191.0   Mean   : 831.3   Mean   :464.7  
##  3rd Qu.:16.25   3rd Qu.:234.0   3rd Qu.:1079.8   3rd Qu.:586.0  
##  Max.   :17.00   Max.   :349.0   Max.   :1811.0   Max.   :865.0  
##       YAC             BTK             Year     
##  Min.   :  0.0   Min.   : 0.00   Min.   :2021  
##  1st Qu.:259.5   1st Qu.: 7.00   1st Qu.:2021  
##  Median :377.0   Median :12.00   Median :2022  
##  Mean   :366.6   Mean   :12.72   Mean   :2022  
##  3rd Qu.:451.5   3rd Qu.:16.00   3rd Qu.:2023  
##  Max.   :946.0   Max.   :35.00   Max.   :2023

3.2. More Calculation with stats

# Derived per-player metrics, appended as new columns in this order.
# A zero denominator (a row with Att, YBC or Games equal to 0) produces
# NaN/Inf rather than an error, so such rows stay in `rb_stats` with
# non-finite values.
rb_stats <- rb_stats %>%
  dplyr::mutate(
    Avg = Yds / Att,    # yards per attempt
    ABR = YAC / YBC,    # ratio of YAC to YBC
    BTG = BTK / Games,  # BTK per game
    # Weight (kg) over squared height (m), i.e. the BMI formula; this analysis
    # uses it as its "Muscle Rate" (MR) proxy.
    MR = `Weight(kg)` / (`Height(cm)` / 100)^2
  )

# Keep only rows with at least 5 games played. The column is referenced
# directly (not as rb_stats$Games) so the condition is evaluated inside the
# data being filtered, and the call is namespace-qualified so that it cannot be
# shadowed by another attached package's filter(). Rows where Games is NA are
# dropped as well.
rb_stats_filtered <- dplyr::filter(rb_stats, Games >= 5)

head(rb_stats_filtered)
## # A tibble: 6 × 19
##   `Player Name`     `Height(ft-in)` `Height(cm)` `Weight(lb)` `Weight(kg)`   Age
##   <chr>             <chr>                  <dbl>        <dbl>        <dbl> <dbl>
## 1 Christian McCaff… 5-11                    180.          210         95.3    25
## 2 Derrick Henry     6-3                     190.          247        112.     27
## 3 Josh Jacobs       5-10                    178.          223        101.     23
## 4 Saquon Barkely    6-0                     183.          232        105.     24
## 5 David Montgomery  5-11                    180.          224        102.     24
## 6 Joe Mixon         6-1                     185.          220         99.8    25
## # ℹ 13 more variables: Exp <dbl>, Pick <chr>, Games <dbl>, Att <dbl>,
## #   Yds <dbl>, YBC <dbl>, YAC <dbl>, BTK <dbl>, Year <dbl>, Avg <dbl>,
## #   ABR <dbl>, BTG <dbl>, MR <dbl>

5.0. The Effect of Excluding Outliers

# 5.0. The effect of excluding outliers.
# Avg regressed on height using the unfiltered data.
summary(lm(Avg ~ `Height(cm)`, data = rb_stats))
## 
## Call:
## lm(formula = Avg ~ `Height(cm)`, data = rb_stats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3046 -0.5535 -0.1093  0.3502  5.5136 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  11.01740    5.40872   2.037   0.0464 *
## `Height(cm)` -0.03673    0.03005  -1.222   0.2267  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9703 on 56 degrees of freedom
##   (결측으로 인하여 2개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.02599,    Adjusted R-squared:  0.008597 
## F-statistic: 1.494 on 1 and 56 DF,  p-value: 0.2267
# Avg regressed on height using the filtered data (Games >= 5).
summary(lm(Avg ~ `Height(cm)`, data = rb_stats_filtered))
## 
## Call:
## lm(formula = Avg ~ `Height(cm)`, data = rb_stats_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.18741 -0.37057  0.00899  0.34022  1.25324 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   8.91284    3.26696   2.728  0.00858 **
## `Height(cm)` -0.02575    0.01815  -1.419  0.16162   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5846 on 54 degrees of freedom
## Multiple R-squared:  0.03595,    Adjusted R-squared:  0.0181 
## F-statistic: 2.014 on 1 and 54 DF,  p-value: 0.1616
# Avg regressed on weight using the unfiltered data.
summary(lm(Avg ~ `Weight(kg)`, data = rb_stats))
## 
## Call:
## lm(formula = Avg ~ `Weight(kg)`, data = rb_stats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3934 -0.5103 -0.1132  0.3202  5.3388 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   8.18726    2.24729   3.643  0.00059 ***
## `Weight(kg)` -0.03792    0.02251  -1.685  0.09765 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9591 on 56 degrees of freedom
##   (결측으로 인하여 2개의 관측치가 삭제되었습니다.)
## Multiple R-squared:  0.04823,    Adjusted R-squared:  0.03123 
## F-statistic: 2.838 on 1 and 56 DF,  p-value: 0.09765
# Avg regressed on weight using the filtered data (Games >= 5).
summary(lm(Avg ~ `Weight(kg)`, data = rb_stats_filtered))
## 
## Call:
## lm(formula = Avg ~ `Weight(kg)`, data = rb_stats_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.22337 -0.42973 -0.01373  0.30021  1.31090 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.34588    1.38955   4.567 2.91e-05 ***
## `Weight(kg)` -0.02073    0.01391  -1.490    0.142    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5835 on 54 degrees of freedom
## Multiple R-squared:  0.03951,    Adjusted R-squared:  0.02173 
## F-statistic: 2.222 on 1 and 54 DF,  p-value: 0.1419

4.1. Statistical Questions & 5. Analysis Result

# S.Q.1. Is being shorter than 180 cm an advantage for average yards?

# First, split the players into two groups at the 180 cm threshold.
rb_stats_filtered[["h_gr"]] <- ifelse(
  rb_stats_filtered[["Height(cm)"]] < 180,
  "under 180cm",
  "over 180cm"
)

# Model with the height threshold written inline as a logical term,
# adjusting for weight.
summary(lm(Avg ~ (`Height(cm)` < 180) + `Weight(kg)`, data = rb_stats_filtered))
## 
## Call:
## lm(formula = Avg ~ (`Height(cm)` < 180) + `Weight(kg)`, data = rb_stats_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26544 -0.40323  0.00254  0.31699  1.29893 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.87928    1.69979   4.047  0.00017 ***
## `Height(cm)` < 180TRUE -0.10223    0.18514  -0.552  0.58316    
## `Weight(kg)`           -0.02557    0.01651  -1.548  0.12750    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5873 on 53 degrees of freedom
## Multiple R-squared:  0.04501,    Adjusted R-squared:  0.00897 
## F-statistic: 1.249 on 2 and 53 DF,  p-value: 0.2951
# Same model, using the precomputed height-group column instead of the
# inline comparison.
summary(lm(Avg ~ h_gr + `Weight(kg)`, data = rb_stats_filtered))
## 
## Call:
## lm(formula = Avg ~ h_gr + `Weight(kg)`, data = rb_stats_filtered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.26544 -0.40323  0.00254  0.31699  1.29893 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.87928    1.69979   4.047  0.00017 ***
## h_grunder 180cm -0.10223    0.18514  -0.552  0.58316    
## `Weight(kg)`    -0.02557    0.01651  -1.548  0.12750    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5873 on 53 degrees of freedom
## Multiple R-squared:  0.04501,    Adjusted R-squared:  0.00897 
## F-statistic: 1.249 on 2 and 53 DF,  p-value: 0.2951
# 두 모형의 결과가 같다.


#S.Q.2.Weight와 average yard 사이에 관계가 있을까?
#lm(data = rb_stats_filtered, AVG ~ `Weight(kg)`) %>% anova()
#lm(data = rb_stats_filtered, AVG ~ `Weight(kg)`) %>% summary()

#S.Q.3.Height와 Weight 모두 고려했을 때 average yards와 유의미한 상관관계를 보일까?
#lm(data = rb_stats_filtered, AVG ~ `Height(cm)`+`Weight(kg)`+ Age + Exp + MSM) %>% anova()
#lm(data = rb_stats_filtered, AVG ~ `Height(cm)`+`Weight(kg)`+ Age + Exp + MSM) %>% summary()

#S.Q.4.근육비가 충돌 전에 대한 충돌 후 yard 비율에 차이를 만들어 낼까?
#lm(data = rb_stats, AVG ~ `Height(cm)` + `Weight(kg)` + `MSM` + `Age` + Exp) %>% summary()
#lm(data = rb_stats_filtered, YAC ~ `Height(cm)` * `Weight(kg)`* `MSM` + `Age` + Exp ) %>% summary()

#S.Q.5.더 많은 근육비을 가진 선수가 태클을 더 많이 부술까?
#lm(data = rb_stats, YAC ~ `Height(cm)` + `Weight(kg)` + `Age` + Exp) %>% summary()
#lm(data = rb_stats_filtered, YAC ~ `Height(cm)` + `Weight(kg)` + `Age` + Exp ) %>% summary()

#lm(data = rb_stats_filtered, BTG ~ `Height(cm)` + `Weight(kg)` + `Age` + Exp ) %>% summary()
  1. Figure
# Scatter plot of Avg against weight for the filtered data, with points
# colored by the height group (h_gr).
ggplot(
  data = rb_stats_filtered,
  mapping = aes(x = `Weight(kg)`, y = Avg, colour = h_gr)
) +
  geom_point() +
  labs(
    y = "Average Yards",
    title = "Weight - Average Yards scatter plot, colored by height"
  )

#연도에 따른 평균 야드 비교(Unfiltered)
#ggplot(data = rb_stats, aes(as.factor(rb_stats$Year), rb_stats$AVG)) + 
#  geom_boxplot() +
#  xlab("Year") +
#  ylab("Average Yards") +
#  ggtitle("Yards by Year(Unfiltered)")

#연도에 따른 평균 야드 비교(Filtered)
#ggplot(data = rb_stats_filtered, aes(as.factor(rb_stats_filtered$Year), rb_stats_filtered$AVG)) + 
#  geom_boxplot() +
#  xlab("Year") +
#  ylab("Average Yards") +
#  ggtitle("Yards by Year(Filtered)")

#키에 따른 평균 야드
#ggplot(data = rb_stats_filtered, aes(x = `Height(cm)`, y =  Avg)) + 
#  geom_point() +
#  xlab("Height(cm)") +
#  ylab("Average Yards") +
#  ggtitle("Yards by Height(Filtered)") +
#  geom_smooth()

#몸무게에 따른 평균 야드
#ggplot(data = rb_stats_filtered, aes(x = `Weight(kg)`, y =  Avg)) + 
#  geom_point() +
#  theme_classic(base_family = "serif", base_size = 15) +
#  xlab("Weight(kg)") +
#  ylab("Average Yards") +
#  ggtitle("Yards by Weight(Filtered)") +
#  geom_smooth(method = "auto")

#MR에 따른 평균 야드
#ggplot(data = rb_stats_filtered, aes(x = `MR`, y =  Avg)) + 
#  geom_point() +
#  theme_classic(base_family = "serif", base_size = 15) +
#  xlab("MR") +
#  ylab("Average Yards") +
#  ggtitle("Yards by MR(Filtered)") +
#  geom_smooth(method = "auto")

#픽 순위에 따른 평균 야드 비교
#ggplot(data = rb_stats_filtered, aes(x = Pick, y =  AVG)) + 
#  geom_point() +
#  theme_classic(base_family = "serif", base_size = 10) +
#  scale_x_continuous() +
#  xlab("Overall Pick") +
#  ylab("Average Yards") +
#  ggtitle("Yards by Pick(Filtered)") +
#  geom_smooth()

#boxplot(data = rb_stats_filtered, AVG ~ Pick)
#boxplot(data = rb_stats_filtered, AVG ~ MSM)
#plot(data = rb_stats_filtered, ABR ~ MSM)

RSM(실패)

#rsm(data = rb_stats_filtered, AVG ~ `Height(cm)` + `Weight(kg)`) %>% summary()
#rsm(data = rb_stats_filtered, Avg ~ SO(`Height(cm)`, `Weight(kg)`)) %>% summary()

교수님 code

#rb_stats_with_average <- rb_stats %>% group_by(`Player Name`) %>%

 # summarise( `Height(cm)` = mean(`Height(cm)`),
  #          avg_Att = mean(Att),
   #         avg_Yds = mean(Yds),
#            avg_YBC = mean(YBC),
 #           avg_YAC = mean(YAC),
  #          avg_BTK = mean(BTK))

#rb_stats$avg_y <- rb_stats$Yds/rb_stats$Att 

#5.1. Test 1
#lm(rb_stats$avg_y ~ rb_stats$`Height(cm)`) %>% summary
#lm(rb_stats$avg_y ~ rb_stats$`Height(cm)`)
#avg_y_fit <- lm(rb_stats$avg_y ~ rb_stats$`Height(cm)` + rb_stats$`Weight(kg)` + rb_stats$`Height(cm)`*rb_stats$`Weight(kg)`)
#plot(rb_stats$`Height(cm)`, rb_stats$avg_y, data = rb_stats) +
 # abline(avg_y_fit, col = "red")
#5.2. Test 2

#5.3. Test 3

#5.4. Test 4

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Template example chunk: summarise the built-in cars data set.
summary(datasets::cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.