library(ggplot2)
# Load both project CSV files straight from GitHub. Each URL embeds a commit
# hash, so the files read here do not change when the repository is updated.
df <- read.csv(
  file = "https://github.com/mianshariq/SPS/raw/dab24b98c3c1d48b96ea619c01caacfefa916386/Data%20606/Projects/NFL%20Data.csv"
)
df1 <- read.csv(
  file = "https://github.com/mianshariq/SPS/raw/4fe676d9723fea08abb22f2021d644177dc16698/Data%20606/Projects/NFL%20Data%20Min.csv"
)
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
Why you should or shouldn't draft a running back in the first round of the NFL Draft.
What are the cases, and how many are there? The data cover 5 years of 7-round drafts, for a total of 277 cases.
Describe the method of data collection.
Data was extracted from the Pro Football Focus (PFF) website and DraftHistory.com and saved to a CSV file.
What type of study is this (observational/experiment)?
Observational
If you collected the data, state self-collected. If not, provide a citation/link.
What is the response variable? Is it quantitative or qualitative?
Yards per game, a quantitative variable.
You should have two independent variables, one quantitative and one qualitative.
Draft order (quantitative) and round (qualitative).
Provide summary statistics for each the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
# Print per-column summary statistics for the full data set. In the recorded
# output below, Draft_Order, RushYard_Per_Game, GP, Total_Yrd_pg and TD are
# reported as character columns, so no numeric summary is shown for them.
summary(df)
## Year Name College POS
## Min. :2015 Length:277 Length:277 Length:277
## 1st Qu.:2016 Class :character Class :character Class :character
## Median :2017 Mode :character Mode :character Mode :character
## Mean :2017
## 3rd Qu.:2019
## Max. :2020
##
## Height_in Weight_lbs Hand_Size_in Arm_Length_in
## Min. :65.75 Min. :170.0 Min. : 8.250 Min. :27.38
## 1st Qu.:69.13 1st Qu.:203.0 1st Qu.: 8.880 1st Qu.:30.00
## Median :70.38 Median :213.0 Median : 9.250 Median :31.00
## Mean :70.43 Mean :212.1 Mean : 9.237 Mean :30.88
## 3rd Qu.:71.75 3rd Qu.:222.0 3rd Qu.: 9.500 3rd Qu.:31.63
## Max. :75.00 Max. :259.0 Max. :10.500 Max. :33.75
## NA's :1 NA's :1
## X40_Yard_sec Bench_Press Vert_Leap_In Broad_Jump_in
## Min. :4.280 Min. : 5.00 Min. :27.00 Min. :106.0
## 1st Qu.:4.490 1st Qu.:16.00 1st Qu.:32.00 1st Qu.:116.0
## Median :4.560 Median :19.00 Median :34.50 Median :120.0
## Mean :4.558 Mean :18.79 Mean :34.55 Mean :119.7
## 3rd Qu.:4.630 3rd Qu.:22.00 3rd Qu.:36.50 3rd Qu.:123.0
## Max. :4.850 Max. :34.00 Max. :42.50 Max. :135.0
## NA's :14 NA's :28 NA's :16 NA's :25
## Shuttle_Shuttle X3Cone Team Round
## Min. :3.900 Min. :6.570 Length:277 Length:277
## 1st Qu.:4.225 1st Qu.:6.980 Class :character Class :character
## Median :4.320 Median :7.110 Mode :character Mode :character
## Mean :4.326 Mean :7.101
## 3rd Qu.:4.420 3rd Qu.:7.220
## Max. :4.630 Max. :7.680
## NA's :46 NA's :58
## Draft_Order RushYard_Per_Game GP Total_Yrd_pg
## Length:277 Length:277 Length:277 Length:277
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## TD
## Length:277
## Class :character
## Mode :character
##
##
##
##
# Columns that read.csv() imported as character but that the analysis needs as
# numbers.
cols.num <- c("Draft_Order", "RushYard_Per_Game", "GP", "TD", "Total_Yrd_pg")

# Convert each listed column to numeric in place. lapply() always returns a
# list with one element per column, which is exactly the shape the
# `df[cols] <-` assignment expects; sapply() would instead simplify to a
# matrix whose shape depends on the input. Entries that cannot be parsed as
# numbers become NA and raise an "NAs introduced by coercion" warning.
df[cols.num] <- lapply(df[cols.num], as.numeric)
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
## Warning in lapply(X = X, FUN = FUN, ...): NAs introduced by coercion
# Print the primary class of every column as a named character vector, to
# confirm the numeric conversion above. vapply() enforces exactly one string
# per column; sapply(df, class) would fall back to a list if any column
# carried more than one class.
vapply(df, function(column) class(column)[1L], character(1))
## Year Name College POS
## "integer" "character" "character" "character"
## Height_in Weight_lbs Hand_Size_in Arm_Length_in
## "numeric" "integer" "numeric" "numeric"
## X40_Yard_sec Bench_Press Vert_Leap_In Broad_Jump_in
## "numeric" "integer" "numeric" "integer"
## Shuttle_Shuttle X3Cone Team Round
## "numeric" "numeric" "character" "character"
## Draft_Order RushYard_Per_Game GP Total_Yrd_pg
## "numeric" "numeric" "numeric" "numeric"
## TD
## "numeric"
# Box plot of total yards per game for each draft round, overlaid with a
# linear trend across rounds.
#
# `Round` is a character column, so ggplot2 treats it as discrete and places
# each round in its own group. Without an explicit group the lm smoother is
# fitted per round, sees a single x value in each fit, and draws nothing.
# Mapping group = 1 on the smoother only fits one line across all rounds while
# leaving the per-round box plots untouched.
chart <- ggplot(data = df, aes(x = Round, y = Total_Yrd_pg)) +
  geom_boxplot(color = "#69b3a2") +
  geom_smooth(aes(group = 1), method = "lm")

# Render the plot.
chart
## Warning: Removed 130 rows containing non-finite values (stat_boxplot).
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 130 rows containing non-finite values (stat_smooth).