library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)
library(effsize)
library(pwrss)
##
## Attaching package: 'pwrss'
##
## The following object is masked from 'package:stats':
##
## power.t.test
library(dplyr)
library(rmarkdown)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
# Load the team-season table from CSV into a base data.frame.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the analysis can be re-run elsewhere.
lahman_data <- read.csv("/Users/anuragreddy/Desktop/Statistics with R/Lahmans Databse .csv")
# Franchise, division and runs for every row whose yearID is not 2020.
# The result stays grouped by divID, so printing it reports the groups.
Div_df <- lahman_data |>
  group_by(divID) |>
  filter(yearID != 2020) |>
  select(franchID, divID, R)

# Show the first rows of the grouped tibble.
Div_df
## # A tibble: 630 × 3
## # Groups: divID [3]
## franchID divID R
## <chr> <chr> <int>
## 1 ANA W 864
## 2 BAL E 794
## 3 BOS E 792
## 4 CHW C 978
## 5 CLE C 950
## 6 DET C 823
## 7 KCR C 879
## 8 MIN C 748
## 9 NYY E 871
## 10 OAK W 947
## # ℹ 620 more rows
# Box plots of runs scored (x axis), one box per division via `fill`.
Div_df |>
  ggplot(aes(x = R, fill = divID)) +
  geom_boxplot() +
  labs(title = "Runs Distribution Division-wise", x = "Runs Scored") +
  theme_economist() +
  # The boxes are coloured through the `fill` aesthetic, so the Economist
  # palette must be applied with the fill scale; a colour scale has no
  # mapped aesthetic to act on in this plot.
  scale_fill_economist()
# Mean runs scored per division.
# summarise() already returns exactly divID and Avg_Runs_Scored, so no
# further column selection is needed.
Div_df |>
  group_by(divID) |>
  summarise(Avg_Runs_Scored = mean(R))
## # A tibble: 3 × 2
## divID Avg_Runs_Scored
## <chr> <dbl>
## 1 C 731.
## 2 E 746.
## 3 W 741.
# One-way ANOVA of runs scored on division; the summary prints the F table.
Div_Anova <- aov(formula = R ~ divID, data = Div_df)
summary(object = Div_Anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## divID 2 24041 12021 1.75 0.175
## Residuals 627 4307245 6870
# Pairwise t tests of runs between divisions with Bonferroni-adjusted
# p-values.
pairwise.t.test(
  x = Div_df$R,
  g = Div_df$divID,
  p.adjust.method = "bonferroni"
)
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Div_df$R and Div_df$divID
##
## C E
## E 0.20 -
## W 0.75 1.00
##
## P value adjustment method: bonferroni
# Runs and batting average (hits divided by at-bats) for every row whose
# yearID is not 2020. Only R and the derived BA column are kept.
Runs_BA <- lahman_data |>
  filter(yearID != 2020) |>
  select(R, H, AB) |>
  mutate(BA = H / AB, .keep = "unused")

# Preview the first six rows.
head(Runs_BA)
## R BA
## 1 864 0.2796731
## 2 794 0.2717607
## 3 792 0.2669627
## 4 978 0.2860432
## 5 950 0.2884040
## 6 823 0.2751595
# Scatter plot of runs scored against batting average with a fitted
# least-squares line (no confidence band).
Runs_BA |>
  ggplot(aes(x = BA, y = R)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "darkblue") +
  # Only title and axis labels are set: no colour aesthetic is mapped in
  # this plot, so a colour label would refer to a legend that never exists.
  labs(
    title = "Runs Scored vs Batting Average",
    x = "Batting Average",
    y = "Runs Scored"
  ) +
  theme_solarized()
## `geom_smooth()` using formula = 'y ~ x'
ASSUMPTION 1: VARIABLE \(x\) IS LINEARLY RELATED TO RESPONSE \(y\).
ASSUMPTION 2: ERRORS HAVE CONSTANT VARIANCE ACROSS ALL PREDICTED VALUES.
ASSUMPTION 3: OBSERVATIONS ARE INDEPENDENT AND UNCORRELATED.
# Simple linear regression of runs scored on batting average.
linear_model <- lm(formula = R ~ BA, data = Runs_BA)

# Print the call and the fitted coefficients.
linear_model
##
## Call:
## lm(formula = R ~ BA, data = Runs_BA)
##
## Coefficients:
## (Intercept) BA
## -477.9 4708.9
Interpretation: