graphics.off()# Clear all graphs# Set working directory and path to datasetwd("/Users/arvindsharma/Dropbox/WCAS/Econometrics/")
Load packages
Now, we load the packages.
# Prepare needed librariespackages<-c("psych", # quick summary stats for data exploration,"stargazer", # summary stats for sharing,"visdat", # for visualisation of missing data"tidyverse", # data manipulation like selecting variables,"corrplot", # correlation plots"ggplot2", # graphing"data.table", # reshape for graphing "car"# vif for multicollinearity)for(iin1:length(packages)){if(!packages[i]%in%rownames(installed.packages())){install.packages(packages[i] , repos ="http://cran.rstudio.com/" , dependencies =TRUE)}library(packages[i], character.only =TRUE)}
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.4 ✔ readr 2.1.5
✔ forcats 1.0.0 ✔ stringr 1.5.1
✔ ggplot2 3.5.2 ✔ tibble 3.3.0
✔ lubridate 1.9.4 ✔ tidyr 1.3.1
✔ purrr 1.1.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ ggplot2::%+%() masks psych::%+%()
✖ ggplot2::alpha() masks psych::alpha()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
corrplot 0.95 loaded
Attaching package: 'data.table'
The following objects are masked from 'package:lubridate':
hour, isoweek, mday, minute, month, quarter, second, wday, week,
yday, year
The following objects are masked from 'package:dplyr':
between, first, last
The following object is masked from 'package:purrr':
transpose
Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
The following object is masked from 'package:purrr':
some
The following object is masked from 'package:psych':
logit
# Makes sure dplyr::filter and dplyr::select will be usedconflicted::conflict_prefer(name ="select", winner ="dplyr")
[conflicted] Will prefer dplyr::select over any other package.
df_train<-read.csv("moneyball-training-data.csv")df_eval<-read.csv("moneyball-evaluation-data.csv")# describe(df_train)# describe(df_eval)?print# years "Total Years:"print(paste("Games Played:", games=(2006-1871+1)*162, "in years", years =2006-1871+1))
[1] "Games Played: 22032 in years 136"
Instead of using the describe command to summarize the data for the reader, you should use stargazer package to create professional publication looking tables. For the initial data exploration you can use commands like glimpse, str, head, tail, et cetra.
Data Analysis
Knowing a bit about the following concepts/packages will be helpful.
Indexing in base R to keep the relevant variables.
Piping i.e. %>% to clearly expressing a sequence of multiple operations.
mutate function in dplyr cheat sheet to transform variables
Summary Statistics
One approach is to create a new data frame with all the relevant and clean (imputed and labelled) variables.
I begin by dropping irrelevant variables like the index and TEAM_BATTING_HBP (too many missing variables), and then creating a summary statistics of the the relevant raw variables.
# DROPING VARIABLE BY INDEXINGdf_train_cleaned<-df_train[,-1]# DROPING VARIABLE BY SELECTING VARIABLE NAMES, and saving the new dataframe df_train_cleaned<-dplyr::select(.data =df_train_cleaned,-c(TEAM_BATTING_HBP))vis_dat(df_train_cleaned)# see variable types
vis_miss(df_train_cleaned)# see missing values
# CREATE SUMMARY STATISTICS## LABEL VARIABLES labels<-c('Number of wins','Base Hits by batters (1B,2B,3B,HR)','Doubles by batters (2B)','Triples by batters (3B)','Homeruns by batters (4B)', 'Walks by batters','Strikeouts by batters','Stolen bases','Caught stealing','Errors','Double Plays','Walks allowed','Hits allowed','Homeruns allowed','Strikeouts by pitchers')?stargazer## ALL VARIABLES IN THE CLEANED DATAFRAMEstargazer(... =df_train_cleaned, type ="text", # html, latex covariate.labels =labels, digits =2# 2 decimal places # summary.stat = c("mean", "sd", "min", "max")# out = # a character vector that contains the path(s) of output files.)
=======================================================================
Statistic N Mean St. Dev. Min Max
-----------------------------------------------------------------------
Number of wins 2,276 80.79 15.75 0 146
Base Hits by batters (1B,2B,3B,HR) 2,276 1,469.27 144.59 891 2,554
Doubles by batters (2B) 2,276 241.25 46.80 69 458
Triples by batters (3B) 2,276 55.25 27.94 0 223
Homeruns by batters (4B) 2,276 99.61 60.55 0 264
Walks by batters 2,276 501.56 122.67 0 878
Strikeouts by batters 2,174 735.61 248.53 0 1,399
Stolen bases 2,145 124.76 87.79 0 697
Caught stealing 1,504 52.80 22.96 0 201
Errors 2,276 1,779.21 1,406.84 1,137 30,132
Double Plays 2,276 105.70 61.30 0 343
Walks allowed 2,276 553.01 166.36 0 3,645
Hits allowed 2,174 817.73 553.09 0 19,278
Homeruns allowed 2,276 246.48 227.77 65 1,898
Strikeouts by pitchers 1,990 146.39 26.23 52 228
-----------------------------------------------------------------------
I personally prefer not to impute values so as to include researched bias, but you can consider imputing using the mean/median if you do not want to throw away a large part of your data (but will have to justify transforming the data though).
Here, I could consider imputing values for TEAM_BASERUN_CS as it has roughly 30% missing values and is not the key dependent or indpendent variable (which would be very hard to defend). Plotting the distribution of these variables before and after imputation would be a good idea to double check imputation does not bias your raw data.
# Sum NAs across columnsdf_train_cleaned%>%summarise(across(everything(),~sum(is.na(.))))%>%glimpse()
# summarise() creates a new data frame...# glimpse() is like a transposed version of print()...# Display NAs as percentage of observations round(x =sapply(X =df_train_cleaned, FUN =function(df){100*sum(is.na(df==TRUE)/length(df))}), digits =2)
### IF YOU PREFER, write a for loop to replace all NA values with the column median # for(i in 1:ncol(df_train_cleaned)) {# df_train_cleaned[ ,i][is.na(df_train_cleaned[ , i])] <- # median(df_train_cleaned[,i], na.rm = TRUE) # }
Imputation
If I was forced to impute, I would replace all NA values with column median for only two variables, and not imputing values for other variables.
TEAM_BASERUN_CS and TEAM_FIELDING_DP are two variables (in two categories base run and fielding) I would consider imputing, as I would want to see variables that will better capture the theoretical effect of their respective categories in the regression.
However, I would not be comfortable in keeping variables where more than 10% of the observations have been imputed, but it seems to me like fishing a little bit here to see if I can get the expected results.
Since even if I loose 30% of my data, I still think I have enough observations and thus will not use the imputed data. You can save the imputed data in another data frame and see if it really changes your results or not.
Another way to present the data (if the variables are reasonably balanced in observations - skip the count column). Putting too much text in notes is not a good idea, but here I do so for my own personal notes/transparency.
## ALL VARIABLES IN THE CLEANED DATAFRAMEstargazer(df_train_cleaned, # data type ="text", # html, latex covariate.labels =labels, # label the variable names digits =2, # round to 2 digits summary.stat =c("mean","sd","min","max"), # "median","p25","p75",... title ="Summary Statistics of Final, Clean Data",# table heading notes =c("N=2276.","'Caught Stealing' and 'Strikeouts by pitchers' have more than 10%", "missing observations respectively which I do not impute.", #"'Strikeouts by batters', 'Stolen bases' and 'Strikeouts by pitchers'", " have 102, 131 and 102 missing values.")#, # notes at bottom of table# font.size = "tiny" # fit )
Summary Statistics of Final, Clean Data
========================================================================
Statistic Mean St. Dev. Min Max
------------------------------------------------------------------------
Number of wins 80.79 15.75 0 146
Base Hits by batters (1B,2B,3B,HR) 1,469.27 144.59 891 2,554
Doubles by batters (2B) 241.25 46.80 69 458
Triples by batters (3B) 55.25 27.94 0 223
Homeruns by batters (4B) 99.61 60.55 0 264
Walks by batters 501.56 122.67 0 878
Strikeouts by batters 735.61 248.53 0 1,399
Stolen bases 124.76 87.79 0 697
Caught stealing 52.80 22.96 0 201
Errors 1,779.21 1,406.84 1,137 30,132
Double Plays 105.70 61.30 0 343
Walks allowed 553.01 166.36 0 3,645
Hits allowed 817.73 553.09 0 19,278
Homeruns allowed 246.48 227.77 65 1,898
Strikeouts by pitchers 146.39 26.23 52 228
------------------------------------------------------------------------
N=2276.
'Caught Stealing' and 'Strikeouts by pitchers' have more than 10%
missing observations respectively which I do not impute.
'Strikeouts by batters', 'Stolen bases' and 'Strikeouts by pitchers'
have 102, 131 and 102 missing values.
Data Visualization
Histogram
You could have used these here, but I personally prefer box plots.
Hard to notice changes in variables where as much as 30% of the values have been imputed (which actually suggests imputation may be fine). I would look at the count of observations just to make sure the imputation worked (the variables are in different data frames though).
hist(x =df_train_cleaned$TEAM_BASERUN_CS, main ="Non-Imputed Data")
hist(x =df_train_cleaned_transformed$TEAM_BASERUN_CS, main ="Imputed NAs")
# clean data frame where I keep relevant non-imputed variables describe(df_train_cleaned$TEAM_BASERUN_CS)
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 1504 52.8 22.96 49 50.36 17.79 0 201 201 1.98 7.62 0.59
# clean data frame where I keep relevant variables (with imputations), and other transformations describe(df_train_cleaned_transformed$TEAM_BASERUN_CS)
vars n mean sd median trimmed mad min max range skew kurtosis se
X1 1 2276 51.51 18.75 49 49.58 7.41 0 201 201 2.6 13.47 0.39
I would definitely clean the labels and title of the graph to make it more presentable, add colors, or consider using ggplot2 package for graphing.
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## + labs(x='Variables', y='Frequency')## ‘stat_bin()‘ using ‘bins = 30‘. Pick better value with ‘bin width‘.remove(df)
You can control every chart element in ggplot, whihc is why it is so popular. Play with the code below -
mean_wins<-mean(df_train_cleaned$TARGET_WINS, na.rm=TRUE)sd_wins<-sd(df_train_cleaned$TARGET_WINS, na.rm=TRUE)ggplot(data =df_train_cleaned, aes(x =TARGET_WINS))+geom_histogram(binwidth=10, col ="red", fill ="grey", aes(y=..density..))+labs(x ="# of Wins", y ="Frequency")+geom_density(alpha=.2)+geom_vline(aes(xintercept =mean(TARGET_WINS, na.rm=TRUE)), color="red", linetype="dashed", linewidth=1)+geom_vline(xintercept =(mean_wins+2*sd_wins), color='blue', linetype='dashed', linewidth=1)+geom_vline(xintercept =(mean_wins-2*sd_wins), color='blue', linetype='dashed', linewidth=1)+geom_vline(xintercept =(mean_wins+3*sd_wins), color='purple', linetype='dashed', linewidth=1)+geom_vline(xintercept =(mean_wins-3*sd_wins), color='purple', linetype='dashed', linewidth=1)
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
Box plots
Outcome Variable
Many settings to play around with. Explore at your lesiure time.
Try to ensure that the reader can grasp the entire story from your graph - so label them on title or axis atleast.
?boxplotboxplot(df_train$TARGET_WINS, las =2, horizontal =TRUE, main ="Wins")
Batting Variable
Instead of using brute force like below, try to write a loop to be more efficient - it is easier to read the code and find errors too.
Initially, it might be easier with brute force to get to know the syntax of the command, and the loop is simply a generalization on the command. So first become good at the base command.
par(mfrow=c(2,2))boxplot(df_train$TEAM_BATTING_H, horizontal =TRUE, main ="Total Hits")boxplot(df_train$TEAM_BATTING_2B, horizontal =TRUE, main ="Doubles")boxplot(df_train$TEAM_BATTING_3B, horizontal =TRUE, main ="Triples")boxplot(df_train$TEAM_BATTING_HR, horizontal =TRUE, main ="Home Runs")
LOOPING
Now that we have some experience with the base command, lets try to write a loop below.
Generate the same graphs above with a loop.
par(mfrow=c(2,2))################################### BARE BONES COMMAND TO ENSURE LOOP WORKS##############################for(iin2:5){boxplot(df_train_cleaned[,i])# choosing 2nd to 5th variables in the df_train dataframe by indexing columns}
Instead of using the default variable names for graph titles, use the vector label we created for titles.
OTHER ADVISE WITH LOOPS
Reading loops from inside to outside (ignore indexing initially) can be helpful.
When constructing a loop, start with the index and a bare bones command (boxplot here) to ensure loop works. Then start adding more embellishments (boxplot titles).
################################### BARE BONES COMMAND + EMBELLISHMENTS (added one by one)##############################par(mfrow=c(2,2))for(iin2:5){boxplot(df_train_cleaned[,i], main =paste("Boxplot:", labels[i]))}
# main = names(df_train[i]) will also work - paste command not required,
Other boxplots are not created using loops, but you can try.
par(mfrow=c(2,2))boxplot(df_train$TEAM_BATTING_BB, horizontal =TRUE, main ="Walks")boxplot(df_train$TEAM_BATTING_SO, horizontal =TRUE, main ="Strikeouts by Batters")boxplot(df_train$TEAM_BATTING_HBP, horizontal =TRUE, main ="Batters Hit")
Baserun Variables
par(mfrow=c(2,1))boxplot(df_train$TEAM_BASERUN_CS, horizontal =TRUE, main ="Caught Stealing")boxplot(df_train$TEAM_BASERUN_SB, horizontal =TRUE, main ="Stolen Bases")
Fielding Variables
Code not super alligned.
par(mfrow=c(2,1))boxplot(df_train$TEAM_FIELDING_E, horizontal =TRUE, main ="Errors")boxplot(df_train$TEAM_FIELDING_DP, horizontal =TRUE, main ="Double Plays")
Pitching Variables
Align the code by spacing it equally in your code chunks (use space bar, or tab).
par(mfrow=c(2,2))boxplot(df_train$TEAM_PITCHING_H , horizontal =TRUE, main ="Hits Allowed")boxplot(df_train$TEAM_PITCHING_HR, horizontal =TRUE, main ="Homeruns Allowed")boxplot(df_train$TEAM_PITCHING_BB, horizontal =TRUE, main ="Walks Allowed")boxplot(df_train$TEAM_PITCHING_SO, horizontal =TRUE, main ="Strikeouts by Pitchers")
GGPLOT
If you prefer ggplot - (Note I am suppressing the code warnings)…
df_ggplot<-reshape2::melt(df_train_cleaned)# convert data to long format
No id variables; using all as measure variables
ggplot(data =df_ggplot, # ggplot requires data in long format mapping =aes(x =factor(variable), y =value))+geom_boxplot()+facet_wrap(facets =~variable, scale="free", ncol =5, nrow =3)+labs(x="Variables", y="Frequency")+theme( strip.text =element_text(size =rel(0.55)), # Reduce the size of facet titles axis.text.x.bottom =element_blank(), # Remove the x-axis title "Variable Names")
?theme
Alternatively, you can convert the data from wide format to long format with the gather command instead of the melt command (or any other commands) and input the long data into ggplot. You will get the same output.
See if you can label the axis and/or the title of the graphs a bit better. You can eyeball the skewness, outliers, mean, min/max from these charts and make your calls on imputations, truncation, transformations (log, sqrt, …) as per your choice.
# Convert df_train_cleaned to long format and save the data as ggplotdf...??gather
No vignettes or demos or help files found with alias or concept or
title matching 'gather' using fuzzy matching.
ggplotdf<-df_train_cleaned%>%gather(key =variable, value =value)# now you can use the same ggplot command above...
Scatter Plots
Scatter plots are two dimensional plots. We will have the dependent variable on the y axis (TARGET_WINS), and the independent variable on the x axis.
Again, start wit the base command and add the non-mandatory/embellishments/beautification code later.
I show two different ways to create the multiple charts with ggplot command, which requires all variables in a data frame in long format.
## PIPING APPROACHdf_train_cleaned%>%# CONVERT TO LONG FORMATgather(key =variable, value =value, -TARGET_WINS)%>%# CREATE GGPLOT AND EMBELLISH ITggplot(data =., mapping =aes(x =value, y =TARGET_WINS))+geom_point(fill ="pink", color="red")+geom_smooth(method ="lm", # model se =FALSE, color ="blue")+facet_wrap(~variable, scales ="free", ncol =4)+labs(x =element_blank(), y ="Target_Wins")
`geom_smooth()` using formula = 'y ~ x'
## SIMPLE RESHAPE ggplot_scatter<-reshape2::melt(data =df_train_cleaned, id.vars=c('TARGET_WINS'))#new argument appear in this melt function.ggplot(data =ggplot_scatter, aes(x =value, y =TARGET_WINS))+geom_point()+facet_wrap(~variable, scale ="free")+geom_smooth(method="lm", se =FALSE, color ="blue")+labs(x="", y="Number of Wins")
`geom_smooth()` using formula = 'y ~ x'
Team_Pitching_SO seems to have outliers !!! Potentially Team_Pitching_H too.
Lets quantify the correlations between the two variables in the graphs above in a table below -
data_cor<-data.frame(cor(df_train_cleaned, df_train_cleaned$TARGET_WINS, use ="pairwise.complete.obs"))colnames(data_cor)<-c("TARGET_WINS")print(data_cor)
I would probably shorten the names of the variables (once I get super comfortable with the data). It looks a bit messy/hard to read.
If you have never played baseball (like myself), having longer and more explanatory variable names may be helpful for recall value. Maybe not for client presentation though.
An abbreviation like B for batter, F for fielding, BR for baserun, P for pitcher may be the middle ground - shortens names, helps readability and may make more sense to for the presenter.
df_train_corr<-subset(df_train, select =-c(INDEX, TEAM_BATTING_HBP))colnames(df_train_corr)<-c("# of Wins","Batter: Base Hits", "Batter: Doubles","Batter: Triples", "Batter: Homeruns","Batter: Walks","Batter: Strikeouts","Baserun: Bases Stolen","Baserun: Caught Stealing","Pitcher: Hits Allowed","Pitcher: Homeruns Allowed","Pitcher: Walks Allowed","Pitcher: Strikeouts","Fielding: Errors","Fielding: Double Plays")?corrplotcorrplot(cor(df_train_corr, use ="na.or.complete"), type ="lower", method ="square"#"number" "circle")
Apart from basic data imputation for variables with less than 10% missing values and dropping variables with large missing values, you can create new variables (by taking log or other transformations) or even truncate the variables.
I personally do not like to impute values, but you can find some code below if you wish to do so.
df_train_cleaned_transformed<-df_train_cleaned# TEAM_PITCHING_H## A. LOGdf_train_cleaned_transformed$TEAM_PITCHING_H_LOG<-log(df_train_cleaned_transformed$TEAM_PITCHING_H)## B. REPLACE OUTLIERS WITH A MAX BOUND = 75 percentile + 1.5 * IQR quartile_3<-quantile(x =df_train_cleaned$TEAM_PITCHING_H, probs=c(.75), na.rm =TRUE)outlier_1.5<-1.5*IQR(df_train_cleaned$TEAM_PITCHING_H)df_train_cleaned_transformed$TEAM_PITCHING_H_TRUNC<-df_train_cleaned$TEAM_PITCHING_Hdf_train_cleaned_transformed$TEAM_PITCHING_H_TRUNC[df_train_cleaned_transformed$TEAM_PITCHING_H_TRUNC>(quartile_3+outlier_1.5)]<-quartile_3+outlier_1.5
Trying to keep short, informative names of both variables and data farmes is helpful. Guilty of not following it above.
Models
You always want to report the summary statistics for the variables on which you are running the regressions. So if your regression was run on 1500 observations with 10 variables, the summary statistics should be of those 10 variables and 1500 observations.
I have not fully followed this principle here (as I only use about 1500 observations in my model 1 and model 2 as I do not impute values and am using a variable which has 30% missing values). So I should be generating another summary statistics tables from stargazer package but since the summary statistics are very similar, I avoid it to make this report any more lengthy.
Thus, if I was to post only one summary statistics table, it would not be the summary statistics of the raw data but the final data in regression.
MODEL 1.
I begin with keeping all variables. Note if VIF above 10 implies string multicollinearity, and the unexpected coefficient signs could be due to this issue.
# MODEL 1: keep all variables model1<-lm(data =df_train_cleaned,TARGET_WINS~.)summary(model1)
sum_model1<-summary(model1)## INCORRECT COEFFICIENTS DUE TO MULTOCOLLINEARITY???vif_values1<-vif(model1)# multicollinearity# vif_values1as.data.frame(vif(model1))
I start dropping variables with empirical effects different from the theoretical signs that are likely to suffer from multicollinearity till I get the expected theoretical signs, and then the also the remaining variables that are insignificant. Of course, I do not get the convergence in one single try and tried many models before selecting the one below. It is a parsimonous model and thus I prefer it too over model 1, and can be easily explained to others.
# MODEL 2: Drop variables with unexpected theoretical signs in model 1, starting from the variables with largest VIFs model2<-lm(data =df_train_cleaned,TARGET_WINS~.-TEAM_BATTING_2B-TEAM_BATTING_HR-TEAM_BATTING_SO-TEAM_BATTING_BB-TEAM_BASERUN_CS-TEAM_PITCHING_SO-TEAM_PITCHING_HR-TEAM_PITCHING_BB-TEAM_FIELDING_DP)summary(model2)
Call:
lm(formula = TARGET_WINS ~ . - TEAM_BATTING_2B - TEAM_BATTING_HR -
TEAM_BATTING_SO - TEAM_BATTING_BB - TEAM_BASERUN_CS - TEAM_PITCHING_SO -
TEAM_PITCHING_HR - TEAM_PITCHING_BB - TEAM_FIELDING_DP, data = df_train_cleaned)
Residuals:
Min 1Q Median 3Q Max
-33.739 -7.461 -0.181 7.546 37.614
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 35.539190 4.553702 7.804 1.12e-14 ***
TEAM_BATTING_H 0.041698 0.003969 10.507 < 2e-16 ***
TEAM_BATTING_3B 0.152425 0.022426 6.797 1.55e-11 ***
TEAM_BASERUN_SB 0.029668 0.006438 4.608 4.42e-06 ***
TEAM_PITCHING_H -0.002852 0.002273 -1.255 0.21
TEAM_FIELDING_E -0.141018 0.009774 -14.428 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 10.97 on 1480 degrees of freedom
(790 observations deleted due to missingness)
Multiple R-squared: 0.2558, Adjusted R-squared: 0.2533
F-statistic: 101.7 on 5 and 1480 DF, p-value: < 2.2e-16
TEAM_BATTING_H: If the number of base hits (1B,2B,3B,HR) by the batters in the team during the season increases by 1, then the number of game wins of the team in the season increases by .042, cetrius paribus. This is a statistically significant at 1% level.
TEAM_BATTING_3B: If the number of triples (3B) by the batters in the team during the season increases by 1, then the number of game wins of the team in the season increases by .15, cetrius paribus. This is a statistically significant at 1% level.
This makes sense as teams with more triples are likely to be hard hitters/better batting performance. Note that by including 1B and 3B, we have large captured the batting performance of a team - no need to include doubles and home runs also - they will only make the coefficients unstable.
TEAM_BASERUN_SB: If the number of stolen bases by the team during the season increases by 1, then the number of game wins of the team in the season increases by 0.029, cetrius paribus. This is a statistically significant at 1% level.
TEAM_PITCHING_H: If the number of hits allowedby the team during the season increases by 1, then the number of game wins of the team in the season decreases by -0.002, cetrius paribus. This effect is not statistically significant though.
This tends to suggest that pitching is not more important than batting. Also, we do not keep other measures of pitching like Walks allowed, Homeruns allowed, or Strikeouts by pitchers due to multicollinearity problem as all of these are just capturing the pitching performance.
TEAM_FIELDING_E: If the number of fielding errors by the team during the season increases by 1, then the number of game wins of the team in the season decreases by -0.14, cetrius paribus. This effect is statistically significant and has a large magnitude/economic significance too.
So if I was to form my own team, I would try to get fit players who can field well and hit hard/frequent (bat well) and not focus too much on finding the best pitchers.
MODEL 3.
I keep my model 2, but take log of all variables (to better deal with right tail/positive skewness). Note that the model is parsimonious (few variables that have their expected signs, and are statistically significant), and the R2 increases from model 2 to model 3 (almost 40%). All coefficients are statistically significant at 1% level, unlike model 2.
# MODEL 3: Log all variables in model 2df_train_cleaned_log<-df_train_cleaned%>%select(-c(TEAM_BATTING_2B, TEAM_BATTING_HR, TEAM_BATTING_SO, TEAM_BATTING_BB ,TEAM_BASERUN_CS ,TEAM_PITCHING_SO ,TEAM_PITCHING_HR ,TEAM_PITCHING_BB,TEAM_FIELDING_DP))%>%mutate_all(funs(log =log(.+1)))%>%select(ends_with("log"))
Warning: `funs()` was deprecated in dplyr 0.8.0.
ℹ Please use a list of either functions or lambdas:
# Simple named list: list(mean = mean, median = median)
# Auto named with `tibble::lst()`: tibble::lst(mean, median)
# Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
#create horizontal bar chart to display each VIF valuebarplot(vif_values3, main ="VIF Values", horiz =TRUE, col ="steelblue")#add vertical line at 10abline(v =10, lwd =3, lty =2)
Coefficient Interpretation -
Now, we are in a log-log world and not a level-level world. So the interpretation will be a bit different.
TEAM_BATTING_H: If the number of base hits (1B,2B,3B,HR) by the batters in the team during the season increases by 1 percent, then the number of game wins of the team in the season increases by approximately 1.046 percent, cetrius paribus. This is a statistically significant at 1% level.
TEAM_BATTING_3B: If the number of triples (3B) by the batters in the team during the season increases by 1 percent, then the number of game wins of the team in the season increases by .13 percent, cetrius paribus. This is a statistically significant at 1% level.
Similar percent-percent interpretation for other variables.
Model Presentation
Again, I would rely on stargazer package to present the regressions. Model 1 and 2 side by side are helpful in seeing how coefficients change in magnitude.
You can keep model 3 here too, or create a new table for it.
stargazer(model1, model2, model3, type='text', column.labels=c("# of Wins","# of Wins","Log # of Wins"), covariate.labels=c(#'Identification Variable (do not use)', # INDEX## 'Number of wins','Base Hits by batters',# (1B,2B,3B,HR)','Doubles by batters (2B)','Triples by batters (3B)','Homeruns by batters (4B)', 'Walks by batters',#'Batters hit by pitch (get a free base)', # TEAM_BATTING_HBP'Strikeouts by batters','Stolen bases','Caught stealing','Hits allowed','Homeruns allowed','Walks allowed','Strikeouts by pitchers','Errors','Double Plays','Log Base Hits by batters',# (1B,2B,3B,HR)','Log Triples by batters (3B)','Log Stolen bases','Log Hits allowed','Log Feilding Errors','Intercept'))
Model 1: Seems the data cleaning and preparation I did was fine (dropping certain variables and imputing certain variables). Note N \(\approx\) 1500.
Model 2: Again, seems the variable sub-selection does not violate OLS assumptions, and as long as model fit increases and variable interpretation is better (find theoretical effects), I can go ahead with this model. Note N \(\approx\) 1500
Model 3: Residual analysis suggets observation 1211 is a troublemaker, and I should definitely throw it out (even though this was my favorite model). THIS SHOWS THAT JUST MAXIMISING ADJUSTED R2 MAY NOT ALWAYS BE THE BEST APPROACH.
Throwing the outlier observation in model 3 makes the residual plots much better, but still not as good at model 2. Also, coefficient estimates change substantially (in relative terms). So I may want to declare my model 2 as my favorite model here, though model 3b may be preferred under certain circumstances (like predicting on evaluation data), or suggests model 3 requires more careful data preparation and analsis.
You will have to compare the pros and cons on co-efficient interpretation/ ease, overall model fit, purpose of building the model (causal analysis or prediction), et cetra to decide upon your favorite model. Simple models are always better. Maybe augment model 2 in other ways like adding interaction terms.
Updated Model 3
# MODEL 3 Updated: Remove outlierdf_train_cleaned_log[1210:1212,]
You can look at the basic model fit variables in stargazer generated table to comment on overall model fit. Play around with the command more.
Adjusted R2 is the most common measure, and other measures like F statistic will largely paint similar picture in terms of whihc is the “best” model.
Adjusted R-squared
Higher the better, but always between 0 and 1. Ensure you are not just trying to maximize adjusted R2, as it would caused us to prefer model 3 where residual analysis suggested OLS assumptions may not be met.
Can manually pull out these numbers after running the regression - store the summary of the regression as another object, and pull elements from it using basic commands.
Higher the better, as it tells you the coefficients collectively are significant in fitting the response variable.
# do not want the numerator and denominator degrees of freedomF_stat<-c(sum_model1$fstatistic[1],sum_model2$fstatistic[1],sum_model3$fstatistic[1],sum_model3b$fstatistic[1])F_stat
value value value value
82.09633 101.73976 262.89368 178.78082
Mean Squared Error
The Mean Squared Error (MSE) can be found by calculating the average of squared residuals of each model. A lower mean square error is better (if the models are correctly specified).
The RMSE is the square root of the variance of the residuals. It indicates the absolute fit of the model to the data – how close the observed data points are to the model’s predicted values and is a good measure of how accurately the model fits the response. Lower values of RMSE indicate better fit.
INDEX TEAM_BATTING_H TEAM_BATTING_3B TEAM_BASERUN_SB TEAM_PITCHING_H
1 9 1209 33 62 1209
2 10 1221 29 54 1221
3 14 1395 29 59 1395
4 47 1539 29 148 1539
5 60 1445 68 NA 3902
6 63 1431 53 NA 2793
TEAM_FIELDING_E predicted_wins.fit predicted_wins.lwr predicted_wins.upr
1 140 69.63123 48.06167 91.20078
2 135 69.95543 48.38927 91.52159
3 156 73.90164 52.36133 95.44195
4 124 86.64847 65.09614 108.20080
5 616 NA NA NA
6 572 NA NA NA
# I prefer to choose variables by names instead of column index (more prone to errors in large datfarmes)## head(df_eval[,c(1,3,5,7,10,16:17)])tail(df_eval[,c(1,3,5,7,10,16:17)])
INDEX TEAM_BATTING_2B TEAM_BATTING_HR TEAM_BATTING_SO TEAM_BATTING_HBP
254 2487 72 18 1107 NA
255 2500 162 95 860 NA
256 2501 190 125 777 NA
257 2520 263 102 976 NA
258 2521 270 122 860 NA
259 2525 339 172 1084 NA
TEAM_FIELDING_DP predicted_wins.fit predicted_wins.lwr predicted_wins.upr
254 NA NA NA NA
255 146 70.00964 48.45520 91.56409
256 156 70.59167 49.04053 92.14281
257 113 83.15421 61.58805 104.72036
258 144 80.14207 58.54384 101.74029
259 150 79.75158 58.22401 101.27915