MoneyballGroupProject

Task I: Importing the Data

remove (list = ls())

`moneyball.training.data.(1)` <- read.csv("~/Desktop/moneyball-training-data (1).csv", header=FALSE)
moneyball <- `moneyball.training.data.(1)`

Task II: Cleaning the Data

A.  You can drop variables with too many missing observations (we talked about subsetting dataLinks to an external site. in class by indexing on column numbers) and then drop all rows corresponding to missing observations in any variable (na.omit command works). 

B.  Or you can impute the missing values with the mean/median of the variable.  

head(is.na(moneyball)) # search for missing values in data set
        V1    V2    V3    V4    V5    V6    V7    V8    V9   V10   V11   V12
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
       V13   V14   V15   V16   V17
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE FALSE
head(colSums(is.na(moneyball))) # number of missing values in each variable of the data set
V1 V2 V3 V4 V5 V6 
 0  0  0  0  0  0 
?head
head(moneyball, n = 10) # first 4 entries
      V1          V2             V3              V4              V5
1  INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
2      1          39           1445             194              39
3      2          70           1339             219              22
4      3          86           1377             232              35
5      4          70           1387             209              38
6      5          82           1297             186              27
7      6          75           1279             200              36
8      7          80           1244             179              54
9      8          85           1273             171              37
10    11          86           1391             197              40
                V6              V7              V8              V9
1  TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
2               13             143             842                
3              190             685            1075              37
4              137             602             917              46
5               96             451             922              43
6              102             472             920              49
7               92             443             973             107
8              122             525            1062              80
9              115             456            1027              40
10             114             447             922              69
               V10              V11             V12              V13
1  TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
2                                              9364               84
3               28                             1347              191
4               27                             1377              137
5               30                             1396               97
6               39                             1297              102
7               59                             1279               92
8               54                             1244              122
9               36                             1281              116
10              27                             1391              114
                V14              V15             V16              V17
1  TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
2               927             5456            1011                 
3               689             1082             193              155
4               602              917             175              153
5               454              928             164              156
6               472              920             138              168
7               443              973             123              149
8               525             1062             136              186
9               459             1033             112              136
10              447              922             127              169
#install.packages("visdat")

library(visdat)

# vis_miss(df)

vis_dat(moneyball)

Summary Statistics

# install.packages(Amelia)
require(Amelia)
Loading required package: Amelia
Loading required package: Rcpp
## 
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
## 
missmap(moneyball)

library("psych")
describe(moneyball)
     vars    n    mean     sd median trimmed    mad min  max range  skew
V1*     1 2277 1139.00 657.46   1139 1139.00 843.60   1 2277  2276  0.00
V2*     2 2277   79.08  27.08     87   84.67  16.31   1  109   108 -1.72
V3*     3 2277  254.34 119.12    245  249.36 114.16   1  570   569  0.36
V4*     4 2277  115.27  46.10    112  114.50  47.44   1  241   240  0.16
V5*     5 2277   86.72  29.75     85   88.62  22.24   1  145   144 -0.64
V6*     6 2277  112.10  75.12     97  109.87 100.82   1  244   243  0.21
V7*     7 2277  286.59 105.03    288  289.90  97.85   1  534   533 -0.27
V8*     8 2277  428.00 248.71    440  435.21 305.42   1  824   823 -0.22
V9*     9 2277  175.60 128.39    179  176.56 197.19   1  350   349 -0.04
V10*   10 2277   51.25  39.66     65   50.42  40.03   1  130   129 -0.21
V11*   11 2277    3.15   7.98      1    1.00   0.00   1   57    56  4.06
V12*   12 2277  321.53 196.68    274  301.63 173.46   1  844   843  0.83
V13*   13 2277  119.72  80.55    103  117.56 108.23   1  257   256  0.20
V14*   14 2277  260.15 105.38    253  258.01  97.85   1  536   535  0.19
V15*   15 2277  442.37 254.26    481  453.04 293.55   1  825   824 -0.34
V16*   16 2277  162.37 146.32    106  138.67  90.44   1  550   549  1.26
V17*   17 2277   49.55  32.70     50   47.47  28.17   1  146   145  0.54
     kurtosis    se
V1*     -1.20 13.78
V2*      2.18  0.57
V3*     -0.25  2.50
V4*     -0.38  0.97
V5*      0.90  0.62
V6*     -1.37  1.57
V7*      0.06  2.20
V8*     -1.12  5.21
V9*     -1.67  2.69
V10*    -1.46  0.83
V11*    16.78  0.17
V12*    -0.06  4.12
V13*    -1.42  1.69
V14*    -0.10  2.21
V15*    -1.13  5.33
V16*     0.51  3.07
V17*     0.43  0.69
#install.packages("stargazer") 
library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
stargazer(moneyball, type = "text" )

=================================
Statistic N Mean St. Dev. Min Max
=================================
variable_labels <- c("Index", 
                     "Target Wins", 
                     "Team Batting H", 
                     "Team Batting 2B", 
                     "Team Batting 3B", 
                     "Team Batting HR",
                     "Team Batting BB",
                     "Team Batting SO",
                     "Team Baserun SB",
                     "Team Baserun CS",
                     "Team Batting HBP",
                     "Team Pitching H",
                     "Team Pitching HR",
                     "Team Pitching BB",
                     "Team Pitching SO",
                     "Team Feilding E")

data_notes <- c("N = 2277")

class(variable_labels)
[1] "character"
length(variable_labels)
[1] 16
stargazer(moneyball, 
          type              = "text", 
          title             = "Summary Statistics", 
          covariate.labels  = variable_labels,
          notes             = data_notes, 
          omit.summary.stat = "n", 
          digits            = 2)

Summary Statistics
===============================
Statistic Mean St. Dev. Min Max
===============================
N = 2277