# Start from an empty workspace so objects left over from an earlier session
# cannot leak into this analysis.
rm(list = ls())

# Load the training data. The first row of the CSV holds the column names
# (INDEX, TARGET_WINS, TEAM_BATTING_H, ...), so it has to be read as a header.
# With header = FALSE that row is read as an observation, the columns are no
# longer numeric, and the summary tables further down come out empty.
`moneyball.training.data.(1)` <- read.csv(
  "~/Desktop/moneyball-training-data (1).csv",
  header = TRUE
)

# Short working name used by the rest of the script.
moneyball <- `moneyball.training.data.(1)`

# Moneyball Group Project
# Task I: Importing the Data
# Task II: Cleaning the Data
# A. You can drop variables with too many missing observations (we talked about subsetting data in class by indexing on column numbers) and then drop all rows that have a missing observation in any variable (the na.omit command works).
# B. Or you can impute the missing values with the mean/median of the variable.
# Flag every cell as missing (TRUE) or present (FALSE); head() limits the
# printout to the first rows.
head(is.na(moneyball)) # search for missing values in data set
#> V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
[1,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
V13 V14 V15 V16 V17
[1,] FALSE FALSE FALSE FALSE FALSE
[2,] FALSE FALSE FALSE FALSE FALSE
[3,] FALSE FALSE FALSE FALSE FALSE
[4,] FALSE FALSE FALSE FALSE FALSE
[5,] FALSE FALSE FALSE FALSE FALSE
[6,] FALSE FALSE FALSE FALSE FALSE
# Number of missing values in each variable of the data set.
# No head() here: wrapping the result in head() printed only the first six
# counts and hid the remaining columns.
colSums(is.na(moneyball))
# Output of the earlier, truncated run (first six columns only):
#> V1 V2 V3 V4 V5 V6
#>  0  0  0  0  0  0
# Open the help page for head().
?head
# Preview the data set.
head(moneyball, n = 10) # first 10 rows
#> V1 V2 V3 V4 V5
1 INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
2 1 39 1445 194 39
3 2 70 1339 219 22
4 3 86 1377 232 35
5 4 70 1387 209 38
6 5 82 1297 186 27
7 6 75 1279 200 36
8 7 80 1244 179 54
9 8 85 1273 171 37
10 11 86 1391 197 40
V6 V7 V8 V9
1 TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
2 13 143 842
3 190 685 1075 37
4 137 602 917 46
5 96 451 922 43
6 102 472 920 49
7 92 443 973 107
8 122 525 1062 80
9 115 456 1027 40
10 114 447 922 69
V10 V11 V12 V13
1 TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
2 9364 84
3 28 1347 191
4 27 1377 137
5 30 1396 97
6 39 1297 102
7 59 1279 92
8 54 1244 122
9 36 1281 116
10 27 1391 114
V14 V15 V16 V17
1 TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
2 927 5456 1011
3 689 1082 193 155
4 602 917 175 153
5 454 928 164 156
6 472 920 138 168
7 443 973 123 149
8 525 1062 136 186
9 459 1033 112 136
10 447 922 127 169
# install.packages("visdat")
library(visdat)
# vis_miss(df)
# Plot an overview of the data frame (column types and missing cells).
vis_dat(moneyball)

# Summary Statistics ----

# install.packages("Amelia")
# library() stops with an error if the package is not installed, whereas
# require() only returns FALSE and lets the script fail later at missmap().
library(Amelia)
Loading required package: Rcpp
##
## Amelia II: Multiple Imputation
## (Version 1.8.2, built: 2024-04-10)
## Copyright (C) 2005-2024 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
# Draw the missingness map of the data set (Amelia).
missmap(moneyball)

library("psych")
# Descriptive statistics for every column (psych).
describe(moneyball)
#> vars n mean sd median trimmed mad min max range skew
V1* 1 2277 1139.00 657.46 1139 1139.00 843.60 1 2277 2276 0.00
V2* 2 2277 79.08 27.08 87 84.67 16.31 1 109 108 -1.72
V3* 3 2277 254.34 119.12 245 249.36 114.16 1 570 569 0.36
V4* 4 2277 115.27 46.10 112 114.50 47.44 1 241 240 0.16
V5* 5 2277 86.72 29.75 85 88.62 22.24 1 145 144 -0.64
V6* 6 2277 112.10 75.12 97 109.87 100.82 1 244 243 0.21
V7* 7 2277 286.59 105.03 288 289.90 97.85 1 534 533 -0.27
V8* 8 2277 428.00 248.71 440 435.21 305.42 1 824 823 -0.22
V9* 9 2277 175.60 128.39 179 176.56 197.19 1 350 349 -0.04
V10* 10 2277 51.25 39.66 65 50.42 40.03 1 130 129 -0.21
V11* 11 2277 3.15 7.98 1 1.00 0.00 1 57 56 4.06
V12* 12 2277 321.53 196.68 274 301.63 173.46 1 844 843 0.83
V13* 13 2277 119.72 80.55 103 117.56 108.23 1 257 256 0.20
V14* 14 2277 260.15 105.38 253 258.01 97.85 1 536 535 0.19
V15* 15 2277 442.37 254.26 481 453.04 293.55 1 825 824 -0.34
V16* 16 2277 162.37 146.32 106 138.67 90.44 1 550 549 1.26
V17* 17 2277 49.55 32.70 50 47.47 28.17 1 146 145 0.54
kurtosis se
V1* -1.20 13.78
V2* 2.18 0.57
V3* -0.25 2.50
V4* -0.38 0.97
V5* 0.90 0.62
V6* -1.37 1.57
V7* 0.06 2.20
V8* -1.12 5.21
V9* -1.67 2.69
V10* -1.46 0.83
V11* 16.78 0.17
V12* -0.06 4.12
V13* -1.42 1.69
V14* -0.10 2.21
V15* -1.13 5.33
V16* 0.51 3.07
V17* 0.43 0.69
#install.packages("stargazer")
library(stargazer)
Please cite as:
Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Print a plain-text summary-statistics table for the data set.
stargazer(
  moneyball,
  type = "text"
)
=================================
Statistic N Mean St. Dev. Min Max
=================================
# Human-readable labels for the summary table, one per column of the data set
# and in column order (INDEX ... TEAM_FIELDING_DP). The vector needs 17 entries:
# the original stopped at 16, leaving TEAM_FIELDING_DP without a label, and
# spelled "Fielding" as "Feilding".
variable_labels <- c(
  "Index",
  "Target Wins",
  "Team Batting H",
  "Team Batting 2B",
  "Team Batting 3B",
  "Team Batting HR",
  "Team Batting BB",
  "Team Batting SO",
  "Team Baserun SB",
  "Team Baserun CS",
  "Team Batting HBP",
  "Team Pitching H",
  "Team Pitching HR",
  "Team Pitching BB",
  "Team Pitching SO",
  "Team Fielding E",
  "Team Fielding DP"
)
# Footnote for the summary table. The sample size is taken from the data
# rather than typed in, so it stays correct if the number of rows changes.
data_notes <- paste("N =", nrow(moneyball))

# Quick interactive checks on the label vector.
class(variable_labels) # "character"
length(variable_labels)
# Render the summary-statistics table as plain text with the readable variable
# labels and the sample-size footnote; the N column is left out of the table.
stargazer(
  moneyball,
  title = "Summary Statistics",
  type = "text",
  covariate.labels = variable_labels,
  omit.summary.stat = "n",
  digits = 2,
  notes = data_notes
)
Summary Statistics
===============================
Statistic Mean St. Dev. Min Max
===============================
N = 2277