Data Exploration and Visualization

Data analysis on Baseball dataset

The baseball data set contains the following variables
+ Name* - Name of the player + Team - Name of the team + Position - Player position - Catcher,Designated hitter,First baseman,Outfielder,Relief pitcher, Second baseman,Shortstop,Starting pitcher, Third baseman + Height - Height of the player + Weight - Weight of the player + Age - Age of the player

Lets take a look at the dataset

mydata <- read.table('https://umich.instructure.com/files/330381/download?download_frd=1',as.is=T, header=T)
head(mydata,4)

##              Name Team      Position Height Weight   Age
## 1   Adam_Donachie  BAL       Catcher     74    180 22.99
## 2       Paul_Bako  BAL       Catcher     74    215 34.69
## 3 Ramon_Hernandez  BAL       Catcher     72    210 30.78
## 4    Kevin_Millar  BAL First_Baseman     72    210 35.43

dim(mydata)

## [1] 1034    6

str(mydata)

## 'data.frame':    1034 obs. of  6 variables:
##  $ Name    : chr  "Adam_Donachie" "Paul_Bako" "Ramon_Hernandez" "Kevin_Millar" ...
##  $ Team    : chr  "BAL" "BAL" "BAL" "BAL" ...
##  $ Position: chr  "Catcher" "Catcher" "Catcher" "First_Baseman" ...
##  $ Height  : int  74 74 72 72 73 69 69 71 76 71 ...
##  $ Weight  : int  180 215 210 210 188 176 209 200 231 180 ...
##  $ Age     : num  23 34.7 30.8 35.4 35.7 ...

#Lets see the mininmum, average and maximum values in datas
mydata %>% group_by(Position)  %>%
          summarise(min(Age),mean(Age),max(Age))

## # A tibble: 9 x 4
##   Position          `min(Age)` `mean(Age)` `max(Age)`
## * <chr>                  <dbl>       <dbl>      <dbl>
## 1 Catcher                 22.3        29.6       38.5
## 2 Designated_Hitter       23.6        30.4       38.8
## 3 First_Baseman           22.8        29.5       48.5
## 4 Outfielder              21.5        29.0       42.6
## 5 Relief_Pitcher          21.8        28.5       42.3
## 6 Second_Baseman          22.6        29.0       41.2
## 7 Shortstop               22.4        28.4       39.8
## 8 Starting_Pitcher        20.9        28.2       44.3
## 9 Third_Baseman           22.4        28.7       37.4

mydata %>% group_by(Position)  %>%
          summarise(min(Height),mean(Height),max(Height))

## # A tibble: 9 x 4
##   Position          `min(Height)` `mean(Height)` `max(Height)`
## * <chr>                     <int>          <dbl>         <int>
## 1 Catcher                      69           72.7            76
## 2 Designated_Hitter            70           74.2            77
## 3 First_Baseman                70           74              80
## 4 Outfielder                   68           73.0            78
## 5 Relief_Pitcher               68           74.4            83
## 6 Second_Baseman               67           71.4            75
## 7 Shortstop                    67           71.9            75
## 8 Starting_Pitcher             69           74.7            82
## 9 Third_Baseman                68           73.0            77

mydata %>% group_by(Position)  %>%
           summarise(min(Weight),mean(Weight),max(Weight))

## # A tibble: 9 x 4
##   Position          `min(Weight)` `mean(Weight)` `max(Weight)`
## * <chr>                     <int>          <dbl>         <int>
## 1 Catcher                     170           204.           245
## 2 Designated_Hitter           195           221.           275
## 3 First_Baseman               180           213.           260
## 4 Outfielder                  160           199.           257
## 5 Relief_Pitcher              150           204.           278
## 6 Second_Baseman              160           184.           210
## 7 Shortstop                   150           183.           210
## 8 Starting_Pitcher            150           205.           290
## 9 Third_Baseman               155           201.           240

Factorization and plotting

Two columns - * Team and Postion * contains categorical values that can be changed to factor

mydata$Team <- factor(mydata$Team)
mydata$Position <- factor(mydata$Position)
str(mydata$Team)

##  Factor w/ 30 levels "ANA","ARZ","ATL",..: 4 4 4 4 4 4 4 4 4 4 ...

str(mydata$Position)

##  Factor w/ 9 levels "Catcher","Designated_Hitter",..: 1 1 1 3 3 6 7 9 9 4 ...

Position haS 9 LEVELS. For better analysis, lets reduce the number of levels.

mydata <- mydata %>% plyr::mutate(new_post = 
                    case_when(Position == "First_Baseman" ~ "Baseman",
                              Position == "Second_Baseman" ~ "Baseman",
                              Position == "Third_Baseman" ~ "Baseman",
                              Position == "Catcher" ~ "Catcher",
                              Position == "Designated_Hitter" ~ "Hitter",
                              Position == "Outfielder" ~ "Outfielder",
                              Position == "Relief_Pitcher" ~ "Pitcher",
                              Position == "Starting_Pitcher" ~ "Pitcher",
                              Position == "Shortstop" ~ "Shortstop",
                    ))

dim(mydata)

## [1] 1034    7

head(mydata,4)

##              Name Team      Position Height Weight   Age new_post
## 1   Adam_Donachie  BAL       Catcher     74    180 22.99  Catcher
## 2       Paul_Bako  BAL       Catcher     74    215 34.69  Catcher
## 3 Ramon_Hernandez  BAL       Catcher     72    210 30.78  Catcher
## 4    Kevin_Millar  BAL First_Baseman     72    210 35.43  Baseman

table(mydata$Team,mydata$new_post)

##      
##       Baseman Catcher Hitter Outfielder Pitcher Shortstop
##   ANA       7       3      0          8      15         2
##   ARZ       5       2      0          5      14         2
##   ATL       6       2      0          7      20         2
##   BAL       5       3      1          7      18         1
##   BOS       3       3      1          7      20         2
##   CHC       5       3      0          8      19         1
##   CIN       6       3      0          8      17         2
##   CLE       5       2      1          8      18         1
##   COL       4       3      1          7      18         2
##   CWS       4       2      2          8      15         2
##   DET       6       2      1          6      19         3
##   FLA       4       2      0          7      17         2
##   HOU       8       3      0          5      17         1
##   KC        6       2      1          6      18         2
##   LA        8       2      0          6      16         1
##   MIN       7       3      2          5      15         1
##   MLW       5       4      0          8      15         3
##   NYM       5       2      0          7      22         2
##   NYY       6       2      1          5      17         1
##   OAK       6       3      1          5      19         3
##   PHI       5       4      1          6      19         1
##   PIT       5       2      0          7      20         1
##   SD        3       2      1          7      18         2
##   SEA       5       2      0          7      18         2
##   SF        6       2      0          7      18         1
##   STL       6       2      0          6      17         1
##   TB        4       3      1          5      19         1
##   TEX       3       4      1          6      19         2
##   TOR       6       2      2          3      19         2
##   WAS       4       2      0          7      20         3

Lets derive two tables from the above data set.

Table with player position and average values of age, height and weight.
Table with team name and position

# Position stats
position_stats <- mydata %>% group_by(Position) %>%
                           summarise(mean_height = round(mean(Height),0),mean_age= round(mean(Age),0),mean_weight= round(mean(Weight),0),
                           no_of_players= n())

head(position_stats,3)

## # A tibble: 3 x 5
##   Position          mean_height mean_age mean_weight no_of_players
##   <fct>                   <dbl>    <dbl>       <dbl>         <int>
## 1 Catcher                    73       30         204            76
## 2 Designated_Hitter          74       30         221            18
## 3 First_Baseman              74       29         213            55

sum(position_stats$no_of_players)

## [1] 1034

We will use ggplot to plot Height, weight of the player based on position and age.

#library(ggplot2)
ggplot2::ggplot(position_stats,aes(x=mean_weight, y= mean_height))+geom_point(aes(col=Position,size=mean_age))+
         labs(title = "Player position vs stats",x = "Average weight",y="Average height" )+scale_color_manual(values=c("slategray",  "midnightblue","green","yellow2","wheat4","Brown2","salmon3","deeppink","cyan3"))+theme(legend.key.size = unit(0.5,'cm'),legend.position = "left")+scale_size_continuous(range = c(4,10))+guides(color = guide_legend(override.aes = list(size = 4) ) )

**** Observation from graph: + Position based on average age: Least mean age: Shortstop,Starting_Pitcher Maximum mean age: Designated_Hitter

Position of tallest player: Starting_Pitcher
Position of shortest Player: Second_Baseman

******Designated hitter are mostly senior most player of the tem who don’t have to play defense. ******Shortstop is physically demanding position so the player with least average age makes sense. ******Starting pitcher have to pitch balls with precision and control. So, the average age of Starting pitcher is less

#library(dplyr)
names(mydata)

## [1] "Name"     "Team"     "Position" "Height"   "Weight"   "Age"      "new_post"

# Team stats
Team_Stats <- mydata %>% select(Team,new_post)%>% 
                     group_by(Team)%>%
                        count(new_post)

Team_Stats <- Team_Stats %>% distinct(Team,new_post,n,.keep_all=FALSE)   #remove duplicate rows from the dataset

sapply(Team_Stats,typeof)

##        Team    new_post           n 
##   "integer" "character"   "integer"

head(Team_Stats,10)

## # A tibble: 10 x 3
## # Groups:   Team [2]
##    Team  new_post       n
##    <fct> <chr>      <int>
##  1 ANA   Baseman        7
##  2 ANA   Catcher        3
##  3 ANA   Outfielder     8
##  4 ANA   Pitcher       15
##  5 ANA   Shortstop      2
##  6 ARZ   Baseman        5
##  7 ARZ   Catcher        2
##  8 ARZ   Outfielder     5
##  9 ARZ   Pitcher       14
## 10 ARZ   Shortstop      2

sum(Team_Stats$n)

## [1] 1034

length(mydata$Position)

## [1] 1034

Lets observe the number of players in each position based on teams

#library(ggplot2)
Team_stats_plot <- ggplot2::ggplot(Team_Stats,aes(x=new_post,y=n,fill=Team))+geom_bar(position = "dodge",stat = "identity")+coord_flip()
Team_stats_plot

**Observation from the above plot: + All teams have more pitchers. As pitchers are required to rest for a week after a match. Team would need more pitchers to take turns during matches. + Outfielder and Baseman are similar + Teams have more catchers than Hitters and Shortstops.

##Naive Bayes and knn on the dataset

Let me try knn and Naive bayes algorithm on the dataset.

set.seed(147)
sample_set <- caTools::sample.split(mydata,SplitRatio = 0.8)
Train_data <- mydata[sample_set, ]
Test_data <- mydata[!sample_set, ]

#frequency of values 

table(Test_data$new_post)

## 
##    Baseman    Catcher     Hitter Outfielder    Pitcher  Shortstop 
##         49         19          6         55        153         13

TTable <- transform(table(Train_data$new_post))
TTable

##         Var1 Freq
## 1    Baseman  109
## 2    Catcher   57
## 3     Hitter   12
## 4 Outfielder  139
## 5    Pitcher  383
## 6  Shortstop   39

#Proportion of train and test data

round(prop.table(table(Train_data$new_post))*100,1)

## 
##    Baseman    Catcher     Hitter Outfielder    Pitcher  Shortstop 
##       14.7        7.7        1.6       18.8       51.8        5.3

round(prop.table(table(Test_data$new_post))*100,1)

## 
##    Baseman    Catcher     Hitter Outfielder    Pitcher  Shortstop 
##       16.6        6.4        2.0       18.6       51.9        4.4

# Naive bayes algorithm

#laplace = 1

classifier <- e1071::naiveBayes(Train_data[, c("Weight","Height","Age")],as.factor(Train_data$new_post),laplace=1)
predict_vaue <- predict(classifier,Test_data[,c("Weight","Height","Age")])
gmodels::CrossTable(predict_vaue,(Test_data$new_post),prop.r = F,prop.c = F,prop.t = F,prop.chisq = F)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |-------------------------|
## 
##  
## Total Observations in Table:  295 
## 
##  
##              | (Test_data$new_post) 
## predict_vaue |    Baseman |    Catcher |     Hitter | Outfielder |    Pitcher |  Shortstop |  Row Total | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##      Baseman |          7 |          0 |          0 |          2 |          3 |          2 |         14 | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##   Outfielder |          4 |          2 |          0 |          6 |          4 |          6 |         22 | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##      Pitcher |         38 |         16 |          6 |         43 |        143 |          4 |        250 | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##    Shortstop |          0 |          1 |          0 |          4 |          3 |          1 |          9 | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Column Total |         49 |         19 |          6 |         55 |        153 |         13 |        295 | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
## 
##

table(predict_vaue,Test_data$new_post)

##             
## predict_vaue Baseman Catcher Hitter Outfielder Pitcher Shortstop
##   Baseman          7       0      0          2       3         2
##   Catcher          0       0      0          0       0         0
##   Hitter           0       0      0          0       0         0
##   Outfielder       4       2      0          6       4         6
##   Pitcher         38      16      6         43     143         4
##   Shortstop        0       1      0          4       3         1

sum(diag(table(predict_vaue,Test_data$new_post)))

## [1] 157

accuracy <- 157/295
round(accuracy*100,2)

## [1] 53.22

** knn algorithm

#library(e1071)
predict_knn <- class::knn(Train_data[,c("Weight","Height","Age")],Test_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),k=11)
gmodels::CrossTable(predict_knn,Test_data$new_post)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  295 
## 
##  
##              | Test_data$new_post 
##  predict_knn |    Baseman |    Catcher |     Hitter | Outfielder |    Pitcher |  Shortstop |  Row Total | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##      Baseman |          7 |          3 |          0 |          4 |         10 |          1 |         25 | 
##              |      1.953 |      1.200 |      0.508 |      0.094 |      0.679 |      0.009 |            | 
##              |      0.280 |      0.120 |      0.000 |      0.160 |      0.400 |      0.040 |      0.085 | 
##              |      0.143 |      0.158 |      0.000 |      0.073 |      0.065 |      0.077 |            | 
##              |      0.024 |      0.010 |      0.000 |      0.014 |      0.034 |      0.003 |            | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##      Catcher |          0 |          1 |          0 |          2 |          4 |          1 |          8 | 
##              |      1.329 |      0.456 |      0.163 |      0.173 |      0.005 |      1.189 |            | 
##              |      0.000 |      0.125 |      0.000 |      0.250 |      0.500 |      0.125 |      0.027 | 
##              |      0.000 |      0.053 |      0.000 |      0.036 |      0.026 |      0.077 |            | 
##              |      0.000 |      0.003 |      0.000 |      0.007 |      0.014 |      0.003 |            | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##   Outfielder |         11 |          1 |          0 |          7 |         19 |          4 |         42 | 
##              |      2.321 |      1.075 |      0.854 |      0.088 |      0.356 |      2.496 |            | 
##              |      0.262 |      0.024 |      0.000 |      0.167 |      0.452 |      0.095 |      0.142 | 
##              |      0.224 |      0.053 |      0.000 |      0.127 |      0.124 |      0.308 |            | 
##              |      0.037 |      0.003 |      0.000 |      0.024 |      0.064 |      0.014 |            | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
##      Pitcher |         31 |         14 |          6 |         42 |        120 |          7 |        220 | 
##              |      0.841 |      0.002 |      0.520 |      0.024 |      0.305 |      0.749 |            | 
##              |      0.141 |      0.064 |      0.027 |      0.191 |      0.545 |      0.032 |      0.746 | 
##              |      0.633 |      0.737 |      1.000 |      0.764 |      0.784 |      0.538 |            | 
##              |      0.105 |      0.047 |      0.020 |      0.142 |      0.407 |      0.024 |            | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Column Total |         49 |         19 |          6 |         55 |        153 |         13 |        295 | 
##              |      0.166 |      0.064 |      0.020 |      0.186 |      0.519 |      0.044 |            | 
## -------------|------------|------------|------------|------------|------------|------------|------------|
## 
##

sum(diag(table(predict_knn,Test_data$new_post)))

## [1] 135

accuracy <- sum(diag(table(predict_knn,Test_data$new_post)))/295
round(accuracy*100,2)

## [1] 45.76

#knn tunning to check the omtimum value of k

knn_classifier <- e1071::tune.knn(Train_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),knn=21:31)
knn_classifier

## 
## Error estimation of 'knn.wrapper' using 10-fold cross validation: 0.6104406

#use k=21
predict_knn <- class::knn(Train_data[,c("Weight","Height","Age")],Test_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),k=21)
#gmodels::CrossTable(predict_knn,Test_data$new_post)
sum(diag(table(predict_knn,Test_data$new_post)))

## [1] 147

accuracy <- sum(diag(table(predict_knn,Test_data$new_post)))/295
round(accuracy*100,2)

## [1] 49.83

Data Exploration and Visualization

Thrishna Balakrishnan

4/6/2021

Data analysis on Baseball dataset

Factorization and plotting