The baseball data set contains the following variables
+ Name* - Name of the player + Team - Name of the team + Position - Player position - Catcher,Designated hitter,First baseman,Outfielder,Relief pitcher, Second baseman,Shortstop,Starting pitcher, Third baseman + Height - Height of the player + Weight - Weight of the player + Age - Age of the player
Lets take a look at the dataset
mydata <- read.table('https://umich.instructure.com/files/330381/download?download_frd=1',as.is=T, header=T)
head(mydata,4)
## Name Team Position Height Weight Age
## 1 Adam_Donachie BAL Catcher 74 180 22.99
## 2 Paul_Bako BAL Catcher 74 215 34.69
## 3 Ramon_Hernandez BAL Catcher 72 210 30.78
## 4 Kevin_Millar BAL First_Baseman 72 210 35.43
dim(mydata)
## [1] 1034 6
str(mydata)
## 'data.frame': 1034 obs. of 6 variables:
## $ Name : chr "Adam_Donachie" "Paul_Bako" "Ramon_Hernandez" "Kevin_Millar" ...
## $ Team : chr "BAL" "BAL" "BAL" "BAL" ...
## $ Position: chr "Catcher" "Catcher" "Catcher" "First_Baseman" ...
## $ Height : int 74 74 72 72 73 69 69 71 76 71 ...
## $ Weight : int 180 215 210 210 188 176 209 200 231 180 ...
## $ Age : num 23 34.7 30.8 35.4 35.7 ...
#Lets see the mininmum, average and maximum values in datas
mydata %>% group_by(Position) %>%
summarise(min(Age),mean(Age),max(Age))
## # A tibble: 9 x 4
## Position `min(Age)` `mean(Age)` `max(Age)`
## * <chr> <dbl> <dbl> <dbl>
## 1 Catcher 22.3 29.6 38.5
## 2 Designated_Hitter 23.6 30.4 38.8
## 3 First_Baseman 22.8 29.5 48.5
## 4 Outfielder 21.5 29.0 42.6
## 5 Relief_Pitcher 21.8 28.5 42.3
## 6 Second_Baseman 22.6 29.0 41.2
## 7 Shortstop 22.4 28.4 39.8
## 8 Starting_Pitcher 20.9 28.2 44.3
## 9 Third_Baseman 22.4 28.7 37.4
mydata %>% group_by(Position) %>%
summarise(min(Height),mean(Height),max(Height))
## # A tibble: 9 x 4
## Position `min(Height)` `mean(Height)` `max(Height)`
## * <chr> <int> <dbl> <int>
## 1 Catcher 69 72.7 76
## 2 Designated_Hitter 70 74.2 77
## 3 First_Baseman 70 74 80
## 4 Outfielder 68 73.0 78
## 5 Relief_Pitcher 68 74.4 83
## 6 Second_Baseman 67 71.4 75
## 7 Shortstop 67 71.9 75
## 8 Starting_Pitcher 69 74.7 82
## 9 Third_Baseman 68 73.0 77
mydata %>% group_by(Position) %>%
summarise(min(Weight),mean(Weight),max(Weight))
## # A tibble: 9 x 4
## Position `min(Weight)` `mean(Weight)` `max(Weight)`
## * <chr> <int> <dbl> <int>
## 1 Catcher 170 204. 245
## 2 Designated_Hitter 195 221. 275
## 3 First_Baseman 180 213. 260
## 4 Outfielder 160 199. 257
## 5 Relief_Pitcher 150 204. 278
## 6 Second_Baseman 160 184. 210
## 7 Shortstop 150 183. 210
## 8 Starting_Pitcher 150 205. 290
## 9 Third_Baseman 155 201. 240
Two columns - * Team and Postion * contains categorical values that can be changed to factor
mydata$Team <- factor(mydata$Team)
mydata$Position <- factor(mydata$Position)
str(mydata$Team)
## Factor w/ 30 levels "ANA","ARZ","ATL",..: 4 4 4 4 4 4 4 4 4 4 ...
str(mydata$Position)
## Factor w/ 9 levels "Catcher","Designated_Hitter",..: 1 1 1 3 3 6 7 9 9 4 ...
Position haS 9 LEVELS. For better analysis, lets reduce the number of levels.
mydata <- mydata %>% plyr::mutate(new_post =
case_when(Position == "First_Baseman" ~ "Baseman",
Position == "Second_Baseman" ~ "Baseman",
Position == "Third_Baseman" ~ "Baseman",
Position == "Catcher" ~ "Catcher",
Position == "Designated_Hitter" ~ "Hitter",
Position == "Outfielder" ~ "Outfielder",
Position == "Relief_Pitcher" ~ "Pitcher",
Position == "Starting_Pitcher" ~ "Pitcher",
Position == "Shortstop" ~ "Shortstop",
))
dim(mydata)
## [1] 1034 7
head(mydata,4)
## Name Team Position Height Weight Age new_post
## 1 Adam_Donachie BAL Catcher 74 180 22.99 Catcher
## 2 Paul_Bako BAL Catcher 74 215 34.69 Catcher
## 3 Ramon_Hernandez BAL Catcher 72 210 30.78 Catcher
## 4 Kevin_Millar BAL First_Baseman 72 210 35.43 Baseman
table(mydata$Team,mydata$new_post)
##
## Baseman Catcher Hitter Outfielder Pitcher Shortstop
## ANA 7 3 0 8 15 2
## ARZ 5 2 0 5 14 2
## ATL 6 2 0 7 20 2
## BAL 5 3 1 7 18 1
## BOS 3 3 1 7 20 2
## CHC 5 3 0 8 19 1
## CIN 6 3 0 8 17 2
## CLE 5 2 1 8 18 1
## COL 4 3 1 7 18 2
## CWS 4 2 2 8 15 2
## DET 6 2 1 6 19 3
## FLA 4 2 0 7 17 2
## HOU 8 3 0 5 17 1
## KC 6 2 1 6 18 2
## LA 8 2 0 6 16 1
## MIN 7 3 2 5 15 1
## MLW 5 4 0 8 15 3
## NYM 5 2 0 7 22 2
## NYY 6 2 1 5 17 1
## OAK 6 3 1 5 19 3
## PHI 5 4 1 6 19 1
## PIT 5 2 0 7 20 1
## SD 3 2 1 7 18 2
## SEA 5 2 0 7 18 2
## SF 6 2 0 7 18 1
## STL 6 2 0 6 17 1
## TB 4 3 1 5 19 1
## TEX 3 4 1 6 19 2
## TOR 6 2 2 3 19 2
## WAS 4 2 0 7 20 3
Lets derive two tables from the above data set.
# Position stats
position_stats <- mydata %>% group_by(Position) %>%
summarise(mean_height = round(mean(Height),0),mean_age= round(mean(Age),0),mean_weight= round(mean(Weight),0),
no_of_players= n())
head(position_stats,3)
## # A tibble: 3 x 5
## Position mean_height mean_age mean_weight no_of_players
## <fct> <dbl> <dbl> <dbl> <int>
## 1 Catcher 73 30 204 76
## 2 Designated_Hitter 74 30 221 18
## 3 First_Baseman 74 29 213 55
sum(position_stats$no_of_players)
## [1] 1034
We will use ggplot to plot Height, weight of the player based on position and age.
#library(ggplot2)
ggplot2::ggplot(position_stats,aes(x=mean_weight, y= mean_height))+geom_point(aes(col=Position,size=mean_age))+
labs(title = "Player position vs stats",x = "Average weight",y="Average height" )+scale_color_manual(values=c("slategray", "midnightblue","green","yellow2","wheat4","Brown2","salmon3","deeppink","cyan3"))+theme(legend.key.size = unit(0.5,'cm'),legend.position = "left")+scale_size_continuous(range = c(4,10))+guides(color = guide_legend(override.aes = list(size = 4) ) )
**** Observation from graph: + Position based on average age: Least mean age: Shortstop,Starting_Pitcher Maximum mean age: Designated_Hitter
Position of tallest player: Starting_Pitcher
Position of shortest Player: Second_Baseman
******Designated hitter are mostly senior most player of the tem who don’t have to play defense. ******Shortstop is physically demanding position so the player with least average age makes sense. ******Starting pitcher have to pitch balls with precision and control. So, the average age of Starting pitcher is less
#library(dplyr)
names(mydata)
## [1] "Name" "Team" "Position" "Height" "Weight" "Age" "new_post"
# Team stats
Team_Stats <- mydata %>% select(Team,new_post)%>%
group_by(Team)%>%
count(new_post)
Team_Stats <- Team_Stats %>% distinct(Team,new_post,n,.keep_all=FALSE) #remove duplicate rows from the dataset
sapply(Team_Stats,typeof)
## Team new_post n
## "integer" "character" "integer"
head(Team_Stats,10)
## # A tibble: 10 x 3
## # Groups: Team [2]
## Team new_post n
## <fct> <chr> <int>
## 1 ANA Baseman 7
## 2 ANA Catcher 3
## 3 ANA Outfielder 8
## 4 ANA Pitcher 15
## 5 ANA Shortstop 2
## 6 ARZ Baseman 5
## 7 ARZ Catcher 2
## 8 ARZ Outfielder 5
## 9 ARZ Pitcher 14
## 10 ARZ Shortstop 2
sum(Team_Stats$n)
## [1] 1034
length(mydata$Position)
## [1] 1034
Lets observe the number of players in each position based on teams
#library(ggplot2)
Team_stats_plot <- ggplot2::ggplot(Team_Stats,aes(x=new_post,y=n,fill=Team))+geom_bar(position = "dodge",stat = "identity")+coord_flip()
Team_stats_plot
**Observation from the above plot: + All teams have more pitchers. As pitchers are required to rest for a week after a match. Team would need more pitchers to take turns during matches. + Outfielder and Baseman are similar + Teams have more catchers than Hitters and Shortstops.
##Naive Bayes and knn on the dataset
Let me try knn and Naive bayes algorithm on the dataset.
set.seed(147)
sample_set <- caTools::sample.split(mydata,SplitRatio = 0.8)
Train_data <- mydata[sample_set, ]
Test_data <- mydata[!sample_set, ]
#frequency of values
table(Test_data$new_post)
##
## Baseman Catcher Hitter Outfielder Pitcher Shortstop
## 49 19 6 55 153 13
TTable <- transform(table(Train_data$new_post))
TTable
## Var1 Freq
## 1 Baseman 109
## 2 Catcher 57
## 3 Hitter 12
## 4 Outfielder 139
## 5 Pitcher 383
## 6 Shortstop 39
#Proportion of train and test data
round(prop.table(table(Train_data$new_post))*100,1)
##
## Baseman Catcher Hitter Outfielder Pitcher Shortstop
## 14.7 7.7 1.6 18.8 51.8 5.3
round(prop.table(table(Test_data$new_post))*100,1)
##
## Baseman Catcher Hitter Outfielder Pitcher Shortstop
## 16.6 6.4 2.0 18.6 51.9 4.4
# Naive bayes algorithm
#laplace = 1
classifier <- e1071::naiveBayes(Train_data[, c("Weight","Height","Age")],as.factor(Train_data$new_post),laplace=1)
predict_vaue <- predict(classifier,Test_data[,c("Weight","Height","Age")])
gmodels::CrossTable(predict_vaue,(Test_data$new_post),prop.r = F,prop.c = F,prop.t = F,prop.chisq = F)
##
##
## Cell Contents
## |-------------------------|
## | N |
## |-------------------------|
##
##
## Total Observations in Table: 295
##
##
## | (Test_data$new_post)
## predict_vaue | Baseman | Catcher | Hitter | Outfielder | Pitcher | Shortstop | Row Total |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Baseman | 7 | 0 | 0 | 2 | 3 | 2 | 14 |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Outfielder | 4 | 2 | 0 | 6 | 4 | 6 | 22 |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Pitcher | 38 | 16 | 6 | 43 | 143 | 4 | 250 |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Shortstop | 0 | 1 | 0 | 4 | 3 | 1 | 9 |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Column Total | 49 | 19 | 6 | 55 | 153 | 13 | 295 |
## -------------|------------|------------|------------|------------|------------|------------|------------|
##
##
table(predict_vaue,Test_data$new_post)
##
## predict_vaue Baseman Catcher Hitter Outfielder Pitcher Shortstop
## Baseman 7 0 0 2 3 2
## Catcher 0 0 0 0 0 0
## Hitter 0 0 0 0 0 0
## Outfielder 4 2 0 6 4 6
## Pitcher 38 16 6 43 143 4
## Shortstop 0 1 0 4 3 1
sum(diag(table(predict_vaue,Test_data$new_post)))
## [1] 157
accuracy <- 157/295
round(accuracy*100,2)
## [1] 53.22
** knn algorithm
#library(e1071)
predict_knn <- class::knn(Train_data[,c("Weight","Height","Age")],Test_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),k=11)
gmodels::CrossTable(predict_knn,Test_data$new_post)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 295
##
##
## | Test_data$new_post
## predict_knn | Baseman | Catcher | Hitter | Outfielder | Pitcher | Shortstop | Row Total |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Baseman | 7 | 3 | 0 | 4 | 10 | 1 | 25 |
## | 1.953 | 1.200 | 0.508 | 0.094 | 0.679 | 0.009 | |
## | 0.280 | 0.120 | 0.000 | 0.160 | 0.400 | 0.040 | 0.085 |
## | 0.143 | 0.158 | 0.000 | 0.073 | 0.065 | 0.077 | |
## | 0.024 | 0.010 | 0.000 | 0.014 | 0.034 | 0.003 | |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Catcher | 0 | 1 | 0 | 2 | 4 | 1 | 8 |
## | 1.329 | 0.456 | 0.163 | 0.173 | 0.005 | 1.189 | |
## | 0.000 | 0.125 | 0.000 | 0.250 | 0.500 | 0.125 | 0.027 |
## | 0.000 | 0.053 | 0.000 | 0.036 | 0.026 | 0.077 | |
## | 0.000 | 0.003 | 0.000 | 0.007 | 0.014 | 0.003 | |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Outfielder | 11 | 1 | 0 | 7 | 19 | 4 | 42 |
## | 2.321 | 1.075 | 0.854 | 0.088 | 0.356 | 2.496 | |
## | 0.262 | 0.024 | 0.000 | 0.167 | 0.452 | 0.095 | 0.142 |
## | 0.224 | 0.053 | 0.000 | 0.127 | 0.124 | 0.308 | |
## | 0.037 | 0.003 | 0.000 | 0.024 | 0.064 | 0.014 | |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Pitcher | 31 | 14 | 6 | 42 | 120 | 7 | 220 |
## | 0.841 | 0.002 | 0.520 | 0.024 | 0.305 | 0.749 | |
## | 0.141 | 0.064 | 0.027 | 0.191 | 0.545 | 0.032 | 0.746 |
## | 0.633 | 0.737 | 1.000 | 0.764 | 0.784 | 0.538 | |
## | 0.105 | 0.047 | 0.020 | 0.142 | 0.407 | 0.024 | |
## -------------|------------|------------|------------|------------|------------|------------|------------|
## Column Total | 49 | 19 | 6 | 55 | 153 | 13 | 295 |
## | 0.166 | 0.064 | 0.020 | 0.186 | 0.519 | 0.044 | |
## -------------|------------|------------|------------|------------|------------|------------|------------|
##
##
sum(diag(table(predict_knn,Test_data$new_post)))
## [1] 135
accuracy <- sum(diag(table(predict_knn,Test_data$new_post)))/295
round(accuracy*100,2)
## [1] 45.76
#knn tunning to check the omtimum value of k
knn_classifier <- e1071::tune.knn(Train_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),knn=21:31)
knn_classifier
##
## Error estimation of 'knn.wrapper' using 10-fold cross validation: 0.6104406
#use k=21
predict_knn <- class::knn(Train_data[,c("Weight","Height","Age")],Test_data[,c("Weight","Height","Age")],as.factor(Train_data$new_post),k=21)
#gmodels::CrossTable(predict_knn,Test_data$new_post)
sum(diag(table(predict_knn,Test_data$new_post)))
## [1] 147
accuracy <- sum(diag(table(predict_knn,Test_data$new_post)))/295
round(accuracy*100,2)
## [1] 49.83