Use data set Cricket Batsmen.
1. Analysis of Data
2. Visualization of Data
3. Cluster Analysis of Cricket Data
The data are collected for all the teams on the performance of the players based on the batting records of the One-Day Internationals (ODIs) match from 1st Jan 2008 onwards until 1st Nov 2009. There are 435 players and 13 variables in the dataset. The descriptions of these variables is given below
Attributes:
Field Description
Mat - Matches played
Inns - Innings batted
Not Out - Not outs
Runs - Runs scored
HS - Highest inns scored
Ave - Batting average
BF - Balls faced
SR - Batting strike rate
100s - Scored of hundreds
50s - Scored of Fifties
DUCK - Ducks scored
Setup
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(vcd)
## Loading required package: grid
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(cluster)
Functions
detect_outliers <- function(inp, na.rm=TRUE) {
i.qnt <- quantile(inp, probs=c(.25, .75), na.rm=na.rm)
i.max <- 1.5 * IQR(inp, na.rm=na.rm)
otp <- inp
otp[inp < (i.qnt[1] - i.max)] <- NA
otp[inp > (i.qnt[2] + i.max)] <- NA
#inp <- count(inp[is.na(otp)])
sum(is.na(otp))
}
Non_outliers <- function(x, na.rm = TRUE, ...) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = na.rm, ...)
H <- 1.5 * IQR(x, na.rm = na.rm)
y <- x
y[x < (qnt[1] - H)] <- NA
y[x > (qnt[2] + H)] <- NA
y
}
Remove_Outliers <- function ( z, na.rm = TRUE){
Out <- Non_outliers(z)
Out <-as.data.frame (Out)
z <- Out$Out[match(z, Out$Out)]
z
}
Graph_Boxplot <- function (input, na.rm = TRUE){
Plot <- ggplot(dfr_Cricket, aes(x="", y=input)) +
geom_boxplot(aes(fill=input), color="green") +
labs(title="Outliers")
Plot
}
detect_na <- function(inp) {
sum(is.na(inp))
}
Dataset
setwd("E:/R 2")
dfr_Cricket <- read.csv("cluster _data.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfr_Cricket)
head(dfr_Cricket)
## Player_ID Mat Inns Not.Outs Runs HS Ave BF SR X100s X50s Ducks X4s
## 1 1 55 48 14 2116 124 62.23 2569 82 2 16 0 138
## 2 2 51 47 12 1788 85 51.08 2120 84 0 17 0 109
## 3 3 49 46 4 1772 150 42.19 1949 91 4 12 3 183
## 4 4 49 47 5 1653 138 39.35 1667 99 4 8 3 167
## 5 5 49 48 3 1643 128 36.51 2208 74 4 8 2 162
## 6 6 33 33 1 1499 125 46.84 1193 126 3 10 0 206
## X6s
## 1 27
## 2 20
## 3 11
## 4 52
## 5 3
## 6 36
Observation
1. There are total ‘intRowCount’ data records in the file.
Missing Data
#sum(is.na(dfrModel$Age))
lapply(dfr_Cricket, FUN=detect_na)
## $Player_ID
## [1] 0
##
## $Mat
## [1] 0
##
## $Inns
## [1] 0
##
## $Not.Outs
## [1] 0
##
## $Runs
## [1] 0
##
## $HS
## [1] 0
##
## $Ave
## [1] 0
##
## $BF
## [1] 0
##
## $SR
## [1] 0
##
## $X100s
## [1] 0
##
## $X50s
## [1] 0
##
## $Ducks
## [1] 0
##
## $X4s
## [1] 0
##
## $X6s
## [1] 0
Observations
There are no NA records in the data sets.
Summary
#summary(dfrModel)
str(dfr_Cricket)
## 'data.frame': 435 obs. of 14 variables:
## $ Player_ID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Mat : int 55 51 49 49 49 33 42 38 45 41 ...
## $ Inns : int 48 47 46 47 48 33 42 38 45 41 ...
## $ Not.Outs : int 14 12 4 5 3 1 1 2 0 6 ...
## $ Runs : int 2116 1788 1772 1653 1643 1499 1476 1411 1409 1339 ...
## $ HS : int 124 85 150 138 128 125 126 178 154 100 ...
## $ Ave : num 62.2 51.1 42.2 39.4 36.5 ...
## $ BF : int 2569 2120 1949 1667 2208 1193 1865 1637 1797 2004 ...
## $ SR : int 82 84 91 99 74 126 79 86 78 67 ...
## $ X100s : int 2 0 4 4 4 3 3 3 2 1 ...
## $ X50s : int 16 17 12 8 8 10 10 8 8 12 ...
## $ Ducks : int 0 0 3 3 2 0 1 1 6 2 ...
## $ X4s : int 138 109 183 167 162 206 143 156 160 109 ...
## $ X6s : int 27 20 11 52 3 36 13 22 16 2 ...
lapply(dfr_Cricket, FUN=describe)
## $Player_ID
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 218 125.72 218 218 161.6 1 435 434 0 -1.21
## se
## X1 6.03
##
## $Mat
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 13.2 12.2 9 11.34 8.9 1 55 54 1.17 0.54 0.59
##
## $Inns
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 10.54 10.58 7 8.65 7.41 1 48 47 1.51 1.79
## se
## X1 0.51
##
## $Not.Outs
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 1.85 2.47 1 1.36 1.48 0 14 14 1.83 3.72 0.12
##
## $Runs
## vars n mean sd median trimmed mad min max range skew
## X1 1 435 242.37 354.86 93 159.96 124.54 0 2116 2116 2.28
## kurtosis se
## X1 5.32 17.01
##
## $HS
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 48.19 40.47 39 43.68 41.51 0 194 194 0.89 0.13
## se
## X1 1.94
##
## $Ave
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 21.51 16.63 20 20.16 16.31 0 183 183 2.56 19.63
## se
## X1 0.8
##
## $BF
## vars n mean sd median trimmed mad min max range skew
## X1 1 435 310.33 425.61 139 216.69 177.91 1 2569 2568 2.19
## kurtosis se
## X1 5.07 20.41
##
## $SR
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 68.4 29.81 70 68.98 22.24 0 329 329 1.42 13.67
## se
## X1 1.43
##
## $X100s
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 0.25 0.73 0 0.05 0 0 4 4 3.43 12.15 0.04
##
## $X50s
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 1.29 2.44 0 0.68 0 0 17 17 2.75 9.26 0.12
##
## $Ducks
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 0.94 1.24 1 0.7 1.48 0 6 6 1.59 2.43 0.06
##
## $X4s
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 435 21.95 33.75 8 14.03 11.86 0 206 206 2.48 6.59
## se
## X1 1.62
##
## $X6s
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 435 3.57 6.78 1 1.99 1.48 0 53 53 3.67 17.73 0.33
Box Plot
lapply(dfr_Cricket, FUN=Graph_Boxplot)
## $Player_ID
##
## $Mat
##
## $Inns
##
## $Not.Outs
##
## $Runs
##
## $HS
##
## $Ave
##
## $BF
##
## $SR
##
## $X100s
##
## $X50s
##
## $Ducks
##
## $X4s
##
## $X6s
Observation
Here we can see that there are some outliers in the data but they may be useful while making the cluster.
To take the better decisions we are taking the players who played more than 7 Matches which is 40 percentile of the given data of matches.
Subset of Data
dfr_Cricket1 <- filter(dfr_Cricket, Mat>=7 )
No of Outliers in Each Variable
lapply(dfr_Cricket1, FUN=detect_outliers)
## $Player_ID
## [1] 0
##
## $Mat
## [1] 1
##
## $Inns
## [1] 8
##
## $Not.Outs
## [1] 12
##
## $Runs
## [1] 19
##
## $HS
## [1] 1
##
## $Ave
## [1] 1
##
## $BF
## [1] 13
##
## $SR
## [1] 17
##
## $X100s
## [1] 57
##
## $X50s
## [1] 17
##
## $Ducks
## [1] 3
##
## $X4s
## [1] 23
##
## $X6s
## [1] 18
Outliers Obsevations
1. We can see that there are outliers in the data, still we will go with outliers.
As all the variables are having different data ranges so we are going with Data Normalization so it will be easy to find out the Euclidian Distance which is very useful while doing the cluster analysis.
Data Cleaning (New Columns addition)
##Runs without Boundaries (Without 4's & 6's)
dfr_Cricket1$Runs_WO_Boundaries <- dfr_Cricket1$Runs-dfr_Cricket1$X4s*4-dfr_Cricket1$X6s*6
##No of matches player did not play
dfr_Cricket1$Mat_Not_Bat <- dfr_Cricket1$Mat-dfr_Cricket1$Inns
#No of Runs with boundries
dfr_Cricket1$Runs_boundries <- dfr_Cricket1$X4s*4+dfr_Cricket1$X6s*6
##No of balls faced per innings
dfr_Cricket1$Ball_Per_Inng <- dfr_Cricket1$BF/dfr_Cricket1$Inns
Observations 1. Four columns have been added to improve the data analysis.
2. One column is related to the runs without boundaries so from total runs 4’s and 6’s runs have been deducted.
3. Second column is added to know the difference between the total match playes & Total innings played to know the actual stats of each player. 4. Third Column is related to no of runs from boundries for each batsmen
5. Fourth column is related to average no of ball faced by each players per innings
Data Normalization
dfr_Cricket2 <- select(dfr_Cricket1, -c(Player_ID))
m <- apply(dfr_Cricket2, 2, mean)
s <- apply(dfr_Cricket2, 2, sd)
dfr_Cricket2 <- scale(dfr_Cricket2,m,s)
class(dfr_Cricket2)
## [1] "matrix"
dfr_Cricket2 <- as.data.frame(dfr_Cricket2)
Observations
1. As all the data values are different in different variables so for Cluster analysis we need to normalize the data
2. Data is normalized successfully to get the better clusters through Euclidien distance.
3. PLayer ID is removed while forming the clusters.
Calculating the Euclidean Distance
distance <- dist(dfr_Cricket2)
Observation
No errors. Distance is calucalated successfully.
hc.c <- hclust(distance)
hc.c
##
## Call:
## hclust(d = distance)
##
## Cluster method : complete
## Distance : euclidean
## Number of objects: 263
#plot(hc.c, labels=dfr_Cricket$Player_ID)
#plot(hc.c, hang=-1)
hc.d <- hclust(distance, method="ward.D")
hc.d
##
## Call:
## hclust(d = distance, method = "ward.D")
##
## Cluster method : ward.D
## Distance : euclidean
## Number of objects: 263
Cluster Membership With 10 cluster
member.c <- cutree(hc.c, 10)
member.d <- cutree(hc.d, 10)
Cluster Membership Comparison
table(member.c, member.d)
## member.d
## member.c 1 2 3 4 5 6 7 8 9 10
## 1 2 0 0 0 0 0 0 0 0 0
## 2 11 3 0 0 0 0 0 0 0 0
## 3 4 0 0 0 0 0 0 0 0 0
## 4 6 3 0 0 0 0 0 0 0 0
## 5 0 0 16 0 0 2 0 0 0 0
## 6 0 11 2 8 0 0 0 0 0 0
## 7 0 0 0 32 23 0 0 0 3 0
## 8 0 0 2 6 12 8 30 24 10 8
## 9 0 0 0 0 0 17 0 0 0 0
## 10 0 0 0 0 0 0 4 1 0 15
Observations Complete Linkage is putting most of the players in few Cluster only.
We can see that Complete Linkage Method is not giving better results compare to Wards method.
wss <- (nrow(dfr_Cricket2)-1)*sum(apply(dfr_Cricket2,2,var))
for( i in 2:20) wss[i] <- sum(kmeans(dfr_Cricket2, centers=i)$withinss)
plot(1:20, wss, type="b", Xlab="Number of Cluster", ylab = "Within group SS")
## Warning in plot.window(...): "Xlab" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "Xlab" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "Xlab" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "Xlab" is not
## a graphical parameter
## Warning in box(...): "Xlab" is not a graphical parameter
## Warning in title(...): "Xlab" is not a graphical parameter
Observations
We can see that At starting withon group Sum of square changes is very high, after Cluster 4 it is going near to flat & there are no large changes.
So here 4 Cluster would be good to take.
Cluster Membership With 4 cluster
member.c <- cutree(hc.c, 4)
member.d <- cutree(hc.d, 4)
Cluster Membership Comparison
table(member.c, member.d)
## member.d
## member.c 1 2 3 4
## 1 2 0 0 0
## 2 27 0 0 0
## 3 31 81 10 75
## 4 0 0 17 20
Observations
Complete Linkage is putting most of the players in few Cluster only.
We can see that Complete Linkage Method is not giving better results compare to Wards method.
Cluster Means
aggregate(dfr_Cricket2, list(member.c), mean)
## Group.1 Mat Inns Not.Outs Runs HS
## 1 1 2.8733333 2.9506566 3.7499898 3.8978162 0.99860182
## 2 2 1.5897564 1.9210407 0.2123993 2.1757460 1.54703113
## 3 3 -0.2954565 -0.2398586 -0.1776502 -0.2118991 -0.03352704
## 4 4 0.2576984 -0.2842503 0.5881710 -0.6701800 -1.00438425
## Ave BF SR X100s X50s Ducks
## 1 2.39864037 3.8914022 0.46324392 0.6661634 5.0002133 -1.0031611
## 2 1.11680317 2.0878132 0.61204175 2.1857219 1.9068678 0.8273187
## 3 0.03770842 -0.1966167 0.09984452 -0.2231817 -0.1882625 -0.2357697
## 4 -1.14539259 -0.6870345 -1.00326987 -0.4427036 -0.6594094 0.7058203
## X4s X6s Runs_WO_Boundaries Mat_Not_Bat Runs_boundries
## 1 2.3014770 2.2205292 4.9035856 0.2934864 2.4223161
## 2 2.2696890 1.6233298 1.9770859 -0.5012767 2.2462085
## 3 -0.2074103 -0.1631460 -0.2008568 -0.1763626 -0.2089916
## 4 -0.6763441 -0.4359782 -0.6383701 1.2889439 -0.6573220
## Ball_Per_Inng
## 1 1.67085410
## 2 1.09526583
## 3 0.05108305
## 4 -1.16154721
aggregate(dfr_Cricket2, list(member.d), mean)
## Group.1 Mat Inns Not.Outs Runs HS Ave
## 1 1 1.222814 1.4438765 0.5167966 1.5460004 1.1045718 0.9906509
## 2 2 -0.542051 -0.2974749 -0.5828730 -0.1361285 0.4642004 0.4251551
## 3 3 1.118042 0.2640295 1.2407687 -0.4449227 -0.5472973 -0.6715155
## 4 4 -0.627893 -0.7333255 -0.1820615 -0.7339022 -0.9378685 -0.7973231
## BF SR X100s X50s Ducks X4s
## 1 1.5074291 0.5382703 1.1097102 1.38042269 0.2202993 1.48205972
## 2 -0.0920539 0.1058901 -0.1552195 -0.02727367 -0.2225153 -0.09716841
## 3 -0.4589594 0.2463218 -0.4427036 -0.56524680 0.7196435 -0.47735852
## 4 -0.7431313 -0.5002527 -0.4427036 -0.68794243 -0.1539431 -0.71751855
## X6s Runs_WO_Boundaries Mat_Not_Bat Runs_boundries Ball_Per_Inng
## 1 1.1476297 1.4960380 -0.3079239 1.4887922 0.8895737
## 2 -0.1598784 -0.1420753 -0.6515304 -0.1189210 0.6462206
## 3 -0.1797088 -0.4277866 2.1479334 -0.4317024 -0.9058804
## 4 -0.5374262 -0.7021467 0.1395283 -0.7161996 -0.8553634
Observation
Through Cluster means we get to know which variables are important while forming the clusters.
Variables which are having high mean difference between the the clusters are good variables to formulate the clusters.
Like here we can see that for Complete Linkage below are the Important Variables:
Matches, Innings, Runs, HS, BF, SR, 50s, Runs_WO_Boundaries, Balls Per Innings
For Ward’s Method below are important variables:
Inns, Runs, HS, Ave, BF, SR, X4s, Runs_WO_Boundaries, Runs_boundries
Dendogram
plot(hc.d) # display dendogram
groups <- cutree(hc.d, k=4) # cut tree into 5 clusters
# draw dendogram with red borders around the 5 clusters
rect.hclust(hc.d, k=4, border="red")
m.a <- kmeans(dfr_Cricket2,4)
m.a
## K-means clustering with 4 clusters of sizes 90, 40, 37, 96
##
## Cluster means:
## Mat Inns Not.Outs Runs HS Ave
## 1 -0.3612805 -0.1216897 -0.4251897 0.02899844 0.5728416 0.5524428
## 2 1.4740740 1.7625589 0.4088179 1.99098359 1.3437094 1.1143769
## 3 1.0538630 0.3981031 1.4486660 -0.32177128 -0.5130303 -0.5111331
## 4 -0.6816734 -0.7737511 -0.3300655 -0.73274652 -0.8991875 -0.7852396
## BF SR X100s X50s Ducks X4s
## 1 0.0738511 0.1561467 -0.08540197 0.1161858 -0.1875208 0.04235061
## 2 1.9292563 0.5661027 1.66414374 1.8357785 0.3959446 1.96649372
## 3 -0.3394828 0.2146823 -0.44270355 -0.5005100 0.5879595 -0.38275125
## 4 -0.7422499 -0.4650058 -0.44270355 -0.6809270 -0.2157856 -0.71155737
## X6s Runs_WO_Boundaries Mat_Not_Bat Runs_boundries Ball_Per_Inng
## 1 -0.03509959 0.03016776 -0.61040832 0.02544754 0.7232138
## 2 1.48226722 1.88589915 -0.42180038 1.96523334 1.0553716
## 3 -0.13209114 -0.28309975 1.68135614 -0.34312180 -0.8105383
## 4 -0.53379535 -0.70496222 0.09998528 -0.71045943 -0.8053561
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 1 2 2 1 2 3 1 1 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 3 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 3 3 1 3 3 1 1 3 1 1 1 1 3 3 1 1 1 1 1 1 1 1 3 3
## [106] 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 4 1 3 1 1 1 1 3 1 3 1 1 4 4 4 1 4 1 1 3
## [141] 1 4 4 1 1 4 4 1 1 1 4 1 3 1 1 3 1 3 1 3 4 3 3 1 3 4 4 1 4 4 4 3 4 3 3
## [176] 4 4 4 4 4 4 4 4 4 4 4 4 4 3 4 4 4 4 4 4 4 4 4 4 4 3 4 3 4 4 4 4 4 4 4
## [211] 4 4 3 4 4 3 4 4 4 4 4 4 3 3 4 4 3 3 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
## [246] 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4
##
## Within cluster sum of squares by cluster:
## [1] 497.2208 592.8048 266.7862 416.0508
## (between_SS / total_SS = 60.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Observations
Cluster Membership Comparison
table(member.d, m.a$cluster)
##
## member.d 1 2 3 4
## 1 14 40 6 0
## 2 75 0 0 6
## 3 0 0 25 2
## 4 1 0 6 88
Cluster Plots
dfr_Cricket2$Player_ID <- dfr_Cricket1$Player_ID
plot(dfr_Cricket2[c("Player_ID", "Mat","Inns","Runs")], main="K Means Clustering", col=member.d)
plot(dfr_Cricket2[c("Player_ID", "BF","HS","Ave")], main="K Means Clustering", col=member.d)
plot(dfr_Cricket2[c("Player_ID", "X50s","X4s","Runs_WO_Boundaries")], main="K Means Clustering", col=member.d)
plot(dfr_Cricket2[c("Player_ID", "Runs_boundries","Mat_Not_Bat")], main="K Means Clustering", col=member.d)
##K Means Clustering Plots
dfr_Cricket2$Player_ID <- dfr_Cricket1$Player_ID
plot(dfr_Cricket2[c("Player_ID", "Mat","Inns","Runs")], main="K Means Clustering", col=m.a$cluster)
plot(dfr_Cricket2[c("Player_ID", "BF","HS","Ave")], main="K Means Clustering", col=m.a$cluster)
plot(dfr_Cricket2[c("Player_ID", "X50s","X4s","Runs_WO_Boundaries")], main="K Means Clustering", col=m.a$cluster)
plot(dfr_Cricket2[c("Player_ID", "Runs_boundries","Mat_Not_Bat")], main="K Means Clustering", col=m.a$cluster)
Write in CSV Files with Clustering Membership
dfr_Cricket1$ward_Membership <- member.d
dfr_Cricket1$Kmeans_Membership <- m.a$cluster
write.csv(dfr_Cricket1, "Cricket_Data_Final.csv")
m1 <- as.data.frame(m)
s1 <- as.data.frame(s)
write.csv(m.a$centers, "K_Means_check.csv")
write.csv(m1, "mean.csv")
write.csv(s1, "stdev.csv")
a <- aggregate(dfr_Cricket1, list(member.d), mean)
write.csv(a, "Wards_check.csv")
Four columns have been added to improve the data analysis.
Runs Without Boundaries
Runs With Boundaries
Difference between Matches & Innings
Balls faced per innings
Important variables which are contributed more to form the K means clusters are:
Matches, Innings, Runs, HS, Average, BF, 50s, X4s, Runs_WO_Boundaries, Runs_boundries
###########End of the Project#########