library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## combine, src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
### User-defined function 1: addproduct - sums the product of vector elements from two identical vectors
addproduct <- function(vec1,vec2){
sum <- 0
if (is.numeric(vec1)) {
if (is.numeric(vec2)) {
sum <- 0
for (i in 1:length(vec1)) {
sum <- sum + vec1[i] * vec2[i]
}
}
}
return(sum)
}
#Batting Analysis
# Read IPL batting data into R and print out the variable names
##reading
batting.df <- read.csv("D:/PGPBABI/Advance stats/practise assignment-IPL/batting_bowling_ipl_bat.csv")
print(head(batting.df))
## Name Runs Ave SR Fours Sixes HF
## 1 CH Gayle 733 61.08 160.74 46 59 9
## 2 G Gambhir 590 36.87 143.55 64 17 6
## 3 V Sehwag 495 33.00 161.23 57 19 5
## 4 CL White 479 43.54 149.68 41 20 5
## 5 S Dhawan 569 40.64 129.61 58 18 5
## 6 AM Rahane 560 40.00 129.33 73 10 5
print(names(batting.df))
## [1] "Name" "Runs" "Ave" "SR" "Fours" "Sixes" "HF"
# Look at the summary statistics esp. variance
bat_pca_data <- batting.df[,2:7]
summary(bat_pca_data)
## Runs Ave SR Fours
## Min. : 2.0 Min. : 0.50 Min. : 18.18 Min. : 0.00
## 1st Qu.: 98.0 1st Qu.:14.66 1st Qu.:108.75 1st Qu.: 6.25
## Median :196.5 Median :24.44 Median :120.14 Median :16.00
## Mean :219.9 Mean :24.73 Mean :119.16 Mean :19.79
## 3rd Qu.:330.8 3rd Qu.:32.20 3rd Qu.:132.00 3rd Qu.:28.00
## Max. :733.0 Max. :81.33 Max. :164.10 Max. :73.00
## Sixes HF
## Min. : 0.000 Min. :0.000
## 1st Qu.: 3.000 1st Qu.:0.000
## Median : 6.000 Median :0.500
## Mean : 7.578 Mean :1.189
## 3rd Qu.:10.000 3rd Qu.:2.000
## Max. :59.000 Max. :9.000
sd.batting<-apply(bat_pca_data,2,sd)
print(sd.batting)
## Runs Ave SR Fours Sixes HF
## 156.253669 13.619215 23.656547 16.399845 8.001373 1.688656
#Analyse Variance
corl_matrix = cor(bat_pca_data, method = "pearson", use = "all")
print(corl_matrix)
## Runs Ave SR Fours Sixes HF
## Runs 1.0000000 0.6929845 0.4934887 0.9188086 0.7697776 0.8351477
## Ave 0.6929845 1.0000000 0.6236059 0.5462114 0.6824143 0.6207537
## SR 0.4934887 0.6236059 1.0000000 0.3848104 0.5839428 0.4275835
## Fours 0.9188086 0.5462114 0.3848104 1.0000000 0.5225736 0.7836888
## Sixes 0.7697776 0.6824143 0.5839428 0.5225736 1.0000000 0.7676964
## HF 0.8351477 0.6207537 0.4275835 0.7836888 0.7676964 1.0000000
eig_calc = eigen(corl_matrix)
print(eig_calc)
## $values
## [1] 4.25471977 0.82707395 0.41202798 0.32546749 0.16383742 0.01687338
##
## $vectors
## [,1] [,2] [,3] [,4] [,5]
## [1,] -0.4582608 0.2664321 0.109779419 0.005201415 -0.45840889
## [2,] -0.3979731 -0.3311176 -0.005504861 -0.847363074 0.10122837
## [3,] -0.3253838 -0.6978033 0.450134482 0.432750288 0.11890348
## [4,] -0.4057417 0.4735580 0.508235378 0.032523046 -0.09676885
## [5,] -0.4173346 -0.1790246 -0.669425885 0.248781566 -0.39458014
## [6,] -0.4323718 0.2759323 -0.280825406 0.178117767 0.77486668
## [,6]
## [1,] 0.70483594
## [2,] -0.06063730
## [3,] 0.05624934
## [4,] -0.58514214
## [5,] -0.35786211
## [6,] 0.16096217
# . An Eigen value is a number, telling us how much variance there is in the data in the direction of the Eigen vector. When Eigen value is 0, there is no variation at all.
# . Based on the PCA performed, we observe that the Eigen value for PC1 greater than 1 and since other PCs are less than 1, they contain very less information and we shall ignore them. Hence, we will be considering only PC1 for further calculation.
# ** Perform a PCA of the data using the correlation matrix.**
# ** View the components of the PCA object and obtain the definitions of the Prinicipal Components (PCs) and the corresponding proportions of variation explained by each.**
# Perform Principal Component Analysis
pr_comp = prcomp(bat_pca_data, scale. = TRUE)
names(pr_comp)
## [1] "sdev" "rotation" "center" "scale" "x"
summary(pr_comp)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 2.0627 0.9094 0.64189 0.57050 0.40477 0.12990
## Proportion of Variance 0.7091 0.1378 0.06867 0.05424 0.02731 0.00281
## Cumulative Proportion 0.7091 0.8470 0.91564 0.96988 0.99719 1.00000
names(summary(pr_comp))
## [1] "sdev" "rotation" "center" "scale" "x"
## [6] "importance"
pr_comp$center
## Runs Ave SR Fours Sixes HF
## 219.933333 24.729889 119.164111 19.788889 7.577778 1.188889
pr_comp$scale
## Runs Ave SR Fours Sixes HF
## 156.253669 13.619215 23.656547 16.399845 8.001373 1.688656
pr_comp$rotation
## PC1 PC2 PC3 PC4 PC5
## Runs -0.4582608 0.2664321 -0.109779419 -0.005201415 -0.45840889
## Ave -0.3979731 -0.3311176 0.005504861 0.847363074 0.10122837
## SR -0.3253838 -0.6978033 -0.450134482 -0.432750288 0.11890348
## Fours -0.4057417 0.4735580 -0.508235378 -0.032523046 -0.09676885
## Sixes -0.4173346 -0.1790246 0.669425885 -0.248781566 -0.39458014
## HF -0.4323718 0.2759323 0.280825406 -0.178117767 0.77486668
## PC6
## Runs -0.70483594
## Ave 0.06063730
## SR -0.05624934
## Fours 0.58514214
## Sixes 0.35786211
## HF -0.16096217
dim(pr_comp$x)
## [1] 90 6
biplot(pr_comp,scale = 0)

#Produce a scree plot
### screeplot
prop_var <- summary(pr_comp)$importance[2,]
plot(prop_var, main = "Scree plot", xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = "b")

#cumulative scree plot
plot(cumsum(prop_var), main = "Scree plot", xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", type = "b")

#Observations
#. 70.91 % of the total variation is explained by first Principal Component and confirmed by screeplot.
#. In the scree plot, the last big drop occurs between the first and second components and we choose the first component.
#Ranking Batsmen
names(batting.df)
## [1] "Name" "Runs" "Ave" "SR" "Fours" "Sixes" "HF"
bat_pca_scaled <- scale(bat_pca_data)
wt <- pr_comp$rotation[,"PC1"]
print(head(bat_pca_scaled))
## Runs Ave SR Fours Sixes HF
## [1,] 3.283550 2.6690314 1.7574792 1.598254 6.4266747 4.625637
## [2,] 2.368371 0.8913958 1.0308305 2.695825 1.1775757 2.849076
## [3,] 1.760385 0.6072385 1.7781923 2.268992 1.4275327 2.256890
## [4,] 1.657988 1.3811451 1.2899554 1.293373 1.5525113 2.256890
## [5,] 2.233974 1.1682106 0.4415644 2.329968 1.3025542 2.256890
## [6,] 2.176376 1.1212182 0.4297284 3.244611 0.3027258 2.256890
print(head(wt))
## Runs Ave SR Fours Sixes HF
## -0.4582608 -0.3979731 -0.3253838 -0.4057417 -0.4173346 -0.4323718
batting.df["bat_score"] <- apply(bat_pca_scaled[,1:6], 1, function(x) addproduct(x,wt))
batting_sorted <- batting.df[order(batting.df["bat_score"]),]
l = length(batting.df[,2])
print(l)
## [1] 90
batting_sorted["Rank"] <- seq(1,l,1)
print(head(batting_sorted["Rank"] ))
## Rank
## 1 1
## 2 2
## 3 3
## 5 4
## 6 5
## 4 6
print(batting_sorted[,c("Name","bat_score","Rank")])
## Name bat_score Rank
## 1 CH Gayle -8.46932649 1
## 2 G Gambhir -4.59261046 2
## 3 V Sehwag -4.11917372 3
## 5 S Dhawan -4.09711843 4
## 6 AM Rahane -4.00201596 5
## 4 CL White -3.87768716 6
## 8 RG Sharma -2.90275313 7
## 7 KP Pietersen -2.86324015 8
## 9 AB de Villiers -2.31442255 9
## 13 F du Plessis -2.11332721 10
## 11 DA Warner -2.07167458 11
## 10 JP Duminy -2.07004543 12
## 14 OA Shah -1.92807089 13
## 17 SK Raina -1.85947782 14
## 20 R Dravid -1.82430203 15
## 16 DJ Hussey -1.80700836 16
## 19 Mandeep Singh -1.78578045 17
## 12 SR Watson -1.76704360 18
## 15 DJ Bravo -1.71801144 19
## 18 AT Rayudu -1.44124294 20
## 25 RV Uthappa -1.37862275 21
## 22 M Vijay -1.27397565 22
## 23 SPD Smith -1.22983470 23
## 26 SE Marsh -1.17451659 24
## 33 JH Kallis -1.09031082 25
## 28 DMD Jayawardene -1.03774815 26
## 24 TM Dilshan -1.02798828 27
## 29 V Kohli -1.02343478 28
## 31 SR Tendulkar -0.87507320 29
## 34 MS Dhoni -0.85533911 30
## 21 DR Smith -0.68139774 31
## 27 KA Pollard -0.67992463 32
## 32 MEK Hussey -0.66591890 33
## 30 MA Agarwal -0.52863856 34
## 36 JD Ryder -0.46216311 35
## 35 MS Bisla -0.32212998 36
## 37 BJ Hodge -0.25773323 37
## 38 NV Ojha -0.24859667 38
## 41 BB McCullum -0.08781663 39
## 40 AC Gilchrist 0.04780304 40
## 39 DB Das 0.20003908 41
## 47 MK Tiwary 0.28787513 42
## 43 Azhar Mahmood 0.35636859 43
## 42 IK Pathan 0.36460087 44
## 45 S Badrinath 0.38418403 45
## 53 JEC Franklin 0.53957408 46
## 49 LRPL Taylor 0.53996515 47
## 44 MK Pandey 0.55793627 48
## 61 SC Ganguly 0.57702761 49
## 58 KD Karthik 0.57986223 50
## 54 KC Sangakkara 0.59893350 51
## 52 RA Jadeja 0.64001153 52
## 59 AL Menaria 0.70466589 53
## 51 DT Christian 0.72224185 54
## 57 SS Tiwary 0.75301754 55
## 50 M Manhas 0.79201547 56
## 55 Y Nagar 0.79276862 57
## 46 DA Miller 0.79991992 58
## 60 PA Patel 0.81598820 59
## 48 JA Morkel 0.83471086 60
## 62 YK Pathan 0.86869533 61
## 56 STR Binny 1.04786689 62
## 63 Harbhajan Singh 1.16884571 63
## 66 Y Venugopal Rao 1.26630765 64
## 64 RE Levi 1.27255831 65
## 67 AD Mathews 1.34367589 66
## 70 N Saini 1.35867376 67
## 68 PP Chawla 1.40985740 68
## 65 LR Shukla 1.44128536 69
## 69 Shakib Al Hasan 1.48975473 70
## 71 MN Samuels 1.49420087 71
## 72 MJ Clarke 1.70055028 72
## 74 R Vinay Kumar 1.91210895 73
## 73 R Bhatia 1.93429975 74
## 76 J Botha 1.97377764 75
## 79 SP Goswami 2.02276671 76
## 75 P Kumar 2.04633551 77
## 77 A Ashish Reddy 2.05133627 78
## 80 SL Malinga 2.08423163 79
## 78 DL Vettori 2.11223293 80
## 81 RJ Peterson 2.19715970 81
## 83 B Kumar 2.21469825 82
## 82 R Ashwin 2.26780435 83
## 84 DW Steyn 2.73265219 84
## 85 A Mishra 2.86816861 85
## 87 WD Parnell 2.98403383 86
## 86 Z Khan 2.98988876 87
## 88 PC Valthaty 3.05385090 88
## 89 RP Singh 3.40295794 89
## 90 R Sharma 3.92541071 90
#Bowling Analysis
#Read IPL bowling data into R and print out the variable names
bowling.df <- read.csv("D:/PGPBABI/Advance stats/practise assignment-IPL/batting_bowling_ipl_bowl.csv")
print(head(bowling.df))
## Name Wkts Ave Econ SR
## 1 R Ashwin 14 30.78 6.54 28.2
## 2 P Kumar 9 48.22 6.88 42.0
## 3 M Morkel 25 18.12 7.19 15.1
## 4 UT Yadav 19 23.84 7.42 19.2
## 5 Z Khan 17 26.64 7.55 21.1
## 6 IK Pathan 8 58.12 7.75 45.0
bowling_pca_data <- bowling.df[,2:5]
summary(bowling_pca_data)
## Wkts Ave Econ SR
## Min. : 1.00 Min. : 12.20 Min. : 5.400 Min. :12.00
## 1st Qu.: 5.00 1st Qu.: 22.32 1st Qu.: 6.950 1st Qu.:17.25
## Median : 8.00 Median : 29.00 Median : 7.530 Median :21.60
## Mean : 8.88 Mean : 34.51 Mean : 7.656 Mean :26.33
## 3rd Qu.:12.50 3rd Qu.: 36.44 3rd Qu.: 8.280 3rd Qu.:28.90
## Max. :25.00 Max. :161.00 Max. :11.650 Max. :96.00
sd.bowling<-apply(bowling_pca_data,2,sd)
print(sd.bowling)
## Wkts Ave Econ SR
## 5.491451 23.677252 1.104584 14.422891
corl_matrix = cor(bowling_pca_data, method = "pearson", use = "all")
print(corl_matrix)
## Wkts Ave Econ SR
## Wkts 1.0000000 -0.4905337 -0.2924540 -0.5123438
## Ave -0.4905337 1.0000000 0.5226172 0.9630984
## Econ -0.2924540 0.5226172 1.0000000 0.3277374
## SR -0.5123438 0.9630984 0.3277374 1.0000000
eig_calc = eigen(corl_matrix)
print(eig_calc)
## $values
## [1] 2.61606918 0.75160217 0.62018101 0.01214765
##
## $vectors
## [,1] [,2] [,3] [,4]
## [1,] 0.4282076 -0.33487615 0.8384720 0.03822333
## [2,] -0.5911683 0.04764188 0.3539052 -0.72318835
## [3,] -0.3834154 -0.89162604 -0.1681540 0.17239454
## [4,] -0.5658188 0.30098375 0.3787349 0.66769582
# Observation
# . An Eigen value is a number, telling us how much variance there is in the data in the direction of the Eigen vector. When Eigen value is 0, there is no variation at all.
# . Based on the PCA performed, we observe that the Eigen value for PC1 greater than 1 and since other PCs are less than 1, they contain very less information and we shall ignore them. Hence, we will be considering only PC1 for further calculation.
# ** Perform a PCA of the data using the correlation matrix.**
# In prcomp(),calculation is done by a singular value decomposition of the centered and possibly scaled data matrix. The print method for these objects prints the results in a nice format and the plot method produces a scree plot.
# View the components of the PCA object and obtain the definitions of the Prinicipal Components (PCs) and the corresponding proportions of variation explained by each.
# Perform Principal Component Analysis
pr_comp = prcomp(bowling_pca_data, scale. = TRUE)
names(pr_comp)
## [1] "sdev" "rotation" "center" "scale" "x"
summary(pr_comp)
## Importance of components:
## PC1 PC2 PC3 PC4
## Standard deviation 1.617 0.8669 0.7875 0.11022
## Proportion of Variance 0.654 0.1879 0.1550 0.00304
## Cumulative Proportion 0.654 0.8419 0.9970 1.00000
names(summary(pr_comp))
## [1] "sdev" "rotation" "center" "scale" "x"
## [6] "importance"
pr_comp$center
## Wkts Ave Econ SR
## 8.879518 34.505663 7.655542 26.332530
pr_comp$scale
## Wkts Ave Econ SR
## 5.491451 23.677252 1.104584 14.422891
pr_comp$rotation
## PC1 PC2 PC3 PC4
## Wkts -0.4282076 0.33487615 -0.8384720 -0.03822333
## Ave 0.5911683 -0.04764188 -0.3539052 0.72318835
## Econ 0.3834154 0.89162604 0.1681540 -0.17239454
## SR 0.5658188 -0.30098375 -0.3787349 -0.66769582
dim(pr_comp$x)
## [1] 83 4
biplot(pr_comp,scale = 0)

#Produce a scree plot
### screeplot
prop_var <- summary(pr_comp)$importance[2,]
plot(prop_var, main = "Scree plot", xlab = "Principal Component", ylab = "Proportion of Variance Explained", type = "b")

#cumulative scree plot
plot(cumsum(prop_var), main = "Scree plot", xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", type = "b")

# Observations
# . 65.4% of the total variation is explained by first Principal Component and confirmed by screeplot.
# . In the scree plot, the last big drop occurs between the first and second components and we choose the first component.
# Ranking Bowlers
names(bowling.df)
## [1] "Name" "Wkts" "Ave" "Econ" "SR"
bowling_pca_scaled <- scale(bowling_pca_data)
wt <- pr_comp$rotation[,"PC1"]
print(head(bowling_pca_scaled))
## Wkts Ave Econ SR
## [1,] 0.93244600 -0.1573520 -1.00992076 0.1294796
## [2,] 0.02193991 0.5792200 -0.70211253 1.0862919
## [3,] 2.93555941 -0.6920424 -0.42146385 -0.7787988
## [4,] 1.84295210 -0.4504603 -0.21324064 -0.4945285
## [5,] 1.47874966 -0.3322034 -0.09554926 -0.3627934
## [6,] -0.16016131 0.9973428 0.08551441 1.2942946
print(head(wt))
## Wkts Ave Econ SR
## -0.4282076 0.5911683 0.3834154 0.5658188
bowling.df["bowling_score"] <- apply(bowling_pca_scaled[,1:4], 1, function(x) addproduct(x,wt))
bowling_sorted <- bowling.df[order(-bowling.df["bowling_score"]),]
l = length(bowling.df[,2])
print(l)
## [1] 83
bowling_sorted["Rank"] <- seq(1,l,1)
print(head(bowling_sorted["Rank"] ))
## Rank
## 66 1
## 76 2
## 74 3
## 52 4
## 64 5
## 34 6
print(bowling_sorted[,c("Name","bowling_score","Rank")])
## Name bowling_score Rank
## 66 AD Russell 7.340420235 1
## 76 TP Sudhindra 6.248139606 2
## 74 JP Duminy 4.489764215 3
## 52 MS Gony 2.504518661 4
## 64 BA Bhatt 2.484470993 5
## 34 M Kartik 2.169085675 6
## 58 YK Pathan 2.017875431 7
## 12 Harbhajan Singh 1.856991489 8
## 80 SC Ganguly 1.644997038 9
## 71 SK Raina 1.546377295 10
## 6 IK Pathan 1.423303536 11
## 68 AB Agarkar 1.385650504 12
## 79 M de Lange 1.295630915 13
## 73 DR Smith 1.234423993 14
## 75 Anand Rajan 0.936391195 15
## 36 DL Vettori 0.900903916 16
## 2 P Kumar 0.678465319 17
## 31 B Lee 0.664327036 18
## 49 Ankit Sharma 0.645961216 19
## 72 AC Thomas 0.612634332 20
## 45 AA Chavan 0.349895490 21
## 27 HV Patel 0.330291385 22
## 40 VR Aaron 0.306249517 23
## 56 Iqbal Abdulla 0.292150810 24
## 26 SB Jakati 0.284594266 25
## 22 S Nadeem 0.281831653 26
## 77 MJ Clarke 0.229484778 27
## 39 A Singh 0.184309700 28
## 17 A Nehra 0.124319082 29
## 78 STR Binny 0.047814588 30
## 32 R Sharma 0.047560349 31
## 30 B Kumar 0.038288922 32
## 47 DT Christian 0.028307196 33
## 83 RE van der Merwe -0.006834434 34
## 55 Harmeet Singh -0.007107969 35
## 63 KP Appanna -0.024863755 36
## 38 V Pratap Singh -0.080047357 37
## 25 J Botha -0.153273525 38
## 28 RP Singh -0.190107010 39
## 59 Pankaj Singh -0.284836933 40
## 48 AD Mathews -0.288119764 41
## 82 P Parameswaran -0.324305735 42
## 53 SR Watson -0.325392070 43
## 44 MN Samuels -0.351977833 44
## 70 KMDN Kulasekara -0.431704501 45
## 61 DE Bollinger -0.437020464 46
## 67 JEC Franklin -0.437832266 47
## 60 WD Parnell -0.507690774 48
## 54 SW Tait -0.510308436 49
## 19 A Mishra -0.517919476 50
## 16 JA Morkel -0.542390643 51
## 46 A Ashish Reddy -0.572013908 52
## 8 DJ Bravo -0.577479999 53
## 42 PP Ojha -0.622140970 54
## 14 R Bhatia -0.627630562 55
## 15 SK Trivedi -0.667944762 56
## 33 GB Hogg -0.723481850 57
## 57 P Negi -0.730684063 58
## 1 R Ashwin -0.806259127 59
## 35 RA Jadeja -0.833201306 60
## 13 JH Kallis -0.921688023 61
## 20 MM Patel -0.960457327 62
## 24 Azhar Mahmood -0.974208459 63
## 51 AB Dinda -1.021293460 64
## 69 A Chandila -1.029601565 65
## 10 R Vinay Kumar -1.038229736 66
## 62 RJ Harris -1.041831323 67
## 9 PP Chawla -1.066594842 68
## 5 Z Khan -1.071510316 69
## 50 K Cooper -1.123351687 70
## 23 KA Pollard -1.147754774 71
## 65 AD Mascarenhas -1.216312359 72
## 18 P Awana -1.245857065 73
## 81 AB McDonald -1.356631890 74
## 4 UT Yadav -1.417037177 75
## 43 Shakib Al Hasan -1.544815235 76
## 37 BW Hilfenhaus -1.589157812 77
## 29 M Muralitharan -1.712550793 78
## 41 L Balaji -1.835863283 79
## 21 DW Steyn -2.142394316 80
## 3 M Morkel -2.268397070 81
## 11 SL Malinga -2.398824709 82
## 7 SP Narine -2.918497624 83