PCA_IPL

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(Hmisc)

## Loading required package: lattice

## Loading required package: survival

## Loading required package: Formula

## Loading required package: ggplot2

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     combine, src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units

### User-defined function 1: addproduct - sums the product of vector elements from two identical vectors
addproduct <- function(vec1,vec2){
  sum <- 0
  if (is.numeric(vec1)) {
    if (is.numeric(vec2)) {
      sum <- 0
      for (i in 1:length(vec1)) {
        sum <- sum + vec1[i] * vec2[i]
      }
    }
  }
  return(sum) 
}

#Batting Analysis
# Read IPL batting data into R and print out the variable names
##reading 

batting.df <- read.csv("D:/PGPBABI/Advance stats/practise assignment-IPL/batting_bowling_ipl_bat.csv")
print(head(batting.df))

##        Name Runs   Ave     SR Fours Sixes HF
## 1  CH Gayle  733 61.08 160.74    46    59  9
## 2 G Gambhir  590 36.87 143.55    64    17  6
## 3  V Sehwag  495 33.00 161.23    57    19  5
## 4  CL White  479 43.54 149.68    41    20  5
## 5  S Dhawan  569 40.64 129.61    58    18  5
## 6 AM Rahane  560 40.00 129.33    73    10  5

print(names(batting.df))

## [1] "Name"  "Runs"  "Ave"   "SR"    "Fours" "Sixes" "HF"

# Look at the summary statistics esp. variance
bat_pca_data <- batting.df[,2:7]
summary(bat_pca_data)

##       Runs            Ave              SR             Fours      
##  Min.   :  2.0   Min.   : 0.50   Min.   : 18.18   Min.   : 0.00  
##  1st Qu.: 98.0   1st Qu.:14.66   1st Qu.:108.75   1st Qu.: 6.25  
##  Median :196.5   Median :24.44   Median :120.14   Median :16.00  
##  Mean   :219.9   Mean   :24.73   Mean   :119.16   Mean   :19.79  
##  3rd Qu.:330.8   3rd Qu.:32.20   3rd Qu.:132.00   3rd Qu.:28.00  
##  Max.   :733.0   Max.   :81.33   Max.   :164.10   Max.   :73.00  
##      Sixes              HF       
##  Min.   : 0.000   Min.   :0.000  
##  1st Qu.: 3.000   1st Qu.:0.000  
##  Median : 6.000   Median :0.500  
##  Mean   : 7.578   Mean   :1.189  
##  3rd Qu.:10.000   3rd Qu.:2.000  
##  Max.   :59.000   Max.   :9.000

sd.batting<-apply(bat_pca_data,2,sd)
print(sd.batting)

##       Runs        Ave         SR      Fours      Sixes         HF 
## 156.253669  13.619215  23.656547  16.399845   8.001373   1.688656

#Analyse Variance
corl_matrix = cor(bat_pca_data, method = "pearson", use = "all")
print(corl_matrix)

##            Runs       Ave        SR     Fours     Sixes        HF
## Runs  1.0000000 0.6929845 0.4934887 0.9188086 0.7697776 0.8351477
## Ave   0.6929845 1.0000000 0.6236059 0.5462114 0.6824143 0.6207537
## SR    0.4934887 0.6236059 1.0000000 0.3848104 0.5839428 0.4275835
## Fours 0.9188086 0.5462114 0.3848104 1.0000000 0.5225736 0.7836888
## Sixes 0.7697776 0.6824143 0.5839428 0.5225736 1.0000000 0.7676964
## HF    0.8351477 0.6207537 0.4275835 0.7836888 0.7676964 1.0000000

eig_calc = eigen(corl_matrix)
print(eig_calc)

## $values
## [1] 4.25471977 0.82707395 0.41202798 0.32546749 0.16383742 0.01687338
## 
## $vectors
##            [,1]       [,2]         [,3]         [,4]        [,5]
## [1,] -0.4582608  0.2664321  0.109779419  0.005201415 -0.45840889
## [2,] -0.3979731 -0.3311176 -0.005504861 -0.847363074  0.10122837
## [3,] -0.3253838 -0.6978033  0.450134482  0.432750288  0.11890348
## [4,] -0.4057417  0.4735580  0.508235378  0.032523046 -0.09676885
## [5,] -0.4173346 -0.1790246 -0.669425885  0.248781566 -0.39458014
## [6,] -0.4323718  0.2759323 -0.280825406  0.178117767  0.77486668
##             [,6]
## [1,]  0.70483594
## [2,] -0.06063730
## [3,]  0.05624934
## [4,] -0.58514214
## [5,] -0.35786211
## [6,]  0.16096217

# . An Eigen value is a number, telling us how much variance there is in the data in the direction of the Eigen vector. When Eigen value is 0, there is no variation at all.
# . Based on the PCA performed, we observe that the Eigen value for PC1 greater than 1 and since other PCs are less than 1, they contain very less information and we shall ignore them. Hence, we will be considering only PC1 for further calculation.
# ** Perform a PCA of the data using the correlation matrix.**
# ** View the components of the PCA object and obtain the definitions of the Prinicipal Components (PCs) and the corresponding proportions of variation explained by each.**
# Perform Principal Component Analysis
pr_comp = prcomp(bat_pca_data, scale. = TRUE)
names(pr_comp)

## [1] "sdev"     "rotation" "center"   "scale"    "x"

summary(pr_comp)

## Importance of components:
##                           PC1    PC2     PC3     PC4     PC5     PC6
## Standard deviation     2.0627 0.9094 0.64189 0.57050 0.40477 0.12990
## Proportion of Variance 0.7091 0.1378 0.06867 0.05424 0.02731 0.00281
## Cumulative Proportion  0.7091 0.8470 0.91564 0.96988 0.99719 1.00000

names(summary(pr_comp))

## [1] "sdev"       "rotation"   "center"     "scale"      "x"         
## [6] "importance"

pr_comp$center

##       Runs        Ave         SR      Fours      Sixes         HF 
## 219.933333  24.729889 119.164111  19.788889   7.577778   1.188889

pr_comp$scale

##       Runs        Ave         SR      Fours      Sixes         HF 
## 156.253669  13.619215  23.656547  16.399845   8.001373   1.688656

pr_comp$rotation

##              PC1        PC2          PC3          PC4         PC5
## Runs  -0.4582608  0.2664321 -0.109779419 -0.005201415 -0.45840889
## Ave   -0.3979731 -0.3311176  0.005504861  0.847363074  0.10122837
## SR    -0.3253838 -0.6978033 -0.450134482 -0.432750288  0.11890348
## Fours -0.4057417  0.4735580 -0.508235378 -0.032523046 -0.09676885
## Sixes -0.4173346 -0.1790246  0.669425885 -0.248781566 -0.39458014
## HF    -0.4323718  0.2759323  0.280825406 -0.178117767  0.77486668
##               PC6
## Runs  -0.70483594
## Ave    0.06063730
## SR    -0.05624934
## Fours  0.58514214
## Sixes  0.35786211
## HF    -0.16096217

dim(pr_comp$x)

## [1] 90  6

biplot(pr_comp,scale = 0)

#Produce a scree plot
### screeplot

prop_var <- summary(pr_comp)$importance[2,]
plot(prop_var, main = "Scree plot", xlab = "Principal Component", ylab =  "Proportion of Variance Explained", type = "b")

#cumulative scree plot
plot(cumsum(prop_var),  main = "Scree plot", xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", type = "b")

#Observations
#.  70.91 % of the total variation is explained by first Principal Component and confirmed by screeplot.
#.  In the scree plot, the last big drop occurs between the first and second components and we choose the first component.

#Ranking Batsmen
names(batting.df)

## [1] "Name"  "Runs"  "Ave"   "SR"    "Fours" "Sixes" "HF"

bat_pca_scaled  <- scale(bat_pca_data)
wt              <- pr_comp$rotation[,"PC1"]
print(head(bat_pca_scaled))

##          Runs       Ave        SR    Fours     Sixes       HF
## [1,] 3.283550 2.6690314 1.7574792 1.598254 6.4266747 4.625637
## [2,] 2.368371 0.8913958 1.0308305 2.695825 1.1775757 2.849076
## [3,] 1.760385 0.6072385 1.7781923 2.268992 1.4275327 2.256890
## [4,] 1.657988 1.3811451 1.2899554 1.293373 1.5525113 2.256890
## [5,] 2.233974 1.1682106 0.4415644 2.329968 1.3025542 2.256890
## [6,] 2.176376 1.1212182 0.4297284 3.244611 0.3027258 2.256890

print(head(wt))

##       Runs        Ave         SR      Fours      Sixes         HF 
## -0.4582608 -0.3979731 -0.3253838 -0.4057417 -0.4173346 -0.4323718

batting.df["bat_score"]   <- apply(bat_pca_scaled[,1:6], 1, function(x) addproduct(x,wt))

batting_sorted   <- batting.df[order(batting.df["bat_score"]),]

l = length(batting.df[,2])
print(l)

## [1] 90

batting_sorted["Rank"]     <- seq(1,l,1)
print(head(batting_sorted["Rank"] ))

##   Rank
## 1    1
## 2    2
## 3    3
## 5    4
## 6    5
## 4    6

print(batting_sorted[,c("Name","bat_score","Rank")])

##               Name   bat_score Rank
## 1         CH Gayle -8.46932649    1
## 2        G Gambhir -4.59261046    2
## 3         V Sehwag -4.11917372    3
## 5         S Dhawan -4.09711843    4
## 6        AM Rahane -4.00201596    5
## 4         CL White -3.87768716    6
## 8        RG Sharma -2.90275313    7
## 7     KP Pietersen -2.86324015    8
## 9   AB de Villiers -2.31442255    9
## 13    F du Plessis -2.11332721   10
## 11       DA Warner -2.07167458   11
## 10       JP Duminy -2.07004543   12
## 14         OA Shah -1.92807089   13
## 17        SK Raina -1.85947782   14
## 20        R Dravid -1.82430203   15
## 16       DJ Hussey -1.80700836   16
## 19   Mandeep Singh -1.78578045   17
## 12       SR Watson -1.76704360   18
## 15        DJ Bravo -1.71801144   19
## 18       AT Rayudu -1.44124294   20
## 25      RV Uthappa -1.37862275   21
## 22         M Vijay -1.27397565   22
## 23       SPD Smith -1.22983470   23
## 26        SE Marsh -1.17451659   24
## 33       JH Kallis -1.09031082   25
## 28 DMD Jayawardene -1.03774815   26
## 24      TM Dilshan -1.02798828   27
## 29         V Kohli -1.02343478   28
## 31    SR Tendulkar -0.87507320   29
## 34        MS Dhoni -0.85533911   30
## 21        DR Smith -0.68139774   31
## 27      KA Pollard -0.67992463   32
## 32      MEK Hussey -0.66591890   33
## 30      MA Agarwal -0.52863856   34
## 36        JD Ryder -0.46216311   35
## 35        MS Bisla -0.32212998   36
## 37        BJ Hodge -0.25773323   37
## 38         NV Ojha -0.24859667   38
## 41     BB McCullum -0.08781663   39
## 40    AC Gilchrist  0.04780304   40
## 39          DB Das  0.20003908   41
## 47       MK Tiwary  0.28787513   42
## 43   Azhar Mahmood  0.35636859   43
## 42       IK Pathan  0.36460087   44
## 45     S Badrinath  0.38418403   45
## 53    JEC Franklin  0.53957408   46
## 49     LRPL Taylor  0.53996515   47
## 44       MK Pandey  0.55793627   48
## 61      SC Ganguly  0.57702761   49
## 58      KD Karthik  0.57986223   50
## 54   KC Sangakkara  0.59893350   51
## 52       RA Jadeja  0.64001153   52
## 59      AL Menaria  0.70466589   53
## 51    DT Christian  0.72224185   54
## 57       SS Tiwary  0.75301754   55
## 50        M Manhas  0.79201547   56
## 55         Y Nagar  0.79276862   57
## 46       DA Miller  0.79991992   58
## 60        PA Patel  0.81598820   59
## 48       JA Morkel  0.83471086   60
## 62       YK Pathan  0.86869533   61
## 56       STR Binny  1.04786689   62
## 63 Harbhajan Singh  1.16884571   63
## 66 Y Venugopal Rao  1.26630765   64
## 64         RE Levi  1.27255831   65
## 67      AD Mathews  1.34367589   66
## 70         N Saini  1.35867376   67
## 68       PP Chawla  1.40985740   68
## 65       LR Shukla  1.44128536   69
## 69 Shakib Al Hasan  1.48975473   70
## 71      MN Samuels  1.49420087   71
## 72       MJ Clarke  1.70055028   72
## 74   R Vinay Kumar  1.91210895   73
## 73        R Bhatia  1.93429975   74
## 76         J Botha  1.97377764   75
## 79      SP Goswami  2.02276671   76
## 75         P Kumar  2.04633551   77
## 77  A Ashish Reddy  2.05133627   78
## 80      SL Malinga  2.08423163   79
## 78      DL Vettori  2.11223293   80
## 81     RJ Peterson  2.19715970   81
## 83         B Kumar  2.21469825   82
## 82        R Ashwin  2.26780435   83
## 84        DW Steyn  2.73265219   84
## 85        A Mishra  2.86816861   85
## 87      WD Parnell  2.98403383   86
## 86          Z Khan  2.98988876   87
## 88     PC Valthaty  3.05385090   88
## 89        RP Singh  3.40295794   89
## 90        R Sharma  3.92541071   90

#Bowling Analysis
#Read IPL bowling data into R and print out the variable names
bowling.df <- read.csv("D:/PGPBABI/Advance stats/practise assignment-IPL/batting_bowling_ipl_bowl.csv")
print(head(bowling.df))

##        Name Wkts   Ave Econ   SR
## 1  R Ashwin   14 30.78 6.54 28.2
## 2   P Kumar    9 48.22 6.88 42.0
## 3  M Morkel   25 18.12 7.19 15.1
## 4  UT Yadav   19 23.84 7.42 19.2
## 5    Z Khan   17 26.64 7.55 21.1
## 6 IK Pathan    8 58.12 7.75 45.0

bowling_pca_data <- bowling.df[,2:5]
summary(bowling_pca_data)

##       Wkts            Ave              Econ              SR       
##  Min.   : 1.00   Min.   : 12.20   Min.   : 5.400   Min.   :12.00  
##  1st Qu.: 5.00   1st Qu.: 22.32   1st Qu.: 6.950   1st Qu.:17.25  
##  Median : 8.00   Median : 29.00   Median : 7.530   Median :21.60  
##  Mean   : 8.88   Mean   : 34.51   Mean   : 7.656   Mean   :26.33  
##  3rd Qu.:12.50   3rd Qu.: 36.44   3rd Qu.: 8.280   3rd Qu.:28.90  
##  Max.   :25.00   Max.   :161.00   Max.   :11.650   Max.   :96.00

sd.bowling<-apply(bowling_pca_data,2,sd)
print(sd.bowling)

##      Wkts       Ave      Econ        SR 
##  5.491451 23.677252  1.104584 14.422891

corl_matrix = cor(bowling_pca_data, method = "pearson", use = "all")
print(corl_matrix)

##            Wkts        Ave       Econ         SR
## Wkts  1.0000000 -0.4905337 -0.2924540 -0.5123438
## Ave  -0.4905337  1.0000000  0.5226172  0.9630984
## Econ -0.2924540  0.5226172  1.0000000  0.3277374
## SR   -0.5123438  0.9630984  0.3277374  1.0000000

eig_calc = eigen(corl_matrix)
print(eig_calc)

## $values
## [1] 2.61606918 0.75160217 0.62018101 0.01214765
## 
## $vectors
##            [,1]        [,2]       [,3]        [,4]
## [1,]  0.4282076 -0.33487615  0.8384720  0.03822333
## [2,] -0.5911683  0.04764188  0.3539052 -0.72318835
## [3,] -0.3834154 -0.89162604 -0.1681540  0.17239454
## [4,] -0.5658188  0.30098375  0.3787349  0.66769582

# Observation
# . An Eigen value is a number, telling us how much variance there is in the data in the direction of the Eigen vector. When Eigen value is 0, there is no variation at all.
# . Based on the PCA performed, we observe that the Eigen value for PC1 greater than 1 and since other PCs are less than 1, they contain very less information and we shall ignore them. Hence, we will be considering only PC1 for further calculation.
# ** Perform a PCA of the data using the correlation matrix.**
#   In prcomp(),calculation is done by a singular value decomposition of the centered and possibly scaled data matrix. The print method for these objects prints the results in a nice format and the plot method produces a scree plot.
# View the components of the PCA object and obtain the definitions of the Prinicipal Components (PCs) and the corresponding proportions of variation explained by each.

# Perform Principal Component Analysis
pr_comp = prcomp(bowling_pca_data, scale. = TRUE)
names(pr_comp)

## [1] "sdev"     "rotation" "center"   "scale"    "x"

summary(pr_comp)

## Importance of components:
##                          PC1    PC2    PC3     PC4
## Standard deviation     1.617 0.8669 0.7875 0.11022
## Proportion of Variance 0.654 0.1879 0.1550 0.00304
## Cumulative Proportion  0.654 0.8419 0.9970 1.00000

names(summary(pr_comp))

## [1] "sdev"       "rotation"   "center"     "scale"      "x"         
## [6] "importance"

pr_comp$center

##      Wkts       Ave      Econ        SR 
##  8.879518 34.505663  7.655542 26.332530

pr_comp$scale

##      Wkts       Ave      Econ        SR 
##  5.491451 23.677252  1.104584 14.422891

pr_comp$rotation

##             PC1         PC2        PC3         PC4
## Wkts -0.4282076  0.33487615 -0.8384720 -0.03822333
## Ave   0.5911683 -0.04764188 -0.3539052  0.72318835
## Econ  0.3834154  0.89162604  0.1681540 -0.17239454
## SR    0.5658188 -0.30098375 -0.3787349 -0.66769582

dim(pr_comp$x)

## [1] 83  4

biplot(pr_comp,scale = 0)

#Produce a scree plot
### screeplot

prop_var <- summary(pr_comp)$importance[2,]
plot(prop_var, main = "Scree plot", xlab = "Principal Component", ylab =  "Proportion of Variance Explained", type = "b")

#cumulative scree plot
plot(cumsum(prop_var),  main = "Scree plot", xlab = "Principal Component", ylab = "Cumulative Proportion of Variance Explained", type = "b")

# Observations
# . 65.4% of the total variation is explained by first Principal Component and confirmed by screeplot.
# . In the scree plot, the last big drop occurs between the first and second components and we choose the first component.
# Ranking Bowlers
names(bowling.df)

## [1] "Name" "Wkts" "Ave"  "Econ" "SR"

bowling_pca_scaled  <- scale(bowling_pca_data)
wt                  <- pr_comp$rotation[,"PC1"]


print(head(bowling_pca_scaled))

##             Wkts        Ave        Econ         SR
## [1,]  0.93244600 -0.1573520 -1.00992076  0.1294796
## [2,]  0.02193991  0.5792200 -0.70211253  1.0862919
## [3,]  2.93555941 -0.6920424 -0.42146385 -0.7787988
## [4,]  1.84295210 -0.4504603 -0.21324064 -0.4945285
## [5,]  1.47874966 -0.3322034 -0.09554926 -0.3627934
## [6,] -0.16016131  0.9973428  0.08551441  1.2942946

print(head(wt))

##       Wkts        Ave       Econ         SR 
## -0.4282076  0.5911683  0.3834154  0.5658188

bowling.df["bowling_score"]   <- apply(bowling_pca_scaled[,1:4], 1, function(x) addproduct(x,wt))

bowling_sorted                <- bowling.df[order(-bowling.df["bowling_score"]),]

l = length(bowling.df[,2])
print(l)

## [1] 83

bowling_sorted["Rank"]     <- seq(1,l,1)
print(head(bowling_sorted["Rank"] ))

##    Rank
## 66    1
## 76    2
## 74    3
## 52    4
## 64    5
## 34    6

print(bowling_sorted[,c("Name","bowling_score","Rank")])

##                Name bowling_score Rank
## 66       AD Russell   7.340420235    1
## 76     TP Sudhindra   6.248139606    2
## 74        JP Duminy   4.489764215    3
## 52          MS Gony   2.504518661    4
## 64         BA Bhatt   2.484470993    5
## 34         M Kartik   2.169085675    6
## 58        YK Pathan   2.017875431    7
## 12  Harbhajan Singh   1.856991489    8
## 80       SC Ganguly   1.644997038    9
## 71         SK Raina   1.546377295   10
## 6         IK Pathan   1.423303536   11
## 68       AB Agarkar   1.385650504   12
## 79       M de Lange   1.295630915   13
## 73         DR Smith   1.234423993   14
## 75      Anand Rajan   0.936391195   15
## 36       DL Vettori   0.900903916   16
## 2           P Kumar   0.678465319   17
## 31            B Lee   0.664327036   18
## 49     Ankit Sharma   0.645961216   19
## 72        AC Thomas   0.612634332   20
## 45        AA Chavan   0.349895490   21
## 27         HV Patel   0.330291385   22
## 40         VR Aaron   0.306249517   23
## 56    Iqbal Abdulla   0.292150810   24
## 26        SB Jakati   0.284594266   25
## 22         S Nadeem   0.281831653   26
## 77        MJ Clarke   0.229484778   27
## 39          A Singh   0.184309700   28
## 17          A Nehra   0.124319082   29
## 78        STR Binny   0.047814588   30
## 32         R Sharma   0.047560349   31
## 30          B Kumar   0.038288922   32
## 47     DT Christian   0.028307196   33
## 83 RE van der Merwe  -0.006834434   34
## 55    Harmeet Singh  -0.007107969   35
## 63       KP Appanna  -0.024863755   36
## 38   V Pratap Singh  -0.080047357   37
## 25          J Botha  -0.153273525   38
## 28         RP Singh  -0.190107010   39
## 59     Pankaj Singh  -0.284836933   40
## 48       AD Mathews  -0.288119764   41
## 82   P Parameswaran  -0.324305735   42
## 53        SR Watson  -0.325392070   43
## 44       MN Samuels  -0.351977833   44
## 70  KMDN Kulasekara  -0.431704501   45
## 61     DE Bollinger  -0.437020464   46
## 67     JEC Franklin  -0.437832266   47
## 60       WD Parnell  -0.507690774   48
## 54          SW Tait  -0.510308436   49
## 19         A Mishra  -0.517919476   50
## 16        JA Morkel  -0.542390643   51
## 46   A Ashish Reddy  -0.572013908   52
## 8          DJ Bravo  -0.577479999   53
## 42          PP Ojha  -0.622140970   54
## 14         R Bhatia  -0.627630562   55
## 15       SK Trivedi  -0.667944762   56
## 33          GB Hogg  -0.723481850   57
## 57           P Negi  -0.730684063   58
## 1          R Ashwin  -0.806259127   59
## 35        RA Jadeja  -0.833201306   60
## 13        JH Kallis  -0.921688023   61
## 20         MM Patel  -0.960457327   62
## 24    Azhar Mahmood  -0.974208459   63
## 51         AB Dinda  -1.021293460   64
## 69       A Chandila  -1.029601565   65
## 10    R Vinay Kumar  -1.038229736   66
## 62        RJ Harris  -1.041831323   67
## 9         PP Chawla  -1.066594842   68
## 5            Z Khan  -1.071510316   69
## 50         K Cooper  -1.123351687   70
## 23       KA Pollard  -1.147754774   71
## 65   AD Mascarenhas  -1.216312359   72
## 18          P Awana  -1.245857065   73
## 81      AB McDonald  -1.356631890   74
## 4          UT Yadav  -1.417037177   75
## 43  Shakib Al Hasan  -1.544815235   76
## 37    BW Hilfenhaus  -1.589157812   77
## 29   M Muralitharan  -1.712550793   78
## 41         L Balaji  -1.835863283   79
## 21         DW Steyn  -2.142394316   80
## 3          M Morkel  -2.268397070   81
## 11       SL Malinga  -2.398824709   82
## 7         SP Narine  -2.918497624   83

PCA_IPL

Vyomesh Upadhyay

21 October 2018