[Q4] NCAA ranking

Read data…

setwd("C:/Users/bbalasub/Desktop/Bala-docs/Stanford/2014-winter-Stats216/hw2");
rm(list=ls());
games<-read.csv("games.csv", as.is=TRUE);
teams<-read.csv("teams.csv", as.is=TRUE);

Generate list of all teams

all.teams = sort(unique(c(teams$team,games$home,games$away)))

Function to compute a team's total margin of home team victory

total.margin <- function(team) 
  {
  with(games,
         sum(homeScore[home==team]) 
       + sum(awayScore[away==team]) 
       - sum(homeScore[away==team])  
      - sum(awayScore[home==team]))
  }

Compute margin of victory for each team and generate rank table based on margin of victory using linear regression.

margins <- sapply(all.teams, total.margin);
names(margins) <- all.teams;

r = sort(margins, decreasing=TRUE);
rank.table = data.frame(team=names(r), margin=r, marginrank=seq(1,length(r)))
row.names(rank.table) = seq(1,nrow(rank.table));
rank.table = merge(rank.table, teams, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(marginrank)), ];
rank.table = droplevels(rank.table);

Function to calculate number of games played by each team

num.games <- function(team)
  {
    with(games, sum(home==team) + sum(away==team));  
  }
nMatches = sapply(rank.table$team, num.games);
rank.table$nMatches = nMatches;

This is my own-try implementation. exactly the same answer as class.

n = nrow(games);
p = length(all.teams);
X2 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X2[cbind(1:nrow(games), i)] =  1;
X2[cbind(1:nrow(games), j)] = -1;
X2 = as.data.frame(X2);
names(X2) <- all.teams;
margins = games$homeScore - games$awayScore;
y2 = margins;
reg.season.games <- which(games$gameType=="REG");
X2 <- X2[,names(X2) != "stanford-cardinal"];
mod2 <- lm(y2 ~ 0 + ., data=X2, subset=reg.season.games);

Now let us add home advantage into the model..

homeAdv <- 1 - games$neutralLocation
homeAdv.mod <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=reg.season.games)
homeAdv.coef <- coef(homeAdv.mod)[paste("`",teams$team,"`",sep="")]
names(homeAdv.coef) <- teams$team
head(coef(summary(homeAdv.mod)))
##                         Estimate Std. Error t value   Pr(>|t|)
## homeAdv                   3.5285     0.1569 22.4920 8.655e-107
## `air-force-falcons`      -5.2991     2.7621 -1.9185  5.510e-02
## `akron-zips`             -0.9668     2.7085 -0.3570  7.211e-01
## `alabama-a&m-bulldogs`  -27.1437     2.7761 -9.7777  2.227e-22
## `alabama-crimson-tide`   -2.6111     2.6706 -0.9777  3.283e-01
## `alabama-state-hornets` -26.3396     2.7200 -9.6836  5.519e-22
r = homeAdv.coef[2:length(homeAdv.coef)];

r = sort(r, decreasing=TRUE);
t = data.frame(team=names(r), lmcoef=r, lmrank=seq(1,length(r)))
row.names(t) = seq(1,nrow(t));
rank.table = merge(rank.table, t, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(lmrank)), ];
rank.table = droplevels(rank.table);
###########################################################################################

(a) Logistic regression

n = nrow(games);
p = length(all.teams);
X0 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X0[cbind(1:nrow(games), i)] =  1;
X0[cbind(1:nrow(games), j)] = -1;
X0 = as.data.frame(X0);
names(X0) <- all.teams;
margins = games$homeScore - games$awayScore;
y = margins;
y[y > 0] = 1;
y[y<=0] = 0;
y = as.logical(y);
homeAdv <- 1 - games$neutralLocation
reg.season.games <- which(games$gameType=="REG");
mod2 <- glm(y ~0+homeAdv+., data=X0, subset=reg.season.games, family=binomial);
summary(mod2)$r.squared
## NULL
newrank = coef(mod2);
b0 = newrank[1];
newrank = newrank[2:length(newrank)];
newrank = data.frame(logisticcoef=newrank, team=all.teams)
newrank = newrank[with(newrank, order(-logisticcoef)), ];
newrank$logisticrank = seq(1, nrow(newrank), 1)

rank.table = merge(rank.table, newrank, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(logisticrank)), ];
rank.table = droplevels(rank.table);
head(rank.table)
##                      team margin marginrank conference inTourney apRank
## 290 saint-mary-saint-mary     10        163       <NA>        NA     NA
## 326 st.-thomas-(tx)-celts     13        161       <NA>        NA     NA
## 120      gonzaga-bulldogs    554          4 west-coast         1      1
## 175  louisville-cardinals    627          2   big-east         1      2
## 155       kansas-jayhawks    490          5     big-12         1      3
## 143      indiana-hoosiers    595          3    big-ten         1      4
##     usaTodayRank nMatches lmcoef lmrank logisticcoef logisticrank
## 290           NA        1     NA     NA       15.700            1
## 326           NA        1     NA     NA       14.844            2
## 120            1       34  9.988      4        4.344            3
## 175            2       40 12.081      3        3.986            4
## 155            3       37  8.632      6        3.839            5
## 143            5       36 13.255      1        3.829            6

Let us now compare the various rankings:

cor(rank.table$apRank, rank.table$marginrank, use="pairwise.complete.obs")
## [1] 0.4131
cor(rank.table$apRank, rank.table$lmrank, use="pairwise.complete.obs")
## [1] 0.6433
cor(rank.table$apRank, rank.table$logisticrank, use="pairwise.complete.obs")
## [1] 0.9389
cor(rank.table$apRank, rank.table$usaTodayRank, use="pairwise.complete.obs")
## [1] 0.9777

It is clear that the logistic regression provides better correlated rankings with ap than the linear model or the margin model. Let us now plot the various ranks against the ap rank.

plot(rank.table$apRank, rank.table$usaTodayRank, xlab="rank using various methods", ylab="aprank");
points(rank.table$apRank, rank.table$lmrank, col="blue", pch=18);
points(rank.table$apRank, rank.table$logisticrank, col="red", pch=17);
grid();

plot of chunk unnamed-chunk-4

However, from the rankings table, we immediately notice that the top-two ranks have only played one game each (which they won). This resulted in a spuriously large value of bi (the regression coefficient) for the top two teams. Although the likelihood is not really separable (in terms of the probabilities for each of the teams), the likelihood increases by selecting larger coefficients for those teams whose wins and losses can be perfectly separated as discussed in problem 3.

###########################################################################################

(b) Reanalysis by removing teams that played less than 5 games

n = nrow(games);
p = length(all.teams);
X0 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X0[cbind(1:nrow(games), i)] =  1;
X0[cbind(1:nrow(games), j)] = -1;
X0 = as.data.frame(X0);
names(X0) <- all.teams;
margins = games$homeScore - games$awayScore;
y = margins;
y[y > 0] = 1;
y[y<=0] = 0;
y = as.logical(y);
homeAdv <- 1 - games$neutralLocation

# name list of teams that have played less than 5 matches
tlist = rank.table$team[rank.table$nMatches<5];
tlist = droplevels(tlist);
tf = (games$away %in% tlist) | (games$home %in% tlist);
reg.season.games <- which(games$gameType=="REG" & !tf);
# do the logistic fit
mod3 <- glm(y ~0+homeAdv+., data=X0, subset=reg.season.games, family=binomial);
summary(mod3)$r.squared
## NULL
newrank = coef(mod3);
b0 = newrank[1];
newrank = newrank[2:length(newrank)];
newrank = data.frame(logistic2coef=newrank, team=all.teams)
newrank = newrank[with(newrank, order(-logistic2coef)), ];
newrank$logistic2rank = seq(1, nrow(newrank), 1)

rank.table = merge(rank.table, newrank, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(logistic2rank)), ];
rank.table = droplevels(rank.table);
# reorder columns for readable display
rank.table=rank.table[,c(1,3,6,7,10,12,14,2,4,5,8,9,11,13)]
head(rank.table, n=25)
##                          team marginrank apRank usaTodayRank lmrank
## 120          gonzaga-bulldogs          4      1            1      4
## 175      louisville-cardinals          2      2            2      3
## 155           kansas-jayhawks          5      3            3      6
## 143          indiana-hoosiers          3      4            5      1
## 224          new-mexico-lobos         53     11           10     23
## 253       ohio-state-buckeyes         14      7            6      7
## 83           duke-blue-devils         12      6            7      5
## 114          georgetown-hoyas         43      8            8     18
## 197   michigan-state-spartans         38      9            9     12
## 198       michigan-wolverines          8     10           11      8
## 195     miami-(fl)-hurricanes         24      5            4     14
## 156     kansas-state-wildcats         52     12           14     36
## 331           syracuse-orange          6     16           18     10
## 192            memphis-tigers         21     19           15     42
## 288     saint-louis-billikens         19     13           13     26
## 185   marquette-golden-eagles         66     15           16     31
## 38            butler-bulldogs         63     NA           NA     51
## 400         wisconsin-badgers         26     18           17     11
## 101            florida-gators          1     14           12      2
## 255    oklahoma-state-cowboys         35     17           19     20
## 365               unlv-rebels         49     NA           NA     32
## 12           arizona-wildcats         20     21           20     16
## 269       pittsburgh-panthers         10     20           22      9
## 248 notre-dame-fighting-irish         56     23           NA     35
## 65        colorado-state-rams         29     NA           NA     22
##     logisticrank logistic2rank margin     conference inTourney nMatches
## 120            3             1    554     west-coast         1       34
## 175            4             2    627       big-east         1       40
## 155            5             3    490         big-12         1       37
## 143            6             4    595        big-ten         1       36
## 224            7             5    233  mountain-west         1       35
## 253            8             6    402        big-ten         1       37
## 83             9             7    428            acc         1       36
## 114           10             8    264       big-east         1       32
## 197           11             9    283        big-ten         1       35
## 198           12            10    465        big-ten         1       39
## 195           13            11    328            acc         1       36
## 156           14            12    238         big-12         1       34
## 331           15            13    469       big-east         1       40
## 192           16            14    344 conference-usa         1       36
## 288           17            15    351    atlantic-10         1       35
## 185           18            16    191       big-east         1       35
## 38            19            17    196    atlantic-10         1       36
## 400           20            18    316        big-ten         1       35
## 101           21            19    630            sec         1       37
## 255           22            20    292         big-12         1       33
## 365           23            21    245  mountain-west         1       34
## 12            24            22    344         pac-12         1       35
## 269           25            23    435       big-east         1       33
## 248           26            24    228       big-east         1       35
## 65            27            25    299  mountain-west         1       34
##      lmcoef logisticcoef logistic2coef
## 120  9.9875        4.344         4.344
## 175 12.0810        3.986         3.986
## 155  8.6319        3.839         3.839
## 143 13.2552        3.829         3.829
## 224  3.0044        3.813         3.813
## 253  7.8965        3.711         3.711
## 83   9.3937        3.680         3.680
## 114  4.2590        3.605         3.605
## 197  6.1762        3.550         3.550
## 198  7.5054        3.534         3.534
## 195  5.1916        3.335         3.335
## 156  1.6525        3.323         3.323
## 331  7.0696        3.178         3.178
## 192  1.0144        3.106         3.106
## 288  2.6373        3.095         3.095
## 185  2.2985        3.082         3.082
## 38  -0.7322        3.033         3.033
## 400  6.8313        3.024         3.024
## 101 12.6386        2.890         2.890
## 255  3.6909        2.877         2.877
## 365  2.1699        2.867         2.867
## 12   4.6849        2.866         2.866
## 269  7.3563        2.840         2.840
## 248  1.9050        2.824         2.824
## 65   3.1522        2.812         2.812

Let us now look at the correlations again:

cor(rank.table$apRank, rank.table$logisticrank, use="pairwise.complete.obs")
## [1] 0.9389
cor(rank.table$apRank, rank.table$logistic2rank, use="pairwise.complete.obs")
## [1] 0.9389

The correlation after removing the teams playing less than 5 matches seems to have worsened slightly. Looking at the standard deviation,

sd(rank.table$logisticrank - rank.table$apRank, na.rm=TRUE)
## [1] 3.278
sd(rank.table$logistic2rank - rank.table$apRank, na.rm=TRUE)
## [1] 3.278
sd(rank.table$lmrank - rank.table$apRank, na.rm=TRUE)
## [1] 10.68

we see that we are performing slightly worse in the logistical regression after filtering. In any case, our logistic model is substantially better than the linear model. For the sake of completeness, let us plot all the ranks again.

plot(rank.table$apRank, rank.table$usaTodayRank, xlab="rank using various methods", ylab="aprank");
points(rank.table$apRank, rank.table$lmrank, col="blue", pch=18);
points(rank.table$apRank, rank.table$logisticrank, col="red", pch=17);
points(rank.table$apRank, rank.table$logistic2rank, col="green", pch=17);
grid();

plot of chunk unnamed-chunk-8

In summary, logistic model seems to perform best, and corresponds better to voters decision. Since the voters ranks correspond better to the model without removing the outliers, it is possible that the voters are unduly biased by convincing victories against weak teams.

###########################################################################################

(c) Let us see how many teams are better than stanford at the 95% confidence level for the linear model. For this, we both need Pr<0.05 and coefficient>0.

z = summary(homeAdv.mod); z = z$coefficients;
n = nrow(z);
Pr = z[2:n,4];
Coef = z[2:n,1];
tf = Pr < 0.05 & Coef > 0;
sum(tf)
## [1] 13

rank.table$lmFlag = FALSE;
loc = match(names(tf), paste("`",rank.table$team,"`", sep=""));
rank.table[loc,"lmFlag"] = tf;

We now do the same thing for the logistic model.

z = summary(mod3); 
z = z$coefficients;
n = nrow(z);
z = z[2:n,];
Pr = z[,4];
stanCoef = z[grep("stanford", rownames(z)), 1]
Coef = z[,1];
tf = Pr < 0.05 & Coef > stanCoef;
sum(tf)
## [1] 66

rank.table$glmFlag = FALSE;
loc = match(names(tf), paste("`",rank.table$team,"`", sep=""));
rank.table[loc,"glmFlag"] = tf;

Fraction of teams which are better than stanford for the linear and logistic models:

sum(rank.table$lmFlag == TRUE)/nrow(rank.table)
## [1] 0.03202
sum(rank.table$glmFlag == TRUE)/nrow(rank.table)
## [1] 0.1626
###########################################################################################

(d) Cross-validation

Let us start with the linear model

setwd("C:/Users/bbalasub/Desktop/Bala-docs/Stanford/2014-winter-Stats216/hw2");
rm(list=ls());
games<-read.csv("games.csv", as.is=TRUE);
teams<-read.csv("teams.csv", as.is=TRUE);
all.teams = sort(unique(c(teams$team,games$home,games$away)))
total.margin <- function(team) 
  {
  with(games,
         sum(homeScore[home==team]) 
       + sum(awayScore[away==team]) 
       - sum(homeScore[away==team])  
      - sum(awayScore[home==team]))
  }
margins <- sapply(all.teams, total.margin);
names(margins) <- all.teams;

num.games <- function(team)
  {
    with(games, sum(home==team) + sum(away==team));  
  }

# start cross-validation here
X2 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X2[cbind(1:nrow(games), i)] =  1;
X2[cbind(1:nrow(games), j)] = -1;
X2 = as.data.frame(X2);
names(X2) <- all.teams;
margins = games$homeScore - games$awayScore;
y2 = margins;
X2 <- X2[,names(X2) != "stanford-cardinal"];
homeAdv <- 1 - games$neutralLocation

# baseline regression of everything
reg.season.games <- games$gameType=="REG";
homeAdv.mod0 <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=reg.season.games)

# subselect sample before cross-validation to include only regular games
y2 = subset(y2, reg.season.games);
X2 = X2[reg.season.games,];
homeAdv = subset(homeAdv, reg.season.games);
margins = subset(margins, reg.season.games);
actResult = margins;
actResult[actResult < 0] = 0;
actResult[actResult > 0] = 1;
actResult = as.logical(actResult);

# read for cross-validation now
n = length(y2);
linPredict = logical(n);
linPredict[1:length(linPredict)] = NA;
for (i in 1:10)
{
  # starting and ending index of validation sets
  stIdx = ceiling(n/10)*(i-1)+1;
  enIdx = min(ceiling(n/10)*(i), n);
  tf = !logical(n);
  tf[stIdx:enIdx] = FALSE;
  # training
  fit <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=tf);
  # predict
  ym = predict(fit, newdata=X2, type="response");
  temp = ym;
  temp[temp<0] = 0; # home team loses
  temp[temp>0] = 1; # home team wins
  temp = as.logical(temp);
  linPredict[stIdx:enIdx] = temp[stIdx:enIdx]; 
}
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

Now, we do the cross-validation for logistic regression

# read for cross-validation now
n = length(y2);
logPredict = logical(n);
logPredict[1:length(logPredict)] = NA;
for (i in 1:10)
{
  # starting and ending index of validation sets
  stIdx = ceiling(n/10)*(i-1)+1;
  enIdx = min(ceiling(n/10)*(i), n);
  tf = !logical(n);
  tf[stIdx:enIdx] = FALSE;
  # training
  fit <- glm(actResult ~ 0 + homeAdv + ., data=X2, subset=tf, family=binomial);
  # predict
  ym = predict(fit, newdata=X2, type="response");
  temp = ym;
  temp[temp<0.5] = 0; # home team loses
  temp[temp>=0.5] = 1; # home team wins
  temp = as.logical(temp);
  logPredict[stIdx:enIdx] = temp[stIdx:enIdx]; 
}
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading

Create the confusion matrix now.

d = data.frame(act=actResult, lin=linPredict, log=logPredict)
n11 = sum(d$lin==d$act & d$log==d$act)
n12 = sum(d$lin==d$act & d$log!=d$act)
n21 = sum(d$lin!=d$act & d$log==d$act)
n22 = sum(d$lin!=d$act & d$log!=d$act)
a = c(n11,n21,n12,n22)
dim(a) = c(2,2)
print(a)
##      [,1] [,2]
## [1,] 3295  465
## [2,]  327 1308