Read data…
setwd("C:/Users/bbalasub/Desktop/Bala-docs/Stanford/2014-winter-Stats216/hw2");
rm(list=ls());
games<-read.csv("games.csv", as.is=TRUE);
teams<-read.csv("teams.csv", as.is=TRUE);
Generate list of all teams
all.teams = sort(unique(c(teams$team,games$home,games$away)))
Function to compute a team's total margin of home team victory
total.margin <- function(team)
{
with(games,
sum(homeScore[home==team])
+ sum(awayScore[away==team])
- sum(homeScore[away==team])
- sum(awayScore[home==team]))
}
Compute margin of victory for each team and generate rank table based on margin of victory using linear regression.
margins <- sapply(all.teams, total.margin);
names(margins) <- all.teams;
r = sort(margins, decreasing=TRUE);
rank.table = data.frame(team=names(r), margin=r, marginrank=seq(1,length(r)))
row.names(rank.table) = seq(1,nrow(rank.table));
rank.table = merge(rank.table, teams, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(marginrank)), ];
rank.table = droplevels(rank.table);
Function to calculate number of games played by each team
num.games <- function(team)
{
with(games, sum(home==team) + sum(away==team));
}
nMatches = sapply(rank.table$team, num.games);
rank.table$nMatches = nMatches;
This is my own-try implementation. exactly the same answer as class.
n = nrow(games);
p = length(all.teams);
X2 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X2[cbind(1:nrow(games), i)] = 1;
X2[cbind(1:nrow(games), j)] = -1;
X2 = as.data.frame(X2);
names(X2) <- all.teams;
margins = games$homeScore - games$awayScore;
y2 = margins;
reg.season.games <- which(games$gameType=="REG");
X2 <- X2[,names(X2) != "stanford-cardinal"];
mod2 <- lm(y2 ~ 0 + ., data=X2, subset=reg.season.games);
Now let us add home advantage into the model..
homeAdv <- 1 - games$neutralLocation
homeAdv.mod <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=reg.season.games)
homeAdv.coef <- coef(homeAdv.mod)[paste("`",teams$team,"`",sep="")]
names(homeAdv.coef) <- teams$team
head(coef(summary(homeAdv.mod)))
## Estimate Std. Error t value Pr(>|t|)
## homeAdv 3.5285 0.1569 22.4920 8.655e-107
## `air-force-falcons` -5.2991 2.7621 -1.9185 5.510e-02
## `akron-zips` -0.9668 2.7085 -0.3570 7.211e-01
## `alabama-a&m-bulldogs` -27.1437 2.7761 -9.7777 2.227e-22
## `alabama-crimson-tide` -2.6111 2.6706 -0.9777 3.283e-01
## `alabama-state-hornets` -26.3396 2.7200 -9.6836 5.519e-22
r = homeAdv.coef[2:length(homeAdv.coef)];
r = sort(r, decreasing=TRUE);
t = data.frame(team=names(r), lmcoef=r, lmrank=seq(1,length(r)))
row.names(t) = seq(1,nrow(t));
rank.table = merge(rank.table, t, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(lmrank)), ];
rank.table = droplevels(rank.table);
###########################################################################################
(a) Logistic regression
n = nrow(games);
p = length(all.teams);
X0 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X0[cbind(1:nrow(games), i)] = 1;
X0[cbind(1:nrow(games), j)] = -1;
X0 = as.data.frame(X0);
names(X0) <- all.teams;
margins = games$homeScore - games$awayScore;
y = margins;
y[y > 0] = 1;
y[y<=0] = 0;
y = as.logical(y);
homeAdv <- 1 - games$neutralLocation
reg.season.games <- which(games$gameType=="REG");
mod2 <- glm(y ~0+homeAdv+., data=X0, subset=reg.season.games, family=binomial);
summary(mod2)$r.squared
## NULL
newrank = coef(mod2);
b0 = newrank[1];
newrank = newrank[2:length(newrank)];
newrank = data.frame(logisticcoef=newrank, team=all.teams)
newrank = newrank[with(newrank, order(-logisticcoef)), ];
newrank$logisticrank = seq(1, nrow(newrank), 1)
rank.table = merge(rank.table, newrank, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(logisticrank)), ];
rank.table = droplevels(rank.table);
head(rank.table)
## team margin marginrank conference inTourney apRank
## 290 saint-mary-saint-mary 10 163 <NA> NA NA
## 326 st.-thomas-(tx)-celts 13 161 <NA> NA NA
## 120 gonzaga-bulldogs 554 4 west-coast 1 1
## 175 louisville-cardinals 627 2 big-east 1 2
## 155 kansas-jayhawks 490 5 big-12 1 3
## 143 indiana-hoosiers 595 3 big-ten 1 4
## usaTodayRank nMatches lmcoef lmrank logisticcoef logisticrank
## 290 NA 1 NA NA 15.700 1
## 326 NA 1 NA NA 14.844 2
## 120 1 34 9.988 4 4.344 3
## 175 2 40 12.081 3 3.986 4
## 155 3 37 8.632 6 3.839 5
## 143 5 36 13.255 1 3.829 6
Let us now compare the various rankings:
cor(rank.table$apRank, rank.table$marginrank, use="pairwise.complete.obs")
## [1] 0.4131
cor(rank.table$apRank, rank.table$lmrank, use="pairwise.complete.obs")
## [1] 0.6433
cor(rank.table$apRank, rank.table$logisticrank, use="pairwise.complete.obs")
## [1] 0.9389
cor(rank.table$apRank, rank.table$usaTodayRank, use="pairwise.complete.obs")
## [1] 0.9777
It is clear that the logistic regression provides better correlated rankings with ap than the linear model or the margin model. Let us now plot the various ranks against the ap rank.
plot(rank.table$apRank, rank.table$usaTodayRank, xlab="rank using various methods", ylab="aprank");
points(rank.table$apRank, rank.table$lmrank, col="blue", pch=18);
points(rank.table$apRank, rank.table$logisticrank, col="red", pch=17);
grid();
However, from the rankings table, we immediately notice that the top-two ranks have only played one game each (which they won). This resulted in a spuriously large value of bi (the regression coefficient) for the top two teams. Although the likelihood is not really separable (in terms of the probabilities for each of the teams), the likelihood increases by selecting larger coefficients for those teams whose wins and losses can be perfectly separated as discussed in problem 3.
###########################################################################################
(b) Reanalysis by removing teams that played less than 5 games
n = nrow(games);
p = length(all.teams);
X0 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X0[cbind(1:nrow(games), i)] = 1;
X0[cbind(1:nrow(games), j)] = -1;
X0 = as.data.frame(X0);
names(X0) <- all.teams;
margins = games$homeScore - games$awayScore;
y = margins;
y[y > 0] = 1;
y[y<=0] = 0;
y = as.logical(y);
homeAdv <- 1 - games$neutralLocation
# name list of teams that have played less than 5 matches
tlist = rank.table$team[rank.table$nMatches<5];
tlist = droplevels(tlist);
tf = (games$away %in% tlist) | (games$home %in% tlist);
reg.season.games <- which(games$gameType=="REG" & !tf);
# do the logistic fit
mod3 <- glm(y ~0+homeAdv+., data=X0, subset=reg.season.games, family=binomial);
summary(mod3)$r.squared
## NULL
newrank = coef(mod3);
b0 = newrank[1];
newrank = newrank[2:length(newrank)];
newrank = data.frame(logistic2coef=newrank, team=all.teams)
newrank = newrank[with(newrank, order(-logistic2coef)), ];
newrank$logistic2rank = seq(1, nrow(newrank), 1)
rank.table = merge(rank.table, newrank, by.x = "team", all.x= TRUE);
rank.table = rank.table[with(rank.table, order(logistic2rank)), ];
rank.table = droplevels(rank.table);
# reorder columns for readable display
rank.table=rank.table[,c(1,3,6,7,10,12,14,2,4,5,8,9,11,13)]
head(rank.table, n=25)
## team marginrank apRank usaTodayRank lmrank
## 120 gonzaga-bulldogs 4 1 1 4
## 175 louisville-cardinals 2 2 2 3
## 155 kansas-jayhawks 5 3 3 6
## 143 indiana-hoosiers 3 4 5 1
## 224 new-mexico-lobos 53 11 10 23
## 253 ohio-state-buckeyes 14 7 6 7
## 83 duke-blue-devils 12 6 7 5
## 114 georgetown-hoyas 43 8 8 18
## 197 michigan-state-spartans 38 9 9 12
## 198 michigan-wolverines 8 10 11 8
## 195 miami-(fl)-hurricanes 24 5 4 14
## 156 kansas-state-wildcats 52 12 14 36
## 331 syracuse-orange 6 16 18 10
## 192 memphis-tigers 21 19 15 42
## 288 saint-louis-billikens 19 13 13 26
## 185 marquette-golden-eagles 66 15 16 31
## 38 butler-bulldogs 63 NA NA 51
## 400 wisconsin-badgers 26 18 17 11
## 101 florida-gators 1 14 12 2
## 255 oklahoma-state-cowboys 35 17 19 20
## 365 unlv-rebels 49 NA NA 32
## 12 arizona-wildcats 20 21 20 16
## 269 pittsburgh-panthers 10 20 22 9
## 248 notre-dame-fighting-irish 56 23 NA 35
## 65 colorado-state-rams 29 NA NA 22
## logisticrank logistic2rank margin conference inTourney nMatches
## 120 3 1 554 west-coast 1 34
## 175 4 2 627 big-east 1 40
## 155 5 3 490 big-12 1 37
## 143 6 4 595 big-ten 1 36
## 224 7 5 233 mountain-west 1 35
## 253 8 6 402 big-ten 1 37
## 83 9 7 428 acc 1 36
## 114 10 8 264 big-east 1 32
## 197 11 9 283 big-ten 1 35
## 198 12 10 465 big-ten 1 39
## 195 13 11 328 acc 1 36
## 156 14 12 238 big-12 1 34
## 331 15 13 469 big-east 1 40
## 192 16 14 344 conference-usa 1 36
## 288 17 15 351 atlantic-10 1 35
## 185 18 16 191 big-east 1 35
## 38 19 17 196 atlantic-10 1 36
## 400 20 18 316 big-ten 1 35
## 101 21 19 630 sec 1 37
## 255 22 20 292 big-12 1 33
## 365 23 21 245 mountain-west 1 34
## 12 24 22 344 pac-12 1 35
## 269 25 23 435 big-east 1 33
## 248 26 24 228 big-east 1 35
## 65 27 25 299 mountain-west 1 34
## lmcoef logisticcoef logistic2coef
## 120 9.9875 4.344 4.344
## 175 12.0810 3.986 3.986
## 155 8.6319 3.839 3.839
## 143 13.2552 3.829 3.829
## 224 3.0044 3.813 3.813
## 253 7.8965 3.711 3.711
## 83 9.3937 3.680 3.680
## 114 4.2590 3.605 3.605
## 197 6.1762 3.550 3.550
## 198 7.5054 3.534 3.534
## 195 5.1916 3.335 3.335
## 156 1.6525 3.323 3.323
## 331 7.0696 3.178 3.178
## 192 1.0144 3.106 3.106
## 288 2.6373 3.095 3.095
## 185 2.2985 3.082 3.082
## 38 -0.7322 3.033 3.033
## 400 6.8313 3.024 3.024
## 101 12.6386 2.890 2.890
## 255 3.6909 2.877 2.877
## 365 2.1699 2.867 2.867
## 12 4.6849 2.866 2.866
## 269 7.3563 2.840 2.840
## 248 1.9050 2.824 2.824
## 65 3.1522 2.812 2.812
Let us now look at the correlations again:
cor(rank.table$apRank, rank.table$logisticrank, use="pairwise.complete.obs")
## [1] 0.9389
cor(rank.table$apRank, rank.table$logistic2rank, use="pairwise.complete.obs")
## [1] 0.9389
The correlation after removing the teams playing less than 5 matches seems to have worsened slightly. Looking at the standard deviation,
sd(rank.table$logisticrank - rank.table$apRank, na.rm=TRUE)
## [1] 3.278
sd(rank.table$logistic2rank - rank.table$apRank, na.rm=TRUE)
## [1] 3.278
sd(rank.table$lmrank - rank.table$apRank, na.rm=TRUE)
## [1] 10.68
we see that we are performing slightly worse in the logistical regression after filtering. In any case, our logistic model is substantially better than the linear model. For the sake of completeness, let us plot all the ranks again.
plot(rank.table$apRank, rank.table$usaTodayRank, xlab="rank using various methods", ylab="aprank");
points(rank.table$apRank, rank.table$lmrank, col="blue", pch=18);
points(rank.table$apRank, rank.table$logisticrank, col="red", pch=17);
points(rank.table$apRank, rank.table$logistic2rank, col="green", pch=17);
grid();
In summary, logistic model seems to perform best, and corresponds better to voters decision. Since the voters ranks correspond better to the model without removing the outliers, it is possible that the voters are unduly biased by convincing victories against weak teams.
###########################################################################################
(c) Let us see how many teams are better than stanford at the 95% confidence level for the linear model. For this, we both need Pr<0.05 and coefficient>0.
z = summary(homeAdv.mod); z = z$coefficients;
n = nrow(z);
Pr = z[2:n,4];
Coef = z[2:n,1];
tf = Pr < 0.05 & Coef > 0;
sum(tf)
## [1] 13
rank.table$lmFlag = FALSE;
loc = match(names(tf), paste("`",rank.table$team,"`", sep=""));
rank.table[loc,"lmFlag"] = tf;
We now do the same thing for the logistic model.
z = summary(mod3);
z = z$coefficients;
n = nrow(z);
z = z[2:n,];
Pr = z[,4];
stanCoef = z[grep("stanford", rownames(z)), 1]
Coef = z[,1];
tf = Pr < 0.05 & Coef > stanCoef;
sum(tf)
## [1] 66
rank.table$glmFlag = FALSE;
loc = match(names(tf), paste("`",rank.table$team,"`", sep=""));
rank.table[loc,"glmFlag"] = tf;
Fraction of teams which are better than stanford for the linear and logistic models:
sum(rank.table$lmFlag == TRUE)/nrow(rank.table)
## [1] 0.03202
sum(rank.table$glmFlag == TRUE)/nrow(rank.table)
## [1] 0.1626
###########################################################################################
(d) Cross-validation
Let us start with the linear model
setwd("C:/Users/bbalasub/Desktop/Bala-docs/Stanford/2014-winter-Stats216/hw2");
rm(list=ls());
games<-read.csv("games.csv", as.is=TRUE);
teams<-read.csv("teams.csv", as.is=TRUE);
all.teams = sort(unique(c(teams$team,games$home,games$away)))
total.margin <- function(team)
{
with(games,
sum(homeScore[home==team])
+ sum(awayScore[away==team])
- sum(homeScore[away==team])
- sum(awayScore[home==team]))
}
margins <- sapply(all.teams, total.margin);
names(margins) <- all.teams;
num.games <- function(team)
{
with(games, sum(home==team) + sum(away==team));
}
# start cross-validation here
X2 <- matrix(0,nrow(games),length(all.teams));
i = match(games$home, all.teams);
j = match(games$away, all.teams);
X2[cbind(1:nrow(games), i)] = 1;
X2[cbind(1:nrow(games), j)] = -1;
X2 = as.data.frame(X2);
names(X2) <- all.teams;
margins = games$homeScore - games$awayScore;
y2 = margins;
X2 <- X2[,names(X2) != "stanford-cardinal"];
homeAdv <- 1 - games$neutralLocation
# baseline regression of everything
reg.season.games <- games$gameType=="REG";
homeAdv.mod0 <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=reg.season.games)
# subselect sample before cross-validation to include only regular games
y2 = subset(y2, reg.season.games);
X2 = X2[reg.season.games,];
homeAdv = subset(homeAdv, reg.season.games);
margins = subset(margins, reg.season.games);
actResult = margins;
actResult[actResult < 0] = 0;
actResult[actResult > 0] = 1;
actResult = as.logical(actResult);
# read for cross-validation now
n = length(y2);
linPredict = logical(n);
linPredict[1:length(linPredict)] = NA;
for (i in 1:10)
{
# starting and ending index of validation sets
stIdx = ceiling(n/10)*(i-1)+1;
enIdx = min(ceiling(n/10)*(i), n);
tf = !logical(n);
tf[stIdx:enIdx] = FALSE;
# training
fit <- lm(y2 ~ 0 + homeAdv + ., data=X2, subset=tf);
# predict
ym = predict(fit, newdata=X2, type="response");
temp = ym;
temp[temp<0] = 0; # home team loses
temp[temp>0] = 1; # home team wins
temp = as.logical(temp);
linPredict[stIdx:enIdx] = temp[stIdx:enIdx];
}
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
Now, we do the cross-validation for logistic regression
# read for cross-validation now
n = length(y2);
logPredict = logical(n);
logPredict[1:length(logPredict)] = NA;
for (i in 1:10)
{
# starting and ending index of validation sets
stIdx = ceiling(n/10)*(i-1)+1;
enIdx = min(ceiling(n/10)*(i), n);
tf = !logical(n);
tf[stIdx:enIdx] = FALSE;
# training
fit <- glm(actResult ~ 0 + homeAdv + ., data=X2, subset=tf, family=binomial);
# predict
ym = predict(fit, newdata=X2, type="response");
temp = ym;
temp[temp<0.5] = 0; # home team loses
temp[temp>=0.5] = 1; # home team wins
temp = as.logical(temp);
logPredict[stIdx:enIdx] = temp[stIdx:enIdx];
}
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
## Warning: prediction from a rank-deficient fit may be misleading
Create the confusion matrix now.
d = data.frame(act=actResult, lin=linPredict, log=logPredict)
n11 = sum(d$lin==d$act & d$log==d$act)
n12 = sum(d$lin==d$act & d$log!=d$act)
n21 = sum(d$lin!=d$act & d$log==d$act)
n22 = sum(d$lin!=d$act & d$log!=d$act)
a = c(n11,n21,n12,n22)
dim(a) = c(2,2)
print(a)
## [,1] [,2]
## [1,] 3295 465
## [2,] 327 1308