rm(list=ls())
setwd("~/stats/kaggle/kobe/")
df <- read.csv("data.csv")
df$shot_made_flag <- as.factor(df$shot_made_flag)
df$shot_made_flag <- factor(df$shot_made_flag, levels = c(1,0))
sample_submission <- read.csv("~/Downloads/sample_submission-2.csv")
train <- subset(df, !is.na(df$shot_made_flag))
test <- subset(df, is.na(df$shot_made_flag))
train <- droplevels(train)
test <- droplevels(test)
levels(test$action_type) <- levels(train$action_type)
train$shot_made_flag <- as.factor(train$shot_made_flag)
submission <- data.frame(shot_id=rownames(test), shot_made_flag = rep(NA,nrow(test)))
add <- data.frame(shot_id=c(22624,22578), shot_made_flag = c(1,1))
#Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) :
#factor action_type has new levels Cutting Finger Roll Layup Shot, Turnaround Fadeaway Bank Jump Shot
which(df$action_type == "Cutting Finger Roll Layup Shot" , arr.ind = T)
## [1] 22624
which(df$action_type == "Turnaround Fadeaway Bank Jump Shot", arr.ind=T)
## [1] 22578
df <- df[-c(22624,22578),]
SPLIT DATA INTO SEASON FOR SEPERATE ANALYSIS OF EACH SEASON IN ORDER TO AVOID DATA LEAKAGE
season <- df$season
df.season <- split(df, season)
df$time_remaining <- df$minutes_remaining * 60 + df$seconds_remaining
df$seconds_remaining <- NULL
df$minutes_remaining <- NULL
#### calculate the distance by using the x and y coordinate
distance <- sqrt((df$loc_x/10) ** 2 + (df$loc_y/10)**2)
data.frame(df$shot_distance, distance)[1:10,]
## df.shot_distance distance
## 1 18 18.185984
## 2 15 15.700000
## 3 16 16.860012
## 4 22 22.286543
## 5 0 0.000000
## 6 14 14.541664
## 7 0 0.000000
## 8 2 2.801785
## 9 12 12.605158
## 10 12 12.928264
#### replace the rounded values of the df with the actual ones
df$shot_distance <- distance
areas <- split(df, df$shot_zone_area)
ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_zone_area), alpha=0.7, size=3) + theme_void()
ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_zone_basic))
ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_distance), alpha=0.7, size=3)
ggplot(subset(df, !is.na(shot_made_flag)), aes(x = loc_x, y = loc_y)) +
geom_point(aes(color = shot_made_flag), alpha = 0.5, size = 0.5) +
ylim(c(-50, 400)) +
theme_void() +
scale_color_brewer(palette = "Set1") +
facet_grid(~ shot_made_flag) +
labs(title = "Shots Made(Blue) vs. Shots Missed(Red)")
## Warning: Removed 69 rows containing missing values (geom_point).
#### check team_id and team_name
table(df$team_id, df$team_name)
##
## Los Angeles Lakers
## 1610612747 30695
#### Seems like he never played for another team, so just drop these columns
df$team_id <- NULL
df$team_name <- NULL
#### split date into year,month, day
date <- df$game_date
df <- df %>% separate(game_date, c("year", "month", "day"), sep="-")
df$year <- as.factor(df$year)
df$month <- as.factor(df$month)
#### game_event and game_id not needed
df$game_id <- NULL
df$game_event_id <- NULL
#### correlate x_loc, y_loc, lon, lat
mat <- data.frame(loc_x = df$loc_x, loc_y = df$loc_y, lat = df$lat, lon = df$lon)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.2.5
cor.matrix <- cor(mat)
corrplot(cor.matrix, method="number")
corrplot.mixed(cor.matrix)
#### lon, x_loc and y_loc, lat got cor = 1 so we can delete lon and lat
df$lon <- NULL
df$lat <- NULL
#### checkout season feature
df$season[1:10]
## [1] 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01
## [9] 2000-01 2000-01
## 20 Levels: 1996-97 1997-98 1998-99 1999-00 2000-01 2001-02 ... 2015-16
levels(df$season)
## [1] "1996-97" "1997-98" "1998-99" "1999-00" "2000-01" "2001-02" "2002-03"
## [8] "2003-04" "2004-05" "2005-06" "2006-07" "2007-08" "2008-09" "2009-10"
## [15] "2010-11" "2011-12" "2012-13" "2013-14" "2014-15" "2015-16"
#### keep only the starting year of the season, delete the ending year
df <- df %>% separate(season, c("season", "rubbish"), sep="-")
df$rubbish<- NULL
bkup <- df
df$season <- as.factor(df$season)
#### turn season into integer
table(df$shot_type)
##
## 2PT Field Goal 3PT Field Goal
## 24269 6426
#cor(df$shot_type, df$time_remaining)
t.test(df$time_remaining~df$shot_type)
##
## Welch Two Sample t-test
##
## data: df$time_remaining by df$shot_type
## t = 20.78, df = 9954.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 55.14705 66.63495
## sample estimates:
## mean in group 2PT Field Goal mean in group 3PT Field Goal
## 334.2323 273.3413
fit <- glm(time_remaining~shot_type, data=df)
summary(fit)
##
## Call:
## glm(formula = time_remaining ~ shot_type, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -334.23 -179.79 -18.23 172.77 435.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 334.232 1.327 251.9 <2e-16 ***
## shot_type3PT Field Goal -60.891 2.900 -21.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 42722.53)
##
## Null deviance: 1330120509 on 30694 degrees of freedom
## Residual deviance: 1311282664 on 30693 degrees of freedom
## AIC: 414398
##
## Number of Fisher Scoring iterations: 2
table(df$period)
##
## 1 2 3 4 5 6 7
## 8048 6718 8294 7260 330 38 7
#### turn cartesian into polar coordinates
library(pracma)
## Warning: package 'pracma' was built under R version 3.2.5
polarcoords <- pracma::cart2pol(cbind(df$loc_x,df$loc_y))
df$angle <- polarcoords[,2]
train <- subset(df, !is.na(df$shot_made_flag))
test <- subset(df, is.na(df$shot_made_flag))
fit <- glm(shot_made_flag~ time_remaining+year+month+shot_zone_basic+shot_zone_area+shot_distance+period+shot_type, family = binomial(link="logit"), data=train)
summary(fit)
##
## Call:
## glm(formula = shot_made_flag ~ time_remaining + year + month +
## shot_zone_basic + shot_zone_area + shot_distance + period +
## shot_type, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7805 -1.2608 0.8774 1.0370 1.5362
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 1.230e+01 9.355e+01 0.131
## time_remaining -1.998e-04 6.265e-05 -3.189
## year1997 -2.796e-01 2.736e-01 -1.022
## year1998 -3.144e-01 2.823e-01 -1.114
## year1999 -4.508e-01 2.740e-01 -1.645
## year2000 -5.335e-01 2.700e-01 -1.976
## year2001 -4.747e-01 2.711e-01 -1.751
## year2002 -3.749e-01 2.698e-01 -1.389
## year2003 -3.908e-01 2.707e-01 -1.443
## year2004 -3.571e-01 2.707e-01 -1.319
## year2005 -4.491e-01 2.713e-01 -1.655
## year2006 -6.020e-01 2.702e-01 -2.228
## year2007 -5.065e-01 2.705e-01 -1.873
## year2008 -5.748e-01 2.699e-01 -2.129
## year2009 -5.645e-01 2.695e-01 -2.095
## year2010 -4.719e-01 2.702e-01 -1.746
## year2011 -5.202e-01 2.736e-01 -1.901
## year2012 -4.592e-01 2.699e-01 -1.701
## year2013 -4.899e-01 2.757e-01 -1.777
## year2014 -2.659e-01 2.800e-01 -0.950
## year2015 -2.028e-01 2.819e-01 -0.719
## year2016 -2.546e-01 2.829e-01 -0.900
## month02 -4.833e-03 4.847e-02 -0.100
## month03 7.517e-02 4.585e-02 1.640
## month04 7.331e-02 4.991e-02 1.469
## month05 -1.283e-03 5.617e-02 -0.023
## month06 1.406e-01 8.483e-02 1.657
## month10 1.383e-01 1.209e-01 1.144
## month11 3.477e-02 4.954e-02 0.702
## month12 2.912e-02 4.736e-02 0.615
## shot_zone_basicBackcourt -8.895e+00 9.355e+01 -0.095
## shot_zone_basicIn The Paint (Non-RA) -4.954e-01 2.804e-01 -1.767
## shot_zone_basicLeft Corner 3 -2.838e-01 1.454e-01 -1.952
## shot_zone_basicMid-Range -5.006e-01 2.705e-01 -1.851
## shot_zone_basicRestricted Area -9.621e-01 2.940e-01 -3.272
## shot_zone_basicRight Corner 3 -1.060e-01 1.280e-01 -0.829
## shot_zone_areaCenter(C) -1.145e+01 9.355e+01 -0.122
## shot_zone_areaLeft Side Center(LC) -1.146e+01 9.355e+01 -0.122
## shot_zone_areaLeft Side(L) -1.134e+01 9.355e+01 -0.121
## shot_zone_areaRight Side Center(RC) -1.153e+01 9.355e+01 -0.123
## shot_zone_areaRight Side(R) -1.137e+01 9.355e+01 -0.122
## shot_distance 2.319e-02 5.397e-03 4.297
## period 4.274e-02 1.125e-02 3.799
## shot_type3PT Field Goal -3.444e-01 2.710e-01 -1.271
## Pr(>|z|)
## (Intercept) 0.895432
## time_remaining 0.001430 **
## year1997 0.306795
## year1998 0.265422
## year1999 0.099946 .
## year2000 0.048151 *
## year2001 0.079991 .
## year2002 0.164729
## year2003 0.148917
## year2004 0.187131
## year2005 0.097852 .
## year2006 0.025895 *
## year2007 0.061075 .
## year2008 0.033233 *
## year2009 0.036195 *
## year2010 0.080793 .
## year2011 0.057279 .
## year2012 0.088874 .
## year2013 0.075568 .
## year2014 0.342358
## year2015 0.471982
## year2016 0.368037
## month02 0.920580
## month03 0.101086
## month04 0.141909
## month05 0.981782
## month06 0.097532 .
## month10 0.252815
## month11 0.482844
## month12 0.538632
## shot_zone_basicBackcourt 0.924249
## shot_zone_basicIn The Paint (Non-RA) 0.077308 .
## shot_zone_basicLeft Corner 3 0.050991 .
## shot_zone_basicMid-Range 0.064191 .
## shot_zone_basicRestricted Area 0.001068 **
## shot_zone_basicRight Corner 3 0.407304
## shot_zone_areaCenter(C) 0.902590
## shot_zone_areaLeft Side Center(LC) 0.902506
## shot_zone_areaLeft Side(L) 0.903516
## shot_zone_areaRight Side Center(RC) 0.901945
## shot_zone_areaRight Side(R) 0.903237
## shot_distance 1.73e-05 ***
## period 0.000145 ***
## shot_type3PT Field Goal 0.203726
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 35325 on 25696 degrees of freedom
## Residual deviance: 34075 on 25653 degrees of freedom
## AIC: 34163
##
## Number of Fisher Scoring iterations: 11
y_pred <- predict(fit, newdata = test, type="response")
submission <- data.frame(shot_id=test$shot_id, shot_made_flag=y_pred)
submission[1:45,]
## shot_id shot_made_flag
## 1 1 0.5904961
## 8 8 0.3977848
## 17 17 0.3607748
## 20 20 0.3508380
## 33 33 0.5612013
## 34 34 0.5399670
## 35 35 0.3501978
## 36 36 0.3546253
## 37 37 0.3516078
## 38 38 0.5592184
## 45 45 0.3803782
## 50 50 0.6227977
## 55 55 0.6538309
## 60 60 0.6519908
## 66 66 0.5282187
## 67 67 0.5444308
## 71 71 0.5869660
## 80 80 0.5136570
## 85 85 0.4882963
## 86 86 0.5545440
## 95 95 0.5651438
## 104 104 0.5765793
## 113 113 0.6207887
## 123 123 0.5961855
## 126 126 0.5562377
## 133 133 0.3725072
## 141 141 0.5358888
## 144 144 0.6243448
## 150 150 0.3430100
## 152 152 0.5680470
## 153 153 0.3613728
## 156 156 0.3695250
## 159 159 0.3698497
## 165 165 0.3752181
## 172 172 0.5676987
## 182 182 0.3579664
## 192 192 0.5727412
## 193 193 0.3443176
## 196 196 0.5786752
## 202 202 0.5912729
## 210 210 0.6714016
## 211 211 0.6067702
## 228 228 0.3805724
## 257 257 0.5425504
## 260 260 0.5513112
write.csv(submission, "12.06.csv", row.names = F)
#### SLIDING WINDOW ANALYSIS
train.bySeason <- split(train, train$season)
test.bySeason <- split(test, test$season)
preds <- rep(NA, 5000)
names <- NA
for(i in 1:length(train.bySeason)){
temp.train <- train.bySeason[[i]]
temp.test <- test.bySeason[[i]]
fit <- glm(shot_made_flag ~ time_remaining+shot_distance+opponent, family =binomial(link = "logit"), data=temp.train)
y <- predict(fit, newdata=temp.test, type="response")
x <- temp.test$shot_id
if(i == 1){
preds <- y
names <- x
}
else{
preds <- dplyr::combine(preds,y)
names <- dplyr::combine(names, x)
}
}
sub.season <- data.frame(shot_id = names, shot_made_flag = preds)
summary(fit)
##
## Call:
## glm(formula = shot_made_flag ~ time_remaining + shot_distance +
## opponent, family = binomial(link = "logit"), data = temp.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1502 -1.2221 0.7248 0.9218 1.6937
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1009105 0.7945237 1.386 0.1659
## time_remaining -0.0005645 0.0003941 -1.432 0.1520
## shot_distance 0.0503233 0.0086996 5.785 7.27e-09 ***
## opponentBKN -0.8665728 0.9873411 -0.878 0.3801
## opponentBOS -0.4773234 0.8655967 -0.551 0.5813
## opponentCHA -1.3740293 0.8464698 -1.623 0.1045
## opponentCHI -1.1954886 0.8699043 -1.374 0.1694
## opponentCLE -2.0153405 0.8540870 -2.360 0.0183 *
## opponentDAL -1.2116106 0.8711560 -1.391 0.1643
## opponentDEN -1.4795928 0.8144993 -1.817 0.0693 .
## opponentDET -0.1896877 0.9135205 -0.208 0.8355
## opponentGSW -0.4490137 0.8743702 -0.514 0.6076
## opponentHOU -1.9834229 0.8105831 -2.447 0.0144 *
## opponentIND -0.5884486 0.8623946 -0.682 0.4950
## opponentLAC -0.7094344 0.8549163 -0.830 0.4066
## opponentMEM -1.1341333 0.8363348 -1.356 0.1751
## opponentMIA 0.1856452 1.3272501 0.140 0.8888
## opponentMIL -1.1211352 0.8912045 -1.258 0.2084
## opponentMIN -1.4706290 0.8120443 -1.811 0.0701 .
## opponentNOP -0.9936995 0.8371262 -1.187 0.2352
## opponentNYK -1.2281369 0.8577733 -1.432 0.1522
## opponentOKC -1.2852669 0.8259073 -1.556 0.1197
## opponentPHI -0.4641344 0.9480982 -0.490 0.6245
## opponentPHX -1.5490508 0.9541429 -1.623 0.1045
## opponentPOR -1.2423604 0.8248961 -1.506 0.1320
## opponentSAC -1.5478334 0.8287141 -1.868 0.0618 .
## opponentSAS -1.1666570 0.8047705 -1.450 0.1471
## opponentTOR -1.5365870 0.8678105 -1.771 0.0766 .
## opponentUTA -1.1687636 0.8112621 -1.441 0.1497
## opponentWAS -1.5992004 0.8363554 -1.912 0.0559 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1213.9 on 931 degrees of freedom
## Residual deviance: 1135.4 on 902 degrees of freedom
## AIC: 1195.4
##
## Number of Fisher Scoring iterations: 4
max(table(sub.season))
## [1] 1
write.csv(sub.season, "sub.season2.csv", row.names = F)
#### RF
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
preds.rf <- rep(NA, 5000)
names <- NA
for(i in 1:length(train.bySeason)){
temp.train <- train.bySeason[[i]]
temp.test <- test.bySeason[[i]]
rf.fit <- randomForest(shot_made_flag ~ .- matchup - shot_id, type ="reg", data=temp.train[ , !(names(temp.train) %in% c('action_type'))], ntrees=3001)
y <- predict(rf.fit, newdata=temp.test, type="class")
x <- temp.test$shot_id
if(i == 1){
preds.rf <- y
names <- x
}
else{
preds.rf <- dplyr::combine(preds.rf,y)
names <- dplyr::combine(names, x)
}
varImpPlot(rf.fit)
}
sub.season.rf <- data.frame(shot_id = names, shot_made_flag = preds.rf)