rm(list=ls())
setwd("~/stats/kaggle/kobe/")
df <- read.csv("data.csv") 
df$shot_made_flag <- as.factor(df$shot_made_flag)
df$shot_made_flag <- factor(df$shot_made_flag, levels = c(1,0))
sample_submission <- read.csv("~/Downloads/sample_submission-2.csv")
train <- subset(df, !is.na(df$shot_made_flag))
test <- subset(df, is.na(df$shot_made_flag))
train <- droplevels(train)
test <- droplevels(test)
levels(test$action_type) <- levels(train$action_type)
train$shot_made_flag <- as.factor(train$shot_made_flag)
submission <- data.frame(shot_id=rownames(test), shot_made_flag = rep(NA,nrow(test)))
add <- data.frame(shot_id=c(22624,22578), shot_made_flag = c(1,1))
#Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = object$xlevels) : 
  #factor action_type has new levels Cutting Finger Roll Layup Shot, Turnaround Fadeaway Bank Jump Shot

which(df$action_type == "Cutting Finger Roll Layup Shot" , arr.ind = T)
## [1] 22624
which(df$action_type == "Turnaround Fadeaway Bank Jump Shot", arr.ind=T)
## [1] 22578
df <- df[-c(22624,22578),]

SPLIT DATA INTO SEASON FOR SEPERATE ANALYSIS OF EACH SEASON IN ORDER TO AVOID DATA LEAKAGE

season <- df$season
df.season <- split(df, season)
df$time_remaining <- df$minutes_remaining * 60 + df$seconds_remaining
df$seconds_remaining <- NULL
df$minutes_remaining <- NULL





#### calculate the distance by using the x and y coordinate
distance <- sqrt((df$loc_x/10) ** 2 + (df$loc_y/10)**2)
data.frame(df$shot_distance, distance)[1:10,]
##    df.shot_distance  distance
## 1                18 18.185984
## 2                15 15.700000
## 3                16 16.860012
## 4                22 22.286543
## 5                 0  0.000000
## 6                14 14.541664
## 7                 0  0.000000
## 8                 2  2.801785
## 9                12 12.605158
## 10               12 12.928264
#### replace the rounded values of the df with the actual ones
df$shot_distance <- distance

areas <- split(df, df$shot_zone_area)

ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_zone_area), alpha=0.7, size=3) + theme_void() 

ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_zone_basic)) 

ggplot(data=df, aes(x=df$loc_x, y=df$loc_y)) + geom_point(aes(color=df$shot_distance), alpha=0.7, size=3) 

ggplot(subset(df, !is.na(shot_made_flag)), aes(x = loc_x, y = loc_y)) +
    geom_point(aes(color = shot_made_flag), alpha = 0.5, size = 0.5) +
    ylim(c(-50, 400)) +
    theme_void() +
    scale_color_brewer(palette = "Set1") +
    facet_grid(~ shot_made_flag) +
    labs(title = "Shots Made(Blue) vs. Shots Missed(Red)")
## Warning: Removed 69 rows containing missing values (geom_point).

#### check team_id and team_name
table(df$team_id, df$team_name)
##             
##              Los Angeles Lakers
##   1610612747              30695
#### Seems like he never played for another team, so just drop these columns
df$team_id <- NULL
df$team_name <- NULL

#### split date into year,month, day
date <- df$game_date
df <- df %>% separate(game_date, c("year", "month", "day"), sep="-")
df$year <- as.factor(df$year)
df$month <- as.factor(df$month)
#### game_event and game_id not needed
df$game_id <- NULL
df$game_event_id <- NULL

#### correlate x_loc, y_loc, lon, lat
mat <- data.frame(loc_x = df$loc_x, loc_y = df$loc_y, lat = df$lat, lon = df$lon)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.2.5
cor.matrix <- cor(mat)
corrplot(cor.matrix, method="number")

corrplot.mixed(cor.matrix)

#### lon, x_loc and y_loc, lat got cor = 1 so we can delete lon and lat
df$lon <- NULL
df$lat <- NULL

#### checkout season feature
df$season[1:10]
##  [1] 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01 2000-01
##  [9] 2000-01 2000-01
## 20 Levels: 1996-97 1997-98 1998-99 1999-00 2000-01 2001-02 ... 2015-16
levels(df$season)
##  [1] "1996-97" "1997-98" "1998-99" "1999-00" "2000-01" "2001-02" "2002-03"
##  [8] "2003-04" "2004-05" "2005-06" "2006-07" "2007-08" "2008-09" "2009-10"
## [15] "2010-11" "2011-12" "2012-13" "2013-14" "2014-15" "2015-16"
#### keep only the starting year of the season, delete the ending year
df <- df %>% separate(season, c("season", "rubbish"), sep="-")
df$rubbish<- NULL
bkup <- df
df$season <- as.factor(df$season)
#### turn season into integer 
table(df$shot_type)
## 
## 2PT Field Goal 3PT Field Goal 
##          24269           6426
#cor(df$shot_type, df$time_remaining)
t.test(df$time_remaining~df$shot_type)
## 
##  Welch Two Sample t-test
## 
## data:  df$time_remaining by df$shot_type
## t = 20.78, df = 9954.4, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  55.14705 66.63495
## sample estimates:
## mean in group 2PT Field Goal mean in group 3PT Field Goal 
##                     334.2323                     273.3413
fit <- glm(time_remaining~shot_type, data=df)
summary(fit)
## 
## Call:
## glm(formula = time_remaining ~ shot_type, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -334.23  -179.79   -18.23   172.77   435.66  
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              334.232      1.327   251.9   <2e-16 ***
## shot_type3PT Field Goal  -60.891      2.900   -21.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 42722.53)
## 
##     Null deviance: 1330120509  on 30694  degrees of freedom
## Residual deviance: 1311282664  on 30693  degrees of freedom
## AIC: 414398
## 
## Number of Fisher Scoring iterations: 2
table(df$period)
## 
##    1    2    3    4    5    6    7 
## 8048 6718 8294 7260  330   38    7
#### turn cartesian into polar coordinates

library(pracma)
## Warning: package 'pracma' was built under R version 3.2.5
polarcoords <- pracma::cart2pol(cbind(df$loc_x,df$loc_y))
df$angle <- polarcoords[,2]
train <- subset(df, !is.na(df$shot_made_flag))
test <- subset(df, is.na(df$shot_made_flag))
fit <- glm(shot_made_flag~ time_remaining+year+month+shot_zone_basic+shot_zone_area+shot_distance+period+shot_type, family = binomial(link="logit"), data=train)
summary(fit)
## 
## Call:
## glm(formula = shot_made_flag ~ time_remaining + year + month + 
##     shot_zone_basic + shot_zone_area + shot_distance + period + 
##     shot_type, family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7805  -1.2608   0.8774   1.0370   1.5362  
## 
## Coefficients:
##                                        Estimate Std. Error z value
## (Intercept)                           1.230e+01  9.355e+01   0.131
## time_remaining                       -1.998e-04  6.265e-05  -3.189
## year1997                             -2.796e-01  2.736e-01  -1.022
## year1998                             -3.144e-01  2.823e-01  -1.114
## year1999                             -4.508e-01  2.740e-01  -1.645
## year2000                             -5.335e-01  2.700e-01  -1.976
## year2001                             -4.747e-01  2.711e-01  -1.751
## year2002                             -3.749e-01  2.698e-01  -1.389
## year2003                             -3.908e-01  2.707e-01  -1.443
## year2004                             -3.571e-01  2.707e-01  -1.319
## year2005                             -4.491e-01  2.713e-01  -1.655
## year2006                             -6.020e-01  2.702e-01  -2.228
## year2007                             -5.065e-01  2.705e-01  -1.873
## year2008                             -5.748e-01  2.699e-01  -2.129
## year2009                             -5.645e-01  2.695e-01  -2.095
## year2010                             -4.719e-01  2.702e-01  -1.746
## year2011                             -5.202e-01  2.736e-01  -1.901
## year2012                             -4.592e-01  2.699e-01  -1.701
## year2013                             -4.899e-01  2.757e-01  -1.777
## year2014                             -2.659e-01  2.800e-01  -0.950
## year2015                             -2.028e-01  2.819e-01  -0.719
## year2016                             -2.546e-01  2.829e-01  -0.900
## month02                              -4.833e-03  4.847e-02  -0.100
## month03                               7.517e-02  4.585e-02   1.640
## month04                               7.331e-02  4.991e-02   1.469
## month05                              -1.283e-03  5.617e-02  -0.023
## month06                               1.406e-01  8.483e-02   1.657
## month10                               1.383e-01  1.209e-01   1.144
## month11                               3.477e-02  4.954e-02   0.702
## month12                               2.912e-02  4.736e-02   0.615
## shot_zone_basicBackcourt             -8.895e+00  9.355e+01  -0.095
## shot_zone_basicIn The Paint (Non-RA) -4.954e-01  2.804e-01  -1.767
## shot_zone_basicLeft Corner 3         -2.838e-01  1.454e-01  -1.952
## shot_zone_basicMid-Range             -5.006e-01  2.705e-01  -1.851
## shot_zone_basicRestricted Area       -9.621e-01  2.940e-01  -3.272
## shot_zone_basicRight Corner 3        -1.060e-01  1.280e-01  -0.829
## shot_zone_areaCenter(C)              -1.145e+01  9.355e+01  -0.122
## shot_zone_areaLeft Side Center(LC)   -1.146e+01  9.355e+01  -0.122
## shot_zone_areaLeft Side(L)           -1.134e+01  9.355e+01  -0.121
## shot_zone_areaRight Side Center(RC)  -1.153e+01  9.355e+01  -0.123
## shot_zone_areaRight Side(R)          -1.137e+01  9.355e+01  -0.122
## shot_distance                         2.319e-02  5.397e-03   4.297
## period                                4.274e-02  1.125e-02   3.799
## shot_type3PT Field Goal              -3.444e-01  2.710e-01  -1.271
##                                      Pr(>|z|)    
## (Intercept)                          0.895432    
## time_remaining                       0.001430 ** 
## year1997                             0.306795    
## year1998                             0.265422    
## year1999                             0.099946 .  
## year2000                             0.048151 *  
## year2001                             0.079991 .  
## year2002                             0.164729    
## year2003                             0.148917    
## year2004                             0.187131    
## year2005                             0.097852 .  
## year2006                             0.025895 *  
## year2007                             0.061075 .  
## year2008                             0.033233 *  
## year2009                             0.036195 *  
## year2010                             0.080793 .  
## year2011                             0.057279 .  
## year2012                             0.088874 .  
## year2013                             0.075568 .  
## year2014                             0.342358    
## year2015                             0.471982    
## year2016                             0.368037    
## month02                              0.920580    
## month03                              0.101086    
## month04                              0.141909    
## month05                              0.981782    
## month06                              0.097532 .  
## month10                              0.252815    
## month11                              0.482844    
## month12                              0.538632    
## shot_zone_basicBackcourt             0.924249    
## shot_zone_basicIn The Paint (Non-RA) 0.077308 .  
## shot_zone_basicLeft Corner 3         0.050991 .  
## shot_zone_basicMid-Range             0.064191 .  
## shot_zone_basicRestricted Area       0.001068 ** 
## shot_zone_basicRight Corner 3        0.407304    
## shot_zone_areaCenter(C)              0.902590    
## shot_zone_areaLeft Side Center(LC)   0.902506    
## shot_zone_areaLeft Side(L)           0.903516    
## shot_zone_areaRight Side Center(RC)  0.901945    
## shot_zone_areaRight Side(R)          0.903237    
## shot_distance                        1.73e-05 ***
## period                               0.000145 ***
## shot_type3PT Field Goal              0.203726    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 35325  on 25696  degrees of freedom
## Residual deviance: 34075  on 25653  degrees of freedom
## AIC: 34163
## 
## Number of Fisher Scoring iterations: 11
y_pred <- predict(fit, newdata = test, type="response")
submission <- data.frame(shot_id=test$shot_id, shot_made_flag=y_pred)
submission[1:45,]
##     shot_id shot_made_flag
## 1         1      0.5904961
## 8         8      0.3977848
## 17       17      0.3607748
## 20       20      0.3508380
## 33       33      0.5612013
## 34       34      0.5399670
## 35       35      0.3501978
## 36       36      0.3546253
## 37       37      0.3516078
## 38       38      0.5592184
## 45       45      0.3803782
## 50       50      0.6227977
## 55       55      0.6538309
## 60       60      0.6519908
## 66       66      0.5282187
## 67       67      0.5444308
## 71       71      0.5869660
## 80       80      0.5136570
## 85       85      0.4882963
## 86       86      0.5545440
## 95       95      0.5651438
## 104     104      0.5765793
## 113     113      0.6207887
## 123     123      0.5961855
## 126     126      0.5562377
## 133     133      0.3725072
## 141     141      0.5358888
## 144     144      0.6243448
## 150     150      0.3430100
## 152     152      0.5680470
## 153     153      0.3613728
## 156     156      0.3695250
## 159     159      0.3698497
## 165     165      0.3752181
## 172     172      0.5676987
## 182     182      0.3579664
## 192     192      0.5727412
## 193     193      0.3443176
## 196     196      0.5786752
## 202     202      0.5912729
## 210     210      0.6714016
## 211     211      0.6067702
## 228     228      0.3805724
## 257     257      0.5425504
## 260     260      0.5513112
write.csv(submission, "12.06.csv", row.names = F)
#### SLIDING WINDOW ANALYSIS
train.bySeason <- split(train, train$season)
test.bySeason <- split(test, test$season)
preds <- rep(NA, 5000)
names <- NA
for(i in 1:length(train.bySeason)){
  temp.train <- train.bySeason[[i]]
  temp.test <- test.bySeason[[i]]
  fit <- glm(shot_made_flag ~ time_remaining+shot_distance+opponent, family =binomial(link = "logit"), data=temp.train)
  y <- predict(fit, newdata=temp.test, type="response")
  x <- temp.test$shot_id
  if(i == 1){
    preds <- y
    names <- x
  }
  else{
    preds <- dplyr::combine(preds,y)
    names <- dplyr::combine(names, x)
  }
}
sub.season <- data.frame(shot_id = names, shot_made_flag = preds)
summary(fit)
## 
## Call:
## glm(formula = shot_made_flag ~ time_remaining + shot_distance + 
##     opponent, family = binomial(link = "logit"), data = temp.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1502  -1.2221   0.7248   0.9218   1.6937  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     1.1009105  0.7945237   1.386   0.1659    
## time_remaining -0.0005645  0.0003941  -1.432   0.1520    
## shot_distance   0.0503233  0.0086996   5.785 7.27e-09 ***
## opponentBKN    -0.8665728  0.9873411  -0.878   0.3801    
## opponentBOS    -0.4773234  0.8655967  -0.551   0.5813    
## opponentCHA    -1.3740293  0.8464698  -1.623   0.1045    
## opponentCHI    -1.1954886  0.8699043  -1.374   0.1694    
## opponentCLE    -2.0153405  0.8540870  -2.360   0.0183 *  
## opponentDAL    -1.2116106  0.8711560  -1.391   0.1643    
## opponentDEN    -1.4795928  0.8144993  -1.817   0.0693 .  
## opponentDET    -0.1896877  0.9135205  -0.208   0.8355    
## opponentGSW    -0.4490137  0.8743702  -0.514   0.6076    
## opponentHOU    -1.9834229  0.8105831  -2.447   0.0144 *  
## opponentIND    -0.5884486  0.8623946  -0.682   0.4950    
## opponentLAC    -0.7094344  0.8549163  -0.830   0.4066    
## opponentMEM    -1.1341333  0.8363348  -1.356   0.1751    
## opponentMIA     0.1856452  1.3272501   0.140   0.8888    
## opponentMIL    -1.1211352  0.8912045  -1.258   0.2084    
## opponentMIN    -1.4706290  0.8120443  -1.811   0.0701 .  
## opponentNOP    -0.9936995  0.8371262  -1.187   0.2352    
## opponentNYK    -1.2281369  0.8577733  -1.432   0.1522    
## opponentOKC    -1.2852669  0.8259073  -1.556   0.1197    
## opponentPHI    -0.4641344  0.9480982  -0.490   0.6245    
## opponentPHX    -1.5490508  0.9541429  -1.623   0.1045    
## opponentPOR    -1.2423604  0.8248961  -1.506   0.1320    
## opponentSAC    -1.5478334  0.8287141  -1.868   0.0618 .  
## opponentSAS    -1.1666570  0.8047705  -1.450   0.1471    
## opponentTOR    -1.5365870  0.8678105  -1.771   0.0766 .  
## opponentUTA    -1.1687636  0.8112621  -1.441   0.1497    
## opponentWAS    -1.5992004  0.8363554  -1.912   0.0559 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1213.9  on 931  degrees of freedom
## Residual deviance: 1135.4  on 902  degrees of freedom
## AIC: 1195.4
## 
## Number of Fisher Scoring iterations: 4
max(table(sub.season))
## [1] 1
write.csv(sub.season, "sub.season2.csv", row.names = F)
#### RF
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
preds.rf <- rep(NA, 5000)
names <- NA
for(i in 1:length(train.bySeason)){
  temp.train <- train.bySeason[[i]]
  temp.test <- test.bySeason[[i]]
  rf.fit <- randomForest(shot_made_flag ~ .- matchup - shot_id, type ="reg", data=temp.train[ , !(names(temp.train) %in% c('action_type'))], ntrees=3001)
  y <- predict(rf.fit, newdata=temp.test, type="class")
  x <- temp.test$shot_id
  if(i == 1){
    preds.rf <- y
    names <- x
  }
  else{
    preds.rf <- dplyr::combine(preds.rf,y)
    names <- dplyr::combine(names, x)
  }
  varImpPlot(rf.fit)
}

sub.season.rf <- data.frame(shot_id = names, shot_made_flag = preds.rf)