# Load the packages this script relies on: dplyr for distinct()/mutate()/
# filter() and ggplot2 for every plot below.
library(dplyr)
library(ggplot2)

# Load data
getwd()
## [1] "/Users/anesunyawata/Documents/Data110 Final Project"
players <- read.csv("players.csv")
salaries <- read.csv("salaries_1985to2018.csv")

# Create a common key so the two data sets can be merged. The first column of
# players.csv is treated as the player identifier and renamed to match the
# `player_id` column of the salaries table.
stopifnot("player_id" %in% colnames(salaries))
players_unique <- players
colnames(players_unique)[1] <- "player_id"

# Keep one row per player. De-duplicate on the join key rather than on `name`:
# if two different player_ids share a name, a name-based distinct() keeps only
# the first of them and the inner merge below then drops every salary row
# belonging to the other.
# NOTE: the recorded `##` output further down was produced with the earlier
# name-based de-duplication, so row counts may differ slightly on re-run.
players_unique <- distinct(players_unique, player_id, .keep_all = TRUE)

# Inner join: one row per (player, season) salary record with the player's
# career attributes attached.
full_data <- merge(players_unique, salaries, by = "player_id")

# Inspect column names and types of the merged data to confirm the merge worked
str(full_data)
## 'data.frame':    14024 obs. of  26 variables:
##  $ player_id   : chr  "abdelal01" "abdelal01" "abdelal01" "abdelal01" ...
##  $ birthDate   : chr  "24-Jun-68" "24-Jun-68" "24-Jun-68" "24-Jun-68" ...
##  $ birthPlace  : chr  "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" "Cairo, Egypt" ...
##  $ career_AST  : num  0.3 0.3 0.3 0.3 0.3 3.6 3.6 3.6 3.6 3.5 ...
##  $ career_FG.  : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ career_FG3. : chr  "0" "0" "0" "0" ...
##  $ career_FT.  : chr  "70.1" "70.1" "70.1" "70.1" ...
##  $ career_G    : int  256 256 256 256 256 1560 1560 1560 1560 586 ...
##  $ career_PER  : chr  "13" "13" "13" "13" ...
##  $ career_PTS  : num  5.7 5.7 5.7 5.7 5.7 24.6 24.6 24.6 24.6 14.6 ...
##  $ career_TRB  : chr  "3.3" "3.3" "3.3" "3.3" ...
##  $ career_WS   : chr  "4.8" "4.8" "4.8" "4.8" ...
##  $ career_eFG. : chr  "50.2" "50.2" "50.2" "50.2" ...
##  $ draft_pick  : chr  "25th overall" "25th overall" "25th overall" "25th overall" ...
##  $ draft_round : chr  "1st round" "1st round" "1st round" "1st round" ...
##  $ draft_team  : chr  "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" "Portland Trail Blazers" ...
##  $ draft_year  : chr  "1990" "1990" "1990" "1990" ...
##  $ name        : chr  "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" "Alaa Abdelnaby" ...
##  $ position    : chr  "Power Forward" "Power Forward" "Power Forward" "Power Forward" ...
##  $ shoots      : chr  "Right" "Right" "Right" "Right" ...
##  $ league      : chr  "NBA" "NBA" "NBA" "NBA" ...
##  $ salary      : int  500000 805000 494000 650000 395000 2000000 3000000 2030000 1530000 2008000 ...
##  $ season      : chr  "1992-93" "1993-94" "1991-92" "1994-95" ...
##  $ season_end  : int  1993 1994 1992 1995 1991 1988 1989 1986 1985 1992 ...
##  $ season_start: int  1992 1993 1991 1994 1990 1987 1988 1985 1984 1991 ...
##  $ team        : chr  "Boston Celtics" "Boston Celtics" "Portland Trail Blazers" "Sacramento Kings" ...
# First look at the distribution of raw salaries, using the default binning
ggplot(data = full_data, mapping = aes(x = salary)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Express salary in millions so the histogram's x-axis is easier to read
full_data$salary_millions <- full_data$salary / 1e6

# Customized histogram of salaries (in millions): blue bars with black
# outlines, descriptive labels, and a plain theme without grid lines
ggplot(data = full_data, mapping = aes(x = salary_millions)) +
  geom_histogram(colour = "black", fill = "blue") +
  labs(
    title = "Frequency of NBA salary ranges since 1985",
    x = "Salary (Millions)",
    y = "Frequency"
  ) +
  theme_bw() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Flag salary records whose season ended before 1998
full_data$before_1998 <- full_data$season_end < 1998

# Check that before_1998 is a logical flag that agrees with season_end
str(full_data$before_1998)
##  logi [1:14024] TRUE TRUE TRUE TRUE TRUE TRUE ...
# Spot-check up to 10 random rows. seq_len() is used instead of 1:nrow() so an
# empty data frame yields an empty index rather than c(1, 0), and min() keeps
# the check usable when the data has fewer than 10 rows.
test <- sample(seq_len(nrow(full_data)), min(10, nrow(full_data)))

full_data$season_end[test]
##  [1] 1988 2016 2009 2018 1985 1998 2007 2009 2014 2017
full_data$before_1998[test]
##  [1]  TRUE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE
# Look for linear relationships between salary and statistics that indicate
# player effectiveness (career points, assists, games played), colouring each
# point by whether the season ended before 1998
ggplot(
  data = full_data,
  mapping = aes(salary_millions, career_PTS, colour = before_1998)
) +
  geom_point(size = 0.5)

ggplot(
  data = full_data,
  mapping = aes(salary_millions, career_AST, colour = before_1998)
) +
  geom_point(size = 0.5)

ggplot(
  data = full_data,
  mapping = aes(salary_millions, career_G, colour = before_1998)
) +
  geom_point(size = 0.5)

# career_PER is read in as character (see the str() output above), so mapping
# it directly to y would place every distinct string on a discrete axis and
# hide any linear trend. Convert it to numeric for plotting; entries that are
# not valid numbers become NA (as.numeric() warns) and geom_point() drops them.
# ylab() keeps the axis label that the bare column mapping would have produced.
ggplot(
  full_data,
  aes(
    x = salary_millions,
    y = as.numeric(career_PER),
    color = before_1998
  )
) +
  geom_point(size = 0.5) +
  ylab("career_PER")

# No linear relationship between salary and the major stats was visible, so
# look instead at how the average salary, and its standard deviation, change
# over time. First: mean salary (in millions) per season-end year.
meansby_year <- aggregate(
  salary_millions ~ season_end,
  data = full_data,
  FUN = mean
)
meansby_year
##    season_end salary_millions
## 1        1985       0.3996364
## 2        1986       0.3726309
## 3        1987       0.5430333
## 4        1988       0.4579613
## 5        1989       0.5286683
## 6        1990       1.6709375
## 7        1991       0.8311449
## 8        1992       0.9533906
## 9        1993       1.0579855
## 10       1994       1.2654576
## 11       1995       1.3576655
## 12       1996       1.7417068
## 13       1997       1.9377223
## 14       1998       2.1307594
## 15       1999       2.4583953
## 16       2000       2.5030532
## 17       2001       3.2225792
## 18       2002       3.3875299
## 19       2003       3.6380194
## 20       2004       3.6516106
## 21       2005       3.7099994
## 22       2006       3.8541600
## 23       2007       3.8156623
## 24       2008       4.2761453
## 25       2009       4.5974022
## 26       2010       4.4649377
## 27       2011       4.3200063
## 28       2012       4.3005636
## 29       2013       4.2519810
## 30       2014       4.8743350
## 31       2015       4.0352025
## 32       2016       4.4363571
## 33       2017       5.3021175
## 34       2018       6.0390105
# Line chart of mean salary (millions) by season-end year
ggplot(data = meansby_year, mapping = aes(season_end, salary_millions)) +
  geom_line()

# Standard deviation of salary (in millions) per season-end year
sdby_year <- aggregate(
  salary_millions ~ season_end,
  data = full_data,
  FUN = sd
)
sdby_year
##    season_end salary_millions
## 1        1985       0.3442535
## 2        1986       0.3422999
## 3        1987       0.3851189
## 4        1988       0.4113069
## 5        1989       0.4735973
## 6        1990       0.6464278
## 7        1991       0.6609400
## 8        1992       0.8008670
## 9        1993       0.8752406
## 10       1994       0.9759089
## 11       1995       1.1573435
## 12       1996       1.6786839
## 13       1997       2.4732533
## 14       1998       2.7707181
## 15       1999       2.6781868
## 16       2000       3.0487491
## 17       2001       3.4590165
## 18       2002       3.6827405
## 19       2003       4.0373344
## 20       2004       4.0127969
## 21       2005       3.9621434
## 22       2006       4.0244066
## 23       2007       4.1718258
## 24       2008       4.4784066
## 25       2009       4.7330220
## 26       2010       4.6480387
## 27       2011       4.5609979
## 28       2012       4.6120690
## 29       2013       4.6319243
## 30       2014       4.8714458
## 31       2015       4.6649256
## 32       2016       5.1071934
## 33       2017       6.3015544
## 34       2018       7.2451318
# Line chart of the salary standard deviation (millions) by season-end year
ggplot(data = sdby_year, mapping = aes(season_end, salary_millions)) +
  geom_line()

# Isolate the five main positions. The position column also contains many
# combination values; rows with those are dropped, which removes a number of
# entries. The positions are listed by name rather than by index into the
# sorted factor levels, so the selection cannot silently shift if the set of
# position values in the data changes, and no variable named `save` shadows
# base::save().
main_positions <- c(
  "Center",
  "Point Guard",
  "Power Forward",
  "Shooting Guard",
  "Small Forward"
)

filtered_positions <- filter(full_data, position %in% main_positions)

# Salary distribution (millions) for each main position, one panel each
ggplot(data = filtered_positions, mapping = aes(x = salary_millions)) +
  geom_histogram() +
  facet_wrap(vars(position))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Mean salary (in millions) for every season-end year / position combination
meanby_year <- aggregate(
  salary_millions ~ season_end + position,
  data = filtered_positions,
  FUN = mean
)
meanby_year
##     season_end       position salary_millions
## 1         1985         Center       0.4603939
## 2         1986         Center       0.4516957
## 3         1987         Center       0.6646667
## 4         1988         Center       0.5871073
## 5         1989         Center       0.6123197
## 6         1990         Center       1.6476923
## 7         1991         Center       0.9664231
## 8         1992         Center       1.0078065
## 9         1993         Center       1.3152857
## 10        1994         Center       1.3885156
## 11        1995         Center       1.4505500
## 12        1996         Center       1.7947500
## 13        1997         Center       1.8908047
## 14        1998         Center       2.2234496
## 15        1999         Center       3.0327343
## 16        2000         Center       2.7812897
## 17        2001         Center       3.5191462
## 18        2002         Center       3.2847491
## 19        2003         Center       3.7713133
## 20        2004         Center       3.4378641
## 21        2005         Center       3.4059746
## 22        2006         Center       3.1651377
## 23        2007         Center       3.2001369
## 24        2008         Center       3.7015472
## 25        2009         Center       4.1490338
## 26        2010         Center       4.7592147
## 27        2011         Center       3.4878533
## 28        2012         Center       3.4997587
## 29        2013         Center       4.3661598
## 30        2014         Center       4.3001805
## 31        2015         Center       3.7612817
## 32        2016         Center       5.1018702
## 33        2017         Center       6.4249237
## 34        2018         Center       7.0070884
## 35        1985    Point Guard       0.2768421
## 36        1986    Point Guard       0.2354063
## 37        1987    Point Guard       0.4031667
## 38        1988    Point Guard       0.2944671
## 39        1989    Point Guard       0.3395366
## 40        1990    Point Guard       1.5883333
## 41        1991    Point Guard       0.6622549
## 42        1992    Point Guard       0.8094737
## 43        1993    Point Guard       0.8405806
## 44        1994    Point Guard       1.1194912
## 45        1995    Point Guard       1.1480222
## 46        1996    Point Guard       1.4888627
## 47        1997    Point Guard       1.7118018
## 48        1998    Point Guard       1.7379784
## 49        1999    Point Guard       2.0886203
## 50        2000    Point Guard       2.2179489
## 51        2001    Point Guard       2.6594750
## 52        2002    Point Guard       3.0411538
## 53        2003    Point Guard       3.0327076
## 54        2004    Point Guard       2.9635167
## 55        2005    Point Guard       2.8950876
## 56        2006    Point Guard       3.2231790
## 57        2007    Point Guard       3.7989193
## 58        2008    Point Guard       4.0871232
## 59        2009    Point Guard       3.9015430
## 60        2010    Point Guard       3.8515298
## 61        2011    Point Guard       4.0007976
## 62        2012    Point Guard       4.0619049
## 63        2013    Point Guard       4.7205477
## 64        2014    Point Guard       5.1076943
## 65        2015    Point Guard       4.2291127
## 66        2016    Point Guard       4.6036304
## 67        2017    Point Guard       5.4093337
## 68        2018    Point Guard       6.0169971
## 69        1985  Power Forward       0.3569048
## 70        1986  Power Forward       0.3595860
## 71        1987  Power Forward       0.3287777
## 72        1988  Power Forward       0.4268141
## 73        1989  Power Forward       0.5130000
## 74        1990  Power Forward       1.6714286
## 75        1991  Power Forward       0.7620270
## 76        1992  Power Forward       1.0132059
## 77        1993  Power Forward       0.9821842
## 78        1994  Power Forward       1.0785128
## 79        1995  Power Forward       0.9214782
## 80        1996  Power Forward       1.2944333
## 81        1997  Power Forward       1.3955000
## 82        1998  Power Forward       1.2192458
## 83        1999  Power Forward       1.6157814
## 84        2000  Power Forward       1.6376176
## 85        2001  Power Forward       2.5101811
## 86        2002  Power Forward       2.7690089
## 87        2003  Power Forward       2.9925673
## 88        2004  Power Forward       2.7024661
## 89        2005  Power Forward       2.8587976
## 90        2006  Power Forward       2.0455323
## 91        2007  Power Forward       1.7151865
## 92        2008  Power Forward       2.3401987
## 93        2009  Power Forward       2.5676874
## 94        2010  Power Forward       2.7855118
## 95        2011  Power Forward       2.9561931
## 96        2012  Power Forward       2.5251658
## 97        2013  Power Forward       2.8910066
## 98        2014  Power Forward       4.2471591
## 99        2015  Power Forward       3.1015253
## 100       2016  Power Forward       3.3158407
## 101       2017  Power Forward       3.2514119
## 102       2018  Power Forward       4.0941274
## 103       1985 Shooting Guard       0.2689000
## 104       1986 Shooting Guard       0.2539547
## 105       1987 Shooting Guard       0.4475000
## 106       1988 Shooting Guard       0.2680745
## 107       1989 Shooting Guard       0.4142946
## 108       1990 Shooting Guard       1.1500000
## 109       1991 Shooting Guard       0.6238387
## 110       1992 Shooting Guard       0.6768636
## 111       1993 Shooting Guard       0.8003223
## 112       1994 Shooting Guard       1.0186154
## 113       1995 Shooting Guard       1.0196798
## 114       1996 Shooting Guard       1.5849091
## 115       1997 Shooting Guard       1.8515333
## 116       1998 Shooting Guard       1.8515504
## 117       1999 Shooting Guard       1.9789582
## 118       2000 Shooting Guard       2.5775880
## 119       2001 Shooting Guard       3.8955180
## 120       2002 Shooting Guard       3.3387479
## 121       2003 Shooting Guard       3.3631814
## 122       2004 Shooting Guard       2.8127214
## 123       2005 Shooting Guard       3.7397592
## 124       2006 Shooting Guard       2.9073560
## 125       2007 Shooting Guard       1.8503145
## 126       2008 Shooting Guard       2.4853887
## 127       2009 Shooting Guard       2.9446719
## 128       2010 Shooting Guard       3.6568460
## 129       2011 Shooting Guard       3.5795574
## 130       2012 Shooting Guard       3.7609707
## 131       2013 Shooting Guard       3.6306395
## 132       2014 Shooting Guard       3.4510088
## 133       2015 Shooting Guard       2.6285946
## 134       2016 Shooting Guard       2.6612470
## 135       2017 Shooting Guard       3.4193254
## 136       2018 Shooting Guard       4.1250403
## 137       1985  Small Forward       0.4662000
## 138       1986  Small Forward       0.3865127
## 139       1987  Small Forward       0.6390000
## 140       1988  Small Forward       0.4773122
## 141       1989  Small Forward       0.4590226
## 142       1990  Small Forward       1.6833333
## 143       1991  Small Forward       0.7045897
## 144       1992  Small Forward       0.8555128
## 145       1993  Small Forward       0.8932391
## 146       1994  Small Forward       1.1966389
## 147       1995  Small Forward       1.1552559
## 148       1996  Small Forward       1.3084750
## 149       1997  Small Forward       1.3678927
## 150       1998  Small Forward       1.6620889
## 151       1999  Small Forward       2.1975111
## 152       2000  Small Forward       1.9741302
## 153       2001  Small Forward       2.5707266
## 154       2002  Small Forward       3.2910366
## 155       2003  Small Forward       3.7287049
## 156       2004  Small Forward       3.2809051
## 157       2005  Small Forward       4.1072986
## 158       2006  Small Forward       3.8831909
## 159       2007  Small Forward       3.9460484
## 160       2008  Small Forward       4.6905846
## 161       2009  Small Forward       4.5407749
## 162       2010  Small Forward       4.0814645
## 163       2011  Small Forward       3.4176058
## 164       2012  Small Forward       3.0355831
## 165       2013  Small Forward       2.4039074
## 166       2014  Small Forward       3.3349018
## 167       2015  Small Forward       2.3306651
## 168       2016  Small Forward       2.4412495
## 169       2017  Small Forward       3.2162949
## 170       2018  Small Forward       4.0968207
# Final plot: mean salary per season for each main position, drawn as one
# coloured line per panel on a dark theme with custom backgrounds, no grid
# lines, and enlarged axis text and titles

ggplot(
  data = meanby_year,
  mapping = aes(x = season_end, y = salary_millions, colour = position)
) +
  geom_line() +
  facet_wrap(vars(position)) +
  labs(
    title = "NBA Salaries by Position since 1985",
    x = "Year",
    y = "Salary (Millions)"
  ) +
  theme_dark() +
  theme(
    plot.background = element_rect(fill = "light blue"),
    panel.background = element_rect(fill = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(size = 8),
    axis.text.y = element_text(size = 10),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16)
  )