# Clearing workspace
# NOTE(review): rm(list = ls()) only removes objects; loaded packages and
# options are left as they were, so this is not a full session reset.
rm(list = ls()) # Remove every object from the current environment
gc() # Run garbage collection to release unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 522804 28.0 1163480 62.2 660491 35.3
## Vcells 949031 7.3 8388608 64.0 1769514 13.6
cat("\f") # Print a form-feed character; intended to clear the console
Correlation is a statistical measure that expresses the extent to which two variables are linearly related. It is measured by the correlation coefficient r, which can take any value between -1 and 1. A value above 0 indicates a positive relationship, while a value below 0 indicates a negative relationship. A positive relationship tells us that both variables tend to increase together, and a negative relationship tells us that as one variable increases, the other tends to decrease. The closer r is to either 1 or -1, the stronger the relationship.
Covariance is a measure of the relationship between two random variables and the extent to which they change together. Like correlation, covariance can be positive or negative. If the covariance of two variables is positive, the variables tend to move in the same direction. If the covariance is negative, larger values of one variable tend to correspond with smaller values of the other. For example, if two stocks tend to move up together, they have a positive covariance.
For this, I decided to look at Michael Jordan's and LeBron James' regular season statistics from their careers. These two players are constantly compared to one another in the debate over who is the greatest basketball player of all time. While there are plenty of other factors, such as playoff statistics, championships, teammates, and so on, I thought it would be interesting to look at their data. There were a lot of variables in the original data sets, but for the summary statistics I only wanted to look at a few. For this, I created data frames containing Games Played, Win/Loss, Minutes Played, Field Goal Percentage, Assists, Steals, Blocks, Points, and Plus/Minus.
# MJ Regular Season Data
# read.csv() reads a local file relative to the working directory; nothing is
# downloaded here.
MJData <- read.csv('./jordan_career.csv')
# LeBron Regular Season Data
LebronData <- read.csv('./lebron_career.csv')
# The interactive `?merge` help lookup was removed: it only opens a help page
# and has no place in a script that is run or knit non-interactively.
# Columns we are more interested in, shared by both players' data frames.
# The vector names are human-readable labels only: indexing a data frame with
# a named character vector selects by the values, so the resulting columns
# keep their original names (game, result, mp, ...).
selected_columns <- c(
  Games = "game",
  Win.Loss = "result",
  Minutes.Played = "mp",
  FG.Percentage = "fgp",
  Assists = "ast",
  Steals = "stl",
  Blocks = "blk",
  Points = "pts",
  Plus.Minus = "plus_minus"
)
# Creating MJ Data Frame
MJData.df <- MJData[, selected_columns]
# Creating LeBron Data Frame
LebronData.df <- LebronData[, selected_columns]
# Merging on Points
# Both data frames name the points column "pts", so join on it from both
# sides. The previous keys (by.x = "pts", by.y = "game") matched MJ's points
# against LeBron's game number, pairing unrelated quantities.
# The remaining shared columns receive merge()'s default suffixes:
# .x for MJData.df and .y for LebronData.df.
# NOTE(review): this is a many-to-many join - every MJ game is paired with
# every LeBron game that has the same points total. Confirm this is the
# intended way to combine the two data sets. Any summary output recorded with
# the previous keys must be regenerated.
Merged.Data <- merge(
  x = MJData.df,
  y = LebronData.df,
  by = "pts"
)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# MJ Summary Data of Selected Columns
# Prints per-column summary statistics for the MJ subset. In the recorded
# output, plus_minus is logical with 1072 NA's, i.e. it is missing for every
# MJ row.
summary(MJData.df)
## game result mp fgp
## Min. : 1.00 Length:1072 Min. :12.00 Min. :0.1110
## 1st Qu.:18.00 Class :character 1st Qu.:36.00 1st Qu.:0.4230
## Median :39.00 Mode :character Median :39.00 Median :0.5000
## Mean :39.57 Mean :38.26 Mean :0.4957
## 3rd Qu.:60.00 3rd Qu.:42.00 3rd Qu.:0.5710
## Max. :82.00 Max. :56.00 Max. :0.8280
## ast stl blk pts
## Min. : 0.000 Min. : 0.000 Min. :0.000 Min. : 2.00
## 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.:0.000 1st Qu.:23.00
## Median : 5.000 Median : 2.000 Median :1.000 Median :30.00
## Mean : 5.255 Mean : 2.345 Mean :0.833 Mean :30.12
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.:1.000 3rd Qu.:36.00
## Max. :17.000 Max. :10.000 Max. :6.000 Max. :69.00
## plus_minus
## Mode:logical
## NA's:1072
##
##
##
##
# LeBron Summary Data of Selected Columns
# Prints per-column summary statistics for the LeBron subset; `result` is a
# character column, so only its length and class are reported.
summary(LebronData.df)
## game result mp fgp
## Min. : 1.00 Length:1265 Min. :11.0 Min. :0.000
## 1st Qu.:19.00 Class :character 1st Qu.:35.0 1st Qu.:0.429
## Median :38.00 Mode :character Median :39.0 Median :0.500
## Mean :38.04 Mean :38.4 Mean :0.505
## 3rd Qu.:56.00 3rd Qu.:42.0 3rd Qu.:0.579
## Max. :82.00 Max. :55.0 Max. :0.929
## ast stl blk pts
## Min. : 0.000 Min. :0.000 Min. :0.0000 Min. : 3.00
## 1st Qu.: 5.000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:22.00
## Median : 7.000 Median :1.000 Median :1.0000 Median :27.00
## Mean : 7.388 Mean :1.593 Mean :0.7565 Mean :27.07
## 3rd Qu.: 9.000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:32.00
## Max. :19.000 Max. :7.000 Max. :5.0000 Max. :61.00
## plus_minus
## Min. :-39.00
## 1st Qu.: -3.00
## Median : 6.00
## Mean : 5.41
## 3rd Qu.: 14.00
## Max. : 39.00
# Combined Data Sets
# Prints summary statistics for the merged data frame. Columns suffixed .x
# come from MJData.df (the `x` argument of merge()) and columns suffixed .y
# come from LebronData.df (the `y` argument).
summary(Merged.Data)
## pts game result.x mp.x
## Min. : 2.0 Min. : 1.00 Length:18213 Min. :12.00
## 1st Qu.:23.0 1st Qu.:18.00 Class :character 1st Qu.:36.00
## Median :30.0 Median :39.00 Mode :character Median :39.00
## Mean :30.1 Mean :39.56 Mean :38.25
## 3rd Qu.:36.0 3rd Qu.:60.00 3rd Qu.:42.00
## Max. :69.0 Max. :82.00 Max. :56.00
## fgp.x ast.x stl.x blk.x
## Min. :0.1110 Min. : 0.000 Min. : 0.000 Min. :0.0000
## 1st Qu.:0.4230 1st Qu.: 3.000 1st Qu.: 1.000 1st Qu.:0.0000
## Median :0.5000 Median : 5.000 Median : 2.000 Median :1.0000
## Mean :0.4957 Mean : 5.255 Mean : 2.344 Mean :0.8329
## 3rd Qu.:0.5710 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.:1.0000
## Max. :0.8280 Max. :17.000 Max. :10.000 Max. :6.0000
## plus_minus.x result.y mp.y fgp.y
## Mode:logical Length:18213 Min. :17.00 Min. :0.0000
## NA's:18213 Class :character 1st Qu.:36.00 1st Qu.:0.4350
## Mode :character Median :39.00 Median :0.5000
## Mean :38.41 Mean :0.5048
## 3rd Qu.:42.00 3rd Qu.:0.5790
## Max. :55.00 Max. :0.9290
## ast.y stl.y blk.y pts.y
## Min. : 0.000 Min. :0.000 Min. :0.000 Min. : 3.00
## 1st Qu.: 5.000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:22.00
## Median : 7.000 Median :1.000 Median :1.000 Median :27.00
## Mean : 7.349 Mean :1.675 Mean :0.739 Mean :26.81
## 3rd Qu.: 9.000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:32.00
## Max. :19.000 Max. :7.000 Max. :5.000 Max. :61.00
## plus_minus.y
## Min. :-39.000
## 1st Qu.: -2.000
## Median : 6.000
## Mean : 5.666
## 3rd Qu.: 14.000
## Max. : 39.000
I want to test whether there is any correlation between the number of points LeBron James scores and his field goal percentage on any given night.
# Scatterplot of LeBron's points (x-axis) against his field goal percentage
# (y-axis); columns are looked up inside LebronData via with().
with(LebronData, plot(
  x = pts,
  y = fgp,
  xlab = "Points",
  ylab = "Field Goal Percentage",
  main = "Points vs Field Goal Percentage"
))
# Correlation coefficient between the same two columns (cor()'s default method)
with(LebronData, cor(x = pts, y = fgp))
## [1] 0.4912411
Unlike in the video linked in the discussion, we did not have any missing values in our data, so we did not need to add or remove any values. As we can see here, r = 0.4912. This tells us that the relationship is positive and that points and field goal percentage are moderately correlated. If r were closer to 1, we could say the two are strongly correlated. This suggests that a higher field goal percentage is associated with LeBron scoring more points on any given night, but the two are not very strongly correlated.
# Plotting with Regression Line
plot(
  x = LebronData$pts,
  y = LebronData$fgp,
  xlab = "Points",
  ylab = "Field Goal Percentage",
  main = "Points vs Field Goal Percentage"
)
# abline() draws y = intercept + slope * x in the plot's own coordinates, so
# the model must regress the y-axis variable (fgp) on the x-axis variable
# (pts). The earlier lm(pts ~ fgp) fitted the reverse regression, whose
# coefficients are on the points scale and put the line outside the plotted
# fgp range - which is why it did not show.
abline(lm(fgp ~ pts, data = LebronData))