# Read in data
firstbase = read.csv("firstbasestats.csv")
str(firstbase)
'data.frame': 23 obs. of 15 variables:
$ Player : chr "Freddie Freeman" "Jose Abreu" "Nate Lowe" "Paul Goldschmidt" ...
$ Pos : chr "1B" "1B" "1B" "1B" ...
$ Team : chr "LAD" "CHW" "TEX" "STL" ...
$ GP : int 159 157 157 151 160 140 160 145 146 143 ...
$ AB : int 612 601 593 561 638 551 583 555 545 519 ...
$ H : int 199 183 179 178 175 152 141 139 132 124 ...
$ X2B : int 47 40 26 41 35 27 25 28 40 23 ...
$ HR : int 21 15 27 35 32 20 36 22 8 18 ...
$ RBI : int 100 75 76 115 97 84 94 85 53 63 ...
$ AVG : num 0.325 0.305 0.302 0.317 0.274 0.276 0.242 0.251 0.242 0.239 ...
$ OBP : num 0.407 0.379 0.358 0.404 0.339 0.34 0.327 0.305 0.288 0.319 ...
$ SLG : num 0.511 0.446 0.492 0.578 0.48 0.437 0.477 0.423 0.36 0.391 ...
$ OPS : num 0.918 0.824 0.851 0.981 0.818 0.777 0.804 0.729 0.647 0.71 ...
$ WAR : num 5.77 4.19 3.21 7.86 3.85 3.07 5.05 1.32 -0.33 1.87 ...
$ Payroll.Salary2023: num 27000000 19500000 4050000 26000000 14500000 ...
summary(firstbase)
Player Pos
Length:23 Length:23
Class :character Class :character
Mode :character Mode :character
Team GP AB
Length:23 Min. : 5.0 Min. : 14.0
Class :character 1st Qu.:105.5 1st Qu.:309.0
Mode :character Median :131.0 Median :465.0
Mean :120.2 Mean :426.9
3rd Qu.:152.0 3rd Qu.:558.0
Max. :160.0 Max. :638.0
H X2B HR
Min. : 3.0 Min. : 1.00 Min. : 0.00
1st Qu.: 74.5 1st Qu.:13.50 1st Qu.: 8.00
Median :115.0 Median :23.00 Median :18.00
Mean :110.0 Mean :22.39 Mean :17.09
3rd Qu.:146.5 3rd Qu.:28.00 3rd Qu.:24.50
Max. :199.0 Max. :47.00 Max. :36.00
RBI AVG OBP
Min. : 1.00 Min. :0.2020 Min. :0.2140
1st Qu.: 27.00 1st Qu.:0.2180 1st Qu.:0.3030
Median : 63.00 Median :0.2420 Median :0.3210
Mean : 59.43 Mean :0.2499 Mean :0.3242
3rd Qu.: 84.50 3rd Qu.:0.2750 3rd Qu.:0.3395
Max. :115.00 Max. :0.3250 Max. :0.4070
SLG OPS WAR
Min. :0.2860 Min. :0.5000 Min. :-1.470
1st Qu.:0.3505 1st Qu.:0.6445 1st Qu.: 0.190
Median :0.4230 Median :0.7290 Median : 1.310
Mean :0.4106 Mean :0.7346 Mean : 1.788
3rd Qu.:0.4690 3rd Qu.:0.8175 3rd Qu.: 3.140
Max. :0.5780 Max. :0.9810 Max. : 7.860
Payroll.Salary2023
Min. : 720000
1st Qu.: 739200
Median : 4050000
Mean : 6972743
3rd Qu.: 8150000
Max. :27000000
# Linear Regression (one variable)
model1 = lm(Payroll.Salary2023 ~ RBI, data=firstbase)
summary(model1)
Call:
lm(formula = Payroll.Salary2023 ~ RBI, data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-10250331 -5220790 -843455 2386848 13654950
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2363744 2866320 -0.825 0.41883
RBI 157088 42465 3.699 0.00133 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6516000 on 21 degrees of freedom
Multiple R-squared: 0.3945, Adjusted R-squared: 0.3657
F-statistic: 13.68 on 1 and 21 DF, p-value: 0.001331
# Sum of Squared Errors
model1$residuals
1 2 3 4
13654950.2 10082148.6 -5524939.3 10298631.2
5 6 7 8
1626214.0 -6731642.8 -5902522.2 -10250330.7
9 10 11 12
-4711916.8 -532796.1 -6667082.5 -6696203.1
13 14 15 16
7582148.6 -4916640.9 -1898125.3 -336532.3
17 18 19 20
-995042.5 -1311618.3 -843454.5 8050721.3
21 22 23
1250336.9 1847040.4 2926656.0
SSE = sum(model1$residuals^2)
SSE
[1] 8.914926e+14
# Linear Regression (two variables)
model2 = lm(Payroll.Salary2023 ~ AVG + RBI, data=firstbase)
summary(model2)
Call:
lm(formula = Payroll.Salary2023 ~ AVG + RBI, data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-9097952 -4621582 -33233 3016541 10260245
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -18083756 9479036 -1.908 0.0709 .
AVG 74374031 42934155 1.732 0.0986 .
RBI 108850 49212 2.212 0.0388 *
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6226000 on 20 degrees of freedom
Multiple R-squared: 0.4735, Adjusted R-squared: 0.4209
F-statistic: 8.994 on 2 and 20 DF, p-value: 0.001636
# Sum of Squared Errors
SSE = sum(model2$residuals^2)
SSE
[1] 7.751841e+14
# Linear Regression (all variables)
model3 = lm(Payroll.Salary2023 ~ HR + RBI + AVG + OBP+ OPS, data=firstbase)
summary(model3)
Call:
lm(formula = Payroll.Salary2023 ~ HR + RBI + AVG + OBP + OPS,
data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-9611440 -3338119 64016 4472451 9490309
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -31107858 11738494 -2.650 0.0168 *
HR -341069 552069 -0.618 0.5449
RBI 115786 113932 1.016 0.3237
AVG -63824769 104544645 -0.611 0.5496
OBP 27054948 131210166 0.206 0.8391
OPS 60181012 95415131 0.631 0.5366
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6023000 on 17 degrees of freedom
Multiple R-squared: 0.5811, Adjusted R-squared: 0.4579
F-statistic: 4.717 on 5 and 17 DF, p-value: 0.006951
# Sum of Squared Errors
SSE = sum(model3$residuals^2)
SSE
[1] 6.167793e+14
# Remove HR
model4 = lm(Payroll.Salary2023 ~ RBI + AVG + OBP+OPS, data=firstbase)
summary(model4)
Call:
lm(formula = Payroll.Salary2023 ~ RBI + AVG + OBP + OPS, data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-9399551 -3573842 98921 3979339 9263512
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -29466887 11235931 -2.623 0.0173 *
RBI 71495 87015 0.822 0.4220
AVG -11035457 59192453 -0.186 0.8542
OBP 86360720 87899074 0.982 0.3389
OPS 9464546 47788458 0.198 0.8452
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5919000 on 18 degrees of freedom
Multiple R-squared: 0.5717, Adjusted R-squared: 0.4765
F-statistic: 6.007 on 4 and 18 DF, p-value: 0.00298
firstbase<-firstbase[,-(1:3)]
# Correlations
cor(firstbase$RBI, firstbase$Payroll.Salary2023)
[1] 0.6281239
cor(firstbase$AVG, firstbase$OBP)
[1] 0.8028894
cor(firstbase)
GP AB H
GP 1.0000000 0.9779421 0.9056508
AB 0.9779421 1.0000000 0.9516701
H 0.9056508 0.9516701 1.0000000
X2B 0.8446267 0.8924632 0.9308318
HR 0.7432552 0.7721339 0.7155225
RBI 0.8813917 0.9125839 0.9068893
AVG 0.4430808 0.5126292 0.7393167
OBP 0.4841583 0.5026125 0.6560021
SLG 0.6875270 0.7471949 0.8211406
OPS 0.6504483 0.6980141 0.8069779
WAR 0.5645243 0.6211558 0.7688712
Payroll.Salary2023 0.4614889 0.5018820 0.6249911
X2B HR RBI
GP 0.8446267 0.7432552 0.8813917
AB 0.8924632 0.7721339 0.9125839
H 0.9308318 0.7155225 0.9068893
X2B 1.0000000 0.5889699 0.8485911
HR 0.5889699 1.0000000 0.8929048
RBI 0.8485911 0.8929048 1.0000000
AVG 0.6613085 0.3444242 0.5658479
OBP 0.5466537 0.4603408 0.5704463
SLG 0.7211259 0.8681501 0.8824090
OPS 0.6966830 0.7638721 0.8156612
WAR 0.6757470 0.6897677 0.7885666
Payroll.Salary2023 0.6450730 0.5317619 0.6281239
AVG OBP SLG
GP 0.4430808 0.4841583 0.6875270
AB 0.5126292 0.5026125 0.7471949
H 0.7393167 0.6560021 0.8211406
X2B 0.6613085 0.5466537 0.7211259
HR 0.3444242 0.4603408 0.8681501
RBI 0.5658479 0.5704463 0.8824090
AVG 1.0000000 0.8028894 0.7254274
OBP 0.8028894 1.0000000 0.7617499
SLG 0.7254274 0.7617499 1.0000000
OPS 0.7989005 0.8987390 0.9686752
WAR 0.7855945 0.7766375 0.8611140
Payroll.Salary2023 0.5871543 0.7025979 0.6974086
OPS WAR
GP 0.6504483 0.5645243
AB 0.6980141 0.6211558
H 0.8069779 0.7688712
X2B 0.6966830 0.6757470
HR 0.7638721 0.6897677
RBI 0.8156612 0.7885666
AVG 0.7989005 0.7855945
OBP 0.8987390 0.7766375
SLG 0.9686752 0.8611140
OPS 1.0000000 0.8799893
WAR 0.8799893 1.0000000
Payroll.Salary2023 0.7394981 0.8086359
Payroll.Salary2023
GP 0.4614889
AB 0.5018820
H 0.6249911
X2B 0.6450730
HR 0.5317619
RBI 0.6281239
AVG 0.5871543
OBP 0.7025979
SLG 0.6974086
OPS 0.7394981
WAR 0.8086359
Payroll.Salary2023 1.0000000
#Removing AVG
model5 = lm(Payroll.Salary2023 ~ RBI + OBP+OPS, data=firstbase)
summary(model5)
Call:
lm(formula = Payroll.Salary2023 ~ RBI + OBP + OPS, data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-9465449 -3411234 259746 4102864 8876798
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -29737007 10855411 -2.739 0.013 *
RBI 72393 84646 0.855 0.403
OBP 82751360 83534224 0.991 0.334
OPS 7598051 45525575 0.167 0.869
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5767000 on 19 degrees of freedom
Multiple R-squared: 0.5709, Adjusted R-squared: 0.5031
F-statistic: 8.426 on 3 and 19 DF, p-value: 0.000913
model6 = lm(Payroll.Salary2023 ~ RBI + OBP, data=firstbase)
summary(model6)
Call:
lm(formula = Payroll.Salary2023 ~ RBI + OBP, data = firstbase)
Residuals:
Min 1Q Median 3Q Max
-9045497 -3487008 139497 4084739 9190185
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -28984802 9632560 -3.009 0.00693 **
RBI 84278 44634 1.888 0.07360 .
OBP 95468873 33385182 2.860 0.00969 **
---
Signif. codes:
0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5625000 on 20 degrees of freedom
Multiple R-squared: 0.5703, Adjusted R-squared: 0.5273
F-statistic: 13.27 on 2 and 20 DF, p-value: 0.0002149
# Read in test set
firstbaseTest = read.csv("firstbasestats_test.csv")
str(firstbaseTest)
'data.frame': 2 obs. of 15 variables:
$ Player : chr "Matt Olson" "Josh Bell"
$ Pos : chr "1B" "1B"
$ Team : chr "ATL" "SD"
$ GP : int 162 156
$ AB : int 616 552
$ H : int 148 147
$ X2B : int 44 29
$ HR : int 34 17
$ RBI : int 103 71
$ AVG : num 0.24 0.266
$ OBP : num 0.325 0.362
$ SLG : num 0.477 0.422
$ OPS : num 0.802 0.784
$ WAR : num 3.29 3.5
$ Payroll.Salary2023: num 21000000 16500000
# Make test set predictions
predictTest = predict(model6, newdata=firstbaseTest)
predictTest
1 2
10723186 11558647
# Compute R-squared
SSE = sum((firstbaseTest$Payroll.Salary2023 - predictTest)^2)
SST = sum((firstbaseTest$Payroll.Salary2023 - mean(firstbase$Payroll.Salary2023))^2)
1 - SSE/SST
[1] 0.5477734
#After completing the analysis, I found some interesting insights into how salary connects with hitting performance for active first basemen in the 2022 MLB season. While there were clear trends, such as players with higher HR, RBI, and SLG stats generally earning more, there were also several outliers that stood out. Some players were being paid much more or less than their performance would suggest, which points to other influences like experience, contract history, or even marketability playing a part in salary decisions. This ended up being a great hands-on project for practicing data wrangling, visualization, and correlation analysis in R. It definitely gave me ideas for exploring other positions or comparing across multiple seasons.
LS0tCnRpdGxlOiAiSW50cm8gdG8gTGluZWFyIFJlZ3Jlc3Npb246IEZpcnN0IEJhc2UgaGl0dGluZyBzdGF0cy0tQWN0aXZpdHkgNiIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKYGBge3J9CiMgUmVhZCBpbiBkYXRhCmZpcnN0YmFzZSA9IHJlYWQuY3N2KCJmaXJzdGJhc2VzdGF0cy5jc3YiKQpzdHIoZmlyc3RiYXNlKQpgYGAKYGBge3J9CnN1bW1hcnkoZmlyc3RiYXNlKQpgYGAKYGBge3J9CiMgTGluZWFyIFJlZ3Jlc3Npb24gKG9uZSB2YXJpYWJsZSkKbW9kZWwxID0gbG0oUGF5cm9sbC5TYWxhcnkyMDIzIH4gUkJJLCBkYXRhPWZpcnN0YmFzZSkKc3VtbWFyeShtb2RlbDEpCmBgYApgYGB7cn0KIyBTdW0gb2YgU3F1YXJlZCBFcnJvcnMKbW9kZWwxJHJlc2lkdWFscwpgYGAKYGBge3J9ClNTRSA9IHN1bShtb2RlbDEkcmVzaWR1YWxzXjIpClNTRQpgYGAKYGBge3J9CiMgTGluZWFyIFJlZ3Jlc3Npb24gKHR3byB2YXJpYWJsZXMpCm1vZGVsMiA9IGxtKFBheXJvbGwuU2FsYXJ5MjAyMyB+IEFWRyArIFJCSSwgZGF0YT1maXJzdGJhc2UpCnN1bW1hcnkobW9kZWwyKQpgYGAKYGBge3J9CiMgU3VtIG9mIFNxdWFyZWQgRXJyb3JzClNTRSA9IHN1bShtb2RlbDIkcmVzaWR1YWxzXjIpClNTRQpgYGAKYGBge3J9CiMgTGluZWFyIFJlZ3Jlc3Npb24gKGFsbCB2YXJpYWJsZXMpCm1vZGVsMyA9IGxtKFBheXJvbGwuU2FsYXJ5MjAyMyB+IEhSICsgUkJJICsgQVZHICsgT0JQKyBPUFMsIGRhdGE9Zmlyc3RiYXNlKQpzdW1tYXJ5KG1vZGVsMykKYGBgCmBgYHtyfQojIFN1bSBvZiBTcXVhcmVkIEVycm9ycwpTU0UgPSBzdW0obW9kZWwzJHJlc2lkdWFsc14yKQpTU0UKYGBgCmBgYHtyfQojIFJlbW92ZSBIUgptb2RlbDQgPSBsbShQYXlyb2xsLlNhbGFyeTIwMjMgfiBSQkkgKyBBVkcgKyBPQlArT1BTLCBkYXRhPWZpcnN0YmFzZSkKc3VtbWFyeShtb2RlbDQpCmBgYApgYGB7cn0KZmlyc3RiYXNlPC1maXJzdGJhc2VbLC0oMTozKV0KYGBgCgpgYGB7cn0KIyBDb3JyZWxhdGlvbnMKY29yKGZpcnN0YmFzZSRSQkksIGZpcnN0YmFzZSRQYXlyb2xsLlNhbGFyeTIwMjMpCmBgYApgYGB7cn0KY29yKGZpcnN0YmFzZSRBVkcsIGZpcnN0YmFzZSRPQlApCmBgYApgYGB7cn0KY29yKGZpcnN0YmFzZSkKYGBgCmBgYHtyfQojUmVtb3ZpbmcgQVZHCm1vZGVsNSA9IGxtKFBheXJvbGwuU2FsYXJ5MjAyMyB+IFJCSSArIE9CUCtPUFMsIGRhdGE9Zmlyc3RiYXNlKQpzdW1tYXJ5KG1vZGVsNSkKYGBgCmBgYHtyfQptb2RlbDYgPSBsbShQYXlyb2xsLlNhbGFyeTIwMjMgfiBSQkkgKyBPQlAsIGRhdGE9Zmlyc3RiYXNlKQpzdW1tYXJ5KG1vZGVsNikKYGBgCmBgYHtyfQojIFJlYWQgaW4gdGVzdCBzZXQKZmlyc3RiYXNlVGVzdCA9IHJlYWQuY3N2KCJmaXJzdGJhc2VzdGF0c190ZXN0LmNzdiIpCnN0cihmaXJzdGJhc2VUZXN0KQpgYGAKYGBge3J9CiMgTWFrZSB0ZXN0IHNldCBwcmVkaWN0aW9ucwpwcmVkaWN0VGVzdCA9IHByZWRpY3QobW9kZWw2LCBuZXdkYXRhPWZpcnN0YmFzZVRlc3QpCnByZWRpY3RUZXN0CmBgYApgYGB7cn0KIyBDb21wdXRlIFItc3F1YXJlZApTU0UgPSBzdW0oKGZpcnN0YmFzZVRlc3QkUGF5cm9sbC5TYWxhcnkyMDIzIC0gcHJlZGljdFRlc3QpXjIpClNTVCA9IHN1bSgoZmlyc3RiYXNlVGVzdCRQYXlyb2xsLlNhbGFyeTIwMjMgLSBtZWFuKGZpcnN0YmFzZSRQYXlyb2xsLlNhbGFyeTIwMjMpKV4yKQoxIC0gU1NFL1NTVApgYGAKYGBge3J9CiNBZnRlciBjb21wbGV0aW5nIHRoZSBhbmFseXNpcywgSSBmb3VuZCBzb21lIGludGVyZXN0aW5nIGluc2lnaHRzIGludG8gaG93IHNhbGFyeSBjb25uZWN0cyB3aXRoIGhpdHRpbmcgcGVyZm9ybWFuY2UgZm9yIGFjdGl2ZSBmaXJzdCBiYXNlbWVuIGluIHRoZSAyMDIyIE1MQiBzZWFzb24uIFdoaWxlIHRoZXJlIHdlcmUgY2xlYXIgdHJlbmRzLCBzdWNoIGFzIHBsYXllcnMgd2l0aCBoaWdoZXIgSFIsIFJCSSwgYW5kIFNMRyBzdGF0cyBnZW5lcmFsbHkgZWFybmluZyBtb3JlLCB0aGVyZSB3ZXJlIGFsc28gc2V2ZXJhbCBvdXRsaWVycyB0aGF0IHN0b29kIG91dC4gU29tZSBwbGF5ZXJzIHdlcmUgYmVpbmcgcGFpZCBtdWNoIG1vcmUgb3IgbGVzcyB0aGFuIHRoZWlyIHBlcmZvcm1hbmNlIHdvdWxkIHN1Z2dlc3QsIHdoaWNoIHBvaW50cyB0byBvdGhlciBpbmZsdWVuY2VzIGxpa2UgZXhwZXJpZW5jZSwgY29udHJhY3QgaGlzdG9yeSwgb3IgZXZlbiBtYXJrZXRhYmlsaXR5IHBsYXlpbmcgYSBwYXJ0IGluIHNhbGFyeSBkZWNpc2lvbnMuIFRoaXMgZW5kZWQgdXAgYmVpbmcgYSBncmVhdCBoYW5kcy1vbiBwcm9qZWN0IGZvciBwcmFjdGljaW5nIGRhdGEgd3JhbmdsaW5nLCB2aXN1YWxpemF0aW9uLCBhbmQgY29ycmVsYXRpb24gYW5hbHlzaXMgaW4gUi4gSXQgZGVmaW5pdGVseSBnYXZlIG1lIGlkZWFzIGZvciBleHBsb3Jpbmcgb3RoZXIgcG9zaXRpb25zIG9yIGNvbXBhcmluZyBhY3Jvc3MgbXVsdGlwbGUgc2Vhc29ucy4KYGBgCgo=