# Packages ----
# readr supplies read_csv(); ggplot2 supplies the plotting functions used below.
library(readr)
library(ggplot2)
# Setting the working directory
# NOTE(review): this path is machine-specific; the relative file name passed to
# read_csv() below is resolved against it.
setwd("~/NYU/classes/4. Statistical Modeling/meetups")
# Loading the data (readr is already attached above, so it is not loaded again)
baseball <- read_csv("BaseballHittersSalary.csv")
## Rows: 263 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): Hits, Salary
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session; when the
# script is run by Rscript or knitted there is nobody to look at the viewer.
if (interactive()) {
  View(baseball)
}
# Print the first ten rows of the data
head(baseball, n = 10)
## # A tibble: 10 x 2
## Hits Salary
## <dbl> <dbl>
## 1 81 475
## 2 130 480
## 3 141 500
## 4 87 91.5
## 5 169 750
## 6 37 70
## 7 73 100
## 8 81 75
## 9 92 1100
## 10 159 517.
# Column names of the loaded data
colnames(baseball)
## [1] "Hits" "Salary"
# Baseline model ----
# Simple linear regression of Salary on Hits, kept in `RegModel`
RegModel <- lm(formula = Salary ~ Hits, data = baseball)
# Coefficient table, residual quantiles and fit statistics
summary(RegModel)
##
## Call:
## lm(formula = Salary ~ Hits, data = baseball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -893.99 -245.63 -59.08 181.12 2059.90
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.0488 64.9822 0.970 0.333
## Hits 4.3854 0.5561 7.886 8.53e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 406.2 on 261 degrees of freedom
## Multiple R-squared: 0.1924, Adjusted R-squared: 0.1893
## F-statistic: 62.19 on 1 and 261 DF, p-value: 8.531e-14
# Simple regression, so plot the observations together with the fitted line
# Map Hits to the x axis and Salary to the y axis; draw the points in purple
p1 <- ggplot(data = baseball, aes(x = Hits, y = Salary)) +
  geom_point(color = "purple")
# Overlay the least-squares line in orange without a confidence band
# (`FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned); labs() sets the axis titles
p1 +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(x = "Hits", y = "Salary")
## `geom_smooth()` using formula 'y ~ x'
# Plot of fitted values vs. standardized residuals
# (only the model object is used, so this also works for multiple regression)
# ggplot2 is already attached at the top of the script, so it is not reloaded.
# Standardized residuals of the baseline model
RegModel.StdRes <- rstandard(RegModel)
# Fitted values of the baseline model
RegModel.Fit <- fitted.values(RegModel)
# Plot from a data frame built out of the model's own vectors instead of
# `baseball`: lm() drops rows with missing values by default, so these vectors
# can be shorter than the original data, and ggplot() rejects aesthetics whose
# length differs from the number of rows in `data`.
RegModel.Diag <- data.frame(Fit = RegModel.Fit, StdRes = RegModel.StdRes)
# Fitted values on the x axis, standardized residuals on the y axis, blue points
p3 <- ggplot(data = RegModel.Diag, aes(x = Fit, y = StdRes)) +
  geom_point(color = "blue")
# Red least-squares line through the points, no confidence band;
# labs() sets the axis titles
p3 +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'
# Note: the residual plot is cone / funnel shaped, so re-run with the log of Y
# Add a new column named `log.Salary`: the natural log of the existing
# `Salary` column
baseball[["log.Salary"]] <- log(baseball[["Salary"]])
# Look at the first ten rows of the extended data frame
head(baseball, n = 10)
## # A tibble: 10 x 3
## Hits Salary log.Salary
## <dbl> <dbl> <dbl>
## 1 81 475 6.16
## 2 130 480 6.17
## 3 141 500 6.21
## 4 87 91.5 4.52
## 5 169 750 6.62
## 6 37 70 4.25
## 7 73 100 4.61
## 8 81 75 4.32
## 9 92 1100 7.00
## 10 159 517. 6.25
# Simple regression, so plot the observations and fitted line for log.Salary
# Map Hits to the x axis and log.Salary to the y axis; draw the points in purple
p1 <- ggplot(data = baseball, aes(x = Hits, y = log.Salary)) +
  geom_point(color = "purple")
# Overlay the least-squares line in orange without a confidence band
# (`FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned); labs() sets the axis titles
p1 +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(x = "Hits", y = "log.Salary")
## `geom_smooth()` using formula 'y ~ x'
# Updated model ----
# Simple linear regression of log.Salary on Hits, kept in `RegModel2`
RegModel2 <- lm(formula = log.Salary ~ Hits, data = baseball)
# Coefficient table, residual quantiles and fit statistics
summary(RegModel2)
##
## Call:
## lm(formula = log.Salary ~ Hits, data = baseball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3325 -0.7185 0.1471 0.5510 2.6818
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.971962 0.127315 39.053 < 2e-16 ***
## Hits 0.008859 0.001089 8.131 1.72e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7958 on 261 degrees of freedom
## Multiple R-squared: 0.2021, Adjusted R-squared: 0.1991
## F-statistic: 66.12 on 1 and 261 DF, p-value: 1.724e-14
# Plot of fitted values vs. standardized residuals, updated for log.Salary
# (only the model object is used, so this also works for multiple regression)
# ggplot2 is already attached at the top of the script, so it is not reloaded.
# Standardized residuals of the log model
RegModel.StdRes2 <- rstandard(RegModel2)
# Fitted values of the log model
RegModel.Fit2 <- fitted.values(RegModel2)
# Plot from a data frame built out of the model's own vectors instead of
# `baseball`: lm() drops rows with missing values by default, so these vectors
# can be shorter than the original data, and ggplot() rejects aesthetics whose
# length differs from the number of rows in `data`.
RegModel.Diag2 <- data.frame(Fit = RegModel.Fit2, StdRes = RegModel.StdRes2)
# Fitted values on the x axis, standardized residuals on the y axis, blue points
p3 <- ggplot(data = RegModel.Diag2, aes(x = Fit, y = StdRes)) +
  geom_point(color = "blue")
# Red least-squares line through the points, no confidence band;
# labs() sets the axis titles
p3 +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'