# Packages: readr for the CSV import, ggplot2 for the plots further down
library(readr)
library(ggplot2)

# Folder that holds the course data. The file is read through an explicit
# path instead of calling setwd(), so running the script does not change the
# working directory of the R session.
data_dir <- path.expand("~/NYU/classes/4. Statistical Modeling/meetups")

# Loading the data
baseball <- read_csv(file.path(data_dir, "BaseballHittersSalary.csv"))
## Rows: 263 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): Hits, Salary
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session: View()
# relies on a GUI data viewer that may be unavailable when the script is run
# non-interactively (e.g. with Rscript).
if (interactive()) {
  View(baseball)
}
# Print the first 10 rows
head(baseball, n = 10)
## # A tibble: 10 x 2
##     Hits Salary
##    <dbl>  <dbl>
##  1    81  475  
##  2   130  480  
##  3   141  500  
##  4    87   91.5
##  5   169  750  
##  6    37   70  
##  7    73  100  
##  8    81   75  
##  9    92 1100  
## 10   159  517.
# List the column names
names(baseball)
## [1] "Hits"   "Salary"

# Initial regression

# Fit a simple linear regression of Salary on Hits and keep the fitted model
# in "RegModel" so the later steps can reuse it
RegModel <- lm(formula = Salary ~ Hits, data = baseball)
# Print the coefficient table and the fit statistics
summary(RegModel)
## 
## Call:
## lm(formula = Salary ~ Hits, data = baseball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -893.99 -245.63  -59.08  181.12 2059.90 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  63.0488    64.9822   0.970    0.333    
## Hits          4.3854     0.5561   7.886 8.53e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 406.2 on 261 degrees of freedom
## Multiple R-squared:  0.1924, Adjusted R-squared:  0.1893 
## F-statistic: 62.19 on 1 and 261 DF,  p-value: 8.531e-14

# Simple regression, so plot the observations together with the fitted line

# Hits on the x axis, Salary on the y axis; the points are drawn in purple
p1 <- ggplot(data = baseball, aes(x = Hits, y = Salary)) +
  geom_point(color = "purple")

# Overlay the linear-model line in orange without a confidence band
# (se = FALSE); labs() is used to label the axes
p1 +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(x = "Hits", y = "Salary")
## `geom_smooth()` using formula 'y ~ x'

# Plot of fitted values vs. standardized residuals (the same recipe also works
# for a multiple regression)

# Standardized residuals of the first model
RegModel.StdRes <- rstandard(RegModel)
# Fitted values of the first model
RegModel.Fit <- fitted.values(RegModel)
# Plot from a data frame built out of these two vectors rather than from
# `baseball`: they hold one entry per observation used in the fit, which need
# not equal nrow(baseball) (lm() drops rows with missing values by default),
# and ggplot() rejects aesthetics whose length differs from the data.
# Points are drawn in blue.
p3 <- ggplot(
  data = data.frame(Fit = RegModel.Fit, StdRes = RegModel.StdRes),
  aes(x = Fit, y = StdRes)
) +
  geom_point(color = "blue")
# Add the best-fitting straight line in red; labs() is used to label the axes
p3 +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'

# Note: the residual plot is cone / funnel shaped, so re-run with the log of Y

# Add a new column named "log.Salary" holding the natural log of the existing
# "Salary" column
baseball[["log.Salary"]] <- log(baseball[["Salary"]])

# Check how the updated data frame looks (first 10 rows)
head(baseball, n = 10)
## # A tibble: 10 x 3
##     Hits Salary log.Salary
##    <dbl>  <dbl>      <dbl>
##  1    81  475         6.16
##  2   130  480         6.17
##  3   141  500         6.21
##  4    87   91.5       4.52
##  5   169  750         6.62
##  6    37   70         4.25
##  7    73  100         4.61
##  8    81   75         4.32
##  9    92 1100         7.00
## 10   159  517.        6.25

# Simple regression, so plot the observations and the fitted line again, this
# time with log.Salary as the response

# Hits on the x axis, log.Salary on the y axis; the points are drawn in purple
p1 <- ggplot(data = baseball, aes(x = Hits, y = log.Salary)) +
  geom_point(color = "purple")

# Overlay the linear-model line in orange without a confidence band
# (se = FALSE); labs() is used to label the axes
p1 +
  geom_smooth(method = "lm", se = FALSE, color = "orange") +
  labs(x = "Hits", y = "log.Salary")
## `geom_smooth()` using formula 'y ~ x'

# Updated regression with log.Salary as the response

# Fit a simple linear regression of log.Salary on Hits and keep the fitted
# model in "RegModel2"
RegModel2 <- lm(formula = log.Salary ~ Hits, data = baseball)
# Print the coefficient table and the fit statistics
summary(RegModel2)
## 
## Call:
## lm(formula = log.Salary ~ Hits, data = baseball)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3325 -0.7185  0.1471  0.5510  2.6818 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.971962   0.127315  39.053  < 2e-16 ***
## Hits        0.008859   0.001089   8.131 1.72e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7958 on 261 degrees of freedom
## Multiple R-squared:  0.2021, Adjusted R-squared:  0.1991 
## F-statistic: 66.12 on 1 and 261 DF,  p-value: 1.724e-14

# Plot of fitted values vs. standardized residuals (the same recipe also works
# for a multiple regression) - updated for the log.Salary model

# Standardized residuals of the log.Salary model
RegModel.StdRes2 <- rstandard(RegModel2)
# Fitted values of the log.Salary model
RegModel.Fit2 <- fitted.values(RegModel2)
# Plot from a data frame built out of these two vectors rather than from
# `baseball`: they hold one entry per observation used in the fit, which need
# not equal nrow(baseball) (lm() drops rows with missing values by default),
# and ggplot() rejects aesthetics whose length differs from the data.
# Points are drawn in blue.
p3 <- ggplot(
  data = data.frame(Fit = RegModel.Fit2, StdRes = RegModel.StdRes2),
  aes(x = Fit, y = StdRes)
) +
  geom_point(color = "blue")
# Add the best-fitting straight line in red; labs() is used to label the axes
p3 +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(x = "Fitted values", y = "Standard Residuals")
## `geom_smooth()` using formula 'y ~ x'