# Assignment on Simple Linear Regression (Name: Karthick Sothivelr)

# Read the Salary Data CSV file into the data frame `salary_data`.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs unchanged on a machine where this exact file location exists.
salary_data <- read.csv("C:\\Users\\samy_\\Desktop\\Data Science Class Materials\\Salary_Data.csv")
# View() opens a data viewer window, so only call it when a user is present;
# this keeps batch runs (Rscript, knitting) from depending on a display.
if (interactive()) {
  View(salary_data)
}
# attach() is kept so that any statement referring to the columns by bare
# name (YearsExperience, Salary) keeps working. Prefer `salary_data$col` or
# a `data = salary_data` argument in new code.
attach(salary_data)
# Compact overview of the data frame: dimensions, column names and types
utils::str(object = salary_data)
## 'data.frame':    30 obs. of  2 variables:
##  $ YearsExperience: num  1.1 1.3 1.5 2 2.2 2.9 3 3.2 3.2 3.7 ...
##  $ Salary         : int  39343 46205 37731 43525 39891 56642 60150 54445 64445 57189 ...
# Per-column summary statistics (min, quartiles, median, mean, max)
summary(object = salary_data)
##  YearsExperience      Salary      
##  Min.   : 1.100   Min.   : 37731  
##  1st Qu.: 3.200   1st Qu.: 56721  
##  Median : 4.700   Median : 65237  
##  Mean   : 5.313   Mean   : 76003  
##  3rd Qu.: 7.700   3rd Qu.:100545  
##  Max.   :10.500   Max.   :122391
# Extended descriptive statistics from the psych package; the output below
# includes sd, trimmed mean, mad, range, skew, kurtosis and standard error.
library(psych)
psych::describe(salary_data)
##                 vars  n     mean       sd  median  trimmed      mad
## YearsExperience    1 30     5.31     2.84     4.7     5.21     2.89
## Salary             2 30 76003.00 27414.43 65237.0 75078.88 30203.53
##                     min      max   range skew kurtosis      se
## YearsExperience     1.1     10.5     9.4 0.34    -1.17    0.52
## Salary          37731.0 122391.0 84660.0 0.32    -1.40 5005.17
# Scatter plot of Salary vs YearsExperience.
# Columns are taken from salary_data explicitly so the plot does not depend
# on the data frame being attached to the search path.
plot(
  salary_data$YearsExperience, salary_data$Salary,
  xlab = "Years Experience",
  ylab = "Salary",
  main = "Scatter Plot of Salary vs Years Experience",
  las = 1 # draw axis tick labels horizontally
)

# Correlation coefficient of Salary and YearsExperience (cor() defaults to
# Pearson). Columns are referenced through salary_data rather than relying
# on attach().
cor(salary_data$YearsExperience, salary_data$Salary) # Strong +ve Correlation
## [1] 0.9782416
# Simple Linear Regression Model (Y = Salary, X = Years Experience).
# `data = salary_data` makes the fit self-contained: the variables in the
# formula are looked up in the data frame instead of on the search path.
lin_model <- lm(Salary ~ YearsExperience, data = salary_data)
summary(lin_model)
## 
## Call:
## lm(formula = Salary ~ YearsExperience, data = salary_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7958.0 -4088.5  -459.9  3372.6 11448.0 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25792.2     2273.1   11.35 5.51e-12 ***
## YearsExperience   9450.0      378.8   24.95  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared:  0.957,  Adjusted R-squared:  0.9554 
## F-statistic: 622.5 on 1 and 28 DF,  p-value: < 2.2e-16
# Interpretation:
# - R-squared is 0.957, i.e. the model explains about 95.7% of the variance
#   in Salary, which indicates a good fit.
# - Both parameters are significant because Pr(>|t|) is less than 0.05.
# - Model: Salary = 25792.2 + 9450 * YearsExperience
# 95% confidence intervals for the intercept and slope.
# Note: confidence level = 1 - alpha, where alpha is the significance level.
confint(object = lin_model, level = 0.95)
##                     2.5 %   97.5 %
## (Intercept)     21136.061 30448.34
## YearsExperience  8674.119 10225.81
# Display the fit diagnostics in a 2 x 2 grid:
# Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage.
# par() returns the previous settings when it changes them; keep them so the
# plotting layout can be restored once the diagnostics have been drawn.
old_par <- par(mfrow = c(2, 2))
plot(lin_model)
par(old_par)

# Residuals of the fitted model, one per observation
# (resid() is the short alias of residuals())
resid(lin_model)
##          1          2          3          4          5          6 
##  3155.8412  8127.8488 -2236.1437 -1167.1248 -6691.1173  3444.9091 
##          7          8          9         10         11         12 
##  6007.9128 -1587.0796  8412.9204 -3568.0608   570.9467 -7798.0495 
##         13         14         15         16         17         18 
## -6635.0495 -7456.0457 -7206.0306 -4159.0156 -7958.0080  7210.9995 
##         19         20         21         22         23         24 
##  -183.9779 11448.0259  1686.0560  5386.0673   855.0975 10530.1088 
##         25         26         27         28         29         30 
##  1424.1276 -5259.8611  1402.1577 -3876.8385  -735.8121 -3144.8046
# Predicted Salary for each observation used to fit the model
# (predict() without new data returns the fitted values)
pred <- stats::predict(object = lin_model)
print(x = pred)
##         1         2         3         4         5         6         7 
##  36187.16  38077.15  39967.14  44692.12  46582.12  53197.09  54142.09 
##         8         9        10        11        12        13        14 
##  56032.08  56032.08  60757.06  62647.05  63592.05  63592.05  64537.05 
##        15        16        17        18        19        20        21 
##  68317.03  72097.02  73987.01  75877.00  81546.98  82491.97  90051.94 
##        22        23        24        25        26        27        28 
##  92886.93 100446.90 103281.89 108006.87 110841.86 115566.84 116511.84 
##        29        30 
## 123126.81 125016.80
# Plot of actual data points with the fitted regression line on top
par(mfrow = c(1, 1)) # single-panel layout
plot(
  salary_data$YearsExperience, salary_data$Salary,
  xlab = "Years Experience",
  ylab = "Salary",
  main = "Salary vs Years Experience"
)
# lines() joins points in the order given, so sort by x first; otherwise the
# fitted line would zig-zag whenever the rows are not already ordered by
# YearsExperience.
x_order <- order(salary_data$YearsExperience)
lines(salary_data$YearsExperience[x_order], pred[x_order])

# Estimate Salary for years-of-experience values chosen by hand.
# The column name must match the predictor name used in the model formula.
new_yearsExp <- data.frame(
  YearsExperience = c(2, 7, 4.5, 8, 4.8, 7.8, 8.9, 2.5)
)
est_salary <- predict(lin_model, newdata = new_yearsExp)
# Put the inputs and their estimated salaries side by side in one table
pred2 <- data.frame(new_yearsExp, Estimated_Salary = est_salary)
pred2
##   YearsExperience Estimated_Salary
## 1             2.0         44692.12
## 2             7.0         91941.94
## 3             4.5         68317.03
## 4             8.0        101391.90
## 5             4.8         71152.02
## 6             7.8         99501.91
## 7             8.9        109896.86
## 8             2.5         49417.11