# Assignment on Simple Linear Regression (Name: Karthick Sothivelr)
# Read the salary data CSV file and store it in the data frame salary_data
salary_data <- read.csv("C:\\Users\\samy_\\Desktop\\Data Science Class Materials\\Salary_Data.csv")
# View() opens a spreadsheet-style viewer window, so only call it when a user
# is at the console; batch runs (e.g. Rscript) skip it.
if (interactive()) {
  View(salary_data)
}
# Put the columns on the search path: later statements in this script refer to
# YearsExperience and Salary by bare name.
attach(salary_data)
# Compact overview of salary_data: number of rows, column names and types
str(object = salary_data)
## 'data.frame': 30 obs. of 2 variables:
## $ YearsExperience: num 1.1 1.3 1.5 2 2.2 2.9 3 3.2 3.2 3.7 ...
## $ Salary : int 39343 46205 37731 43525 39891 56642 60150 54445 64445 57189 ...
# Summary statistics (min, quartiles, median, mean, max) for every column
summary(object = salary_data)
## YearsExperience Salary
## Min. : 1.100 Min. : 37731
## 1st Qu.: 3.200 1st Qu.: 56721
## Median : 4.700 Median : 65237
## Mean : 5.313 Mean : 76003
## 3rd Qu.: 7.700 3rd Qu.:100545
## Max. :10.500 Max. :122391
# Extended descriptive statistics (sd, trimmed mean, mad, skew, kurtosis, se)
# from the psych package
library("psych")
describe(x = salary_data)
## vars n mean sd median trimmed mad
## YearsExperience 1 30 5.31 2.84 4.7 5.21 2.89
## Salary 2 30 76003.00 27414.43 65237.0 75078.88 30203.53
## min max range skew kurtosis se
## YearsExperience 1.1 10.5 9.4 0.34 -1.17 0.52
## Salary 37731.0 122391.0 84660.0 0.32 -1.40 5005.17
# Scatter plot of Salary vs YearsExperience.
# The columns are taken from salary_data explicitly so the plot does not depend
# on the data frame being attach()ed or on same-named objects in the workspace.
plot(
  salary_data$YearsExperience, salary_data$Salary,
  xlab = "Years Experience", ylab = "Salary",
  main = "Scatter Plot of Salary vs Years Experience",
  las = 1 # draw axis tick labels horizontally
)

# Correlation coefficient of Salary and YearsExperience (cor() defaults to
# Pearson). Columns are referenced through salary_data rather than relying on
# the attach()ed search path.
cor(salary_data$YearsExperience, salary_data$Salary) # strong positive correlation
## [1] 0.9782416
# Simple linear regression model (Y = Salary, X = YearsExperience).
# The data frame is passed via `data` so the fit does not depend on attach()
# and the model call records where its variables came from.
lin_model <- lm(Salary ~ YearsExperience, data = salary_data)
summary(lin_model)
##
## Call:
## lm(formula = Salary ~ YearsExperience, data = salary_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7958.0 -4088.5 -459.9 3372.6 11448.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25792.2 2273.1 11.35 5.51e-12 ***
## YearsExperience 9450.0 378.8 24.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared: 0.957, Adjusted R-squared: 0.9554
## F-statistic: 622.5 on 1 and 28 DF, p-value: < 2.2e-16
## The R-squared value is 0.957, i.e. the model explains about 95.7% of the variance in Salary (a strong fit)
## Both coefficients are statistically significant at the 5% level because each Pr(>|t|) is below 0.05
## Model: Salary = 25792.2 + 9450*YearsExperience
# 95% confidence intervals for the intercept and the slope
# (confidence level = 1 - alpha, where alpha is the significance level)
confint(object = lin_model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 21136.061 30448.34
## YearsExperience 8674.119 10225.81
# Diagnostic plots for the fit, drawn on a 2 x 2 grid:
# Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into plots drawn later.
old_par <- par(mfrow = c(2, 2))
plot(lin_model)
par(old_par)

# Residuals of the fitted model (observed Salary minus fitted Salary),
# one per observation
resid(lin_model)
## 1 2 3 4 5 6
## 3155.8412 8127.8488 -2236.1437 -1167.1248 -6691.1173 3444.9091
## 7 8 9 10 11 12
## 6007.9128 -1587.0796 8412.9204 -3568.0608 570.9467 -7798.0495
## 13 14 15 16 17 18
## -6635.0495 -7456.0457 -7206.0306 -4159.0156 -7958.0080 7210.9995
## 19 20 21 22 23 24
## -183.9779 11448.0259 1686.0560 5386.0673 855.0975 10530.1088
## 25 26 27 28 29 30
## 1424.1276 -5259.8611 1402.1577 -3876.8385 -735.8121 -3144.8046
# Fitted salaries for the observations the model was trained on
# (predict() without new data returns the in-sample predictions)
pred <- predict(object = lin_model)
print(x = pred)
## 1 2 3 4 5 6 7
## 36187.16 38077.15 39967.14 44692.12 46582.12 53197.09 54142.09
## 8 9 10 11 12 13 14
## 56032.08 56032.08 60757.06 62647.05 63592.05 63592.05 64537.05
## 15 16 17 18 19 20 21
## 68317.03 72097.02 73987.01 75877.00 81546.98 82491.97 90051.94
## 22 23 24 25 26 27 28
## 92886.93 100446.90 103281.89 108006.87 110841.86 115566.84 116511.84
## 29 30
## 123126.81 125016.80
# Plot of the actual data points with the fitted regression line on top.
# Columns are referenced through salary_data so the plot does not depend on
# the data frame being attach()ed.
par(mfrow = c(1, 1)) # single-panel layout
plot(
  salary_data$YearsExperience, salary_data$Salary,
  xlab = "Years Experience", ylab = "Salary",
  main = "Salary vs Years Experience"
)
# pred holds one fitted value per row of salary_data
lines(salary_data$YearsExperience, pred)

# Estimate salaries for new values of years of experience
new_yearsExp <- data.frame(
  YearsExperience = c(2, 7, 4.5, 8, 4.8, 7.8, 8.9, 2.5)
)
# Pair each input value with the salary the model predicts for it
pred2 <- data.frame(
  YearsExperience = new_yearsExp$YearsExperience,
  Estimated_Salary = predict(lin_model, newdata = new_yearsExp)
)
pred2
## YearsExperience Estimated_Salary
## 1 2.0 44692.12
## 2 7.0 91941.94
## 3 4.5 68317.03
## 4 8.0 101391.90
## 5 4.8 71152.02
## 6 7.8 99501.91
## 7 8.9 109896.86
## 8 2.5 49417.11