# Assignment 1
# Load the salary data set used by the rest of the script.
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# fails on any other machine unless the path is adjusted.
mydata <- read.csv("C:\\Users\\RISHI RAHUL\\Desktop\\DS\\2 SLR\\Assignment\\Salary_Data.csv")
# Column Names
colnames(mydata)
## [1] "YearsExperience" "Salary"
# Fail early, with a clear message, if the expected columns are missing.
stopifnot(all(c("YearsExperience", "Salary") %in% colnames(mydata)))
# Statements further down refer to the columns by bare name (Salary,
# YearsExperience), so the data frame is placed on the search path here.
attach(mydata)
# Descriptive statistics ----
# Minimum, quartiles, mean and maximum of every column.
summary(mydata)
## YearsExperience Salary
## Min. : 1.100 Min. : 37731
## 1st Qu.: 3.200 1st Qu.: 56721
## Median : 4.700 Median : 65237
## Mean : 5.313 Mean : 76003
## 3rd Qu.: 7.700 3rd Qu.:100545
## Max. :10.500 Max. :122391
# Spread: variance of Salary, standard deviation of YearsExperience.
var(mydata$Salary)
## [1] 751550960
sd(mydata$YearsExperience)
## [1] 2.837888
# Shape: skewness of Salary, kurtosis of YearsExperience (both from e1071).
# NOTE(review): e1071::kurtosis() presumably reports excess kurtosis with its
# default settings, which would explain the negative value - confirm in the
# package documentation.
library(e1071)
skewness(mydata$Salary)
## [1] 0.3194946
kurtosis(mydata$YearsExperience)
## [1] -1.17293
# Exploratory plots ----
# Scatterplot matrix of every column against every other column.
plot(mydata)

# Box plot of Salary, drawn vertically and then horizontally.
boxplot(mydata$Salary)

boxplot(mydata$Salary, horizontal = TRUE)

# with() lets hist() see the bare column name, which it uses to build the
# plot title and x-axis label.
with(mydata, hist(YearsExperience))

# Visual normality check: normal Q-Q plot of YearsExperience.
qqnorm(mydata$YearsExperience)

# Correlation matrix of the data set (Salary vs. YearsExperience).
cor(mydata)
## YearsExperience Salary
## YearsExperience 1.0000000 0.9782416
## Salary 0.9782416 1.0000000
# Simple model without using any transformation
# The data frame is passed explicitly so the fit does not depend on which
# objects named Salary / YearsExperience are visible on the search path
# (a global variable with the same name would otherwise mask the column).
model <- lm(Salary ~ YearsExperience, data = mydata)
summary(model) # R-squared value for the above model is 0.957
##
## Call:
## lm(formula = Salary ~ YearsExperience, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7958.0 -4088.5 -459.9 3372.6 11448.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25792.2 2273.1 11.35 5.51e-12 ***
## YearsExperience 9450.0 378.8 24.95 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared: 0.957, Adjusted R-squared: 0.9554
## F-statistic: 622.5 on 1 and 28 DF, p-value: < 2.2e-16
# 95% confidence intervals for the intercept and slope.
confint(model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 21136.061 30448.34
## YearsExperience 8674.119 10225.81
# Disabled: fitted values together with prediction intervals.
# predict(model, interval = "prediction")
# Logarithmic transformation
# Regress Salary on log(YearsExperience). The data frame is passed explicitly
# so the fit does not depend on attach()ed names that a global variable could
# mask.
model_log <- lm(Salary ~ log(YearsExperience), data = mydata)
summary(model_log) # R-squared value for the above model is 0.854
##
## Call:
## lm(formula = Salary ~ log(YearsExperience), data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15392.6 -7523.0 559.7 6336.1 20629.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14928 5156 2.895 0.00727 **
## log(YearsExperience) 40582 3172 12.792 3.25e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10660 on 28 degrees of freedom
## Multiple R-squared: 0.8539, Adjusted R-squared: 0.8487
## F-statistic: 163.6 on 1 and 28 DF, p-value: 3.25e-13
# 95% confidence intervals for the intercept and slope.
confint(model_log, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4365.921 25490.02
## log(YearsExperience) 34083.512 47080.46
# Disabled: fitted values together with prediction intervals.
# predict(model_log, interval = "prediction")
# Exponential model
# Regress log(Salary) on YearsExperience. The data frame is passed explicitly
# so the fit does not depend on attach()ed names that a global variable could
# mask.
model_exp <- lm(log(Salary) ~ YearsExperience, data = mydata)
# R-squared value for the above model is 0.932; it is measured on the
# log(Salary) scale because that is the response of this model.
summary(model_exp)
##
## Call:
## lm(formula = log(Salary) ~ YearsExperience, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18949 -0.06946 -0.01068 0.06932 0.19029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.507402 0.038443 273.33 <2e-16 ***
## YearsExperience 0.125453 0.006406 19.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09789 on 28 degrees of freedom
## Multiple R-squared: 0.932, Adjusted R-squared: 0.9295
## F-statistic: 383.6 on 1 and 28 DF, p-value: < 2.2e-16
# Final model ----
# The untransformed model has the highest R-squared of the three fits, so it is
# used from here on. Note that the exponential model's R-squared is measured on
# log(Salary), so that comparison is only indicative.

# Fitted salaries of the simple model for the rows it was trained on.
model_final <- predict(model)
model_final
## 1 2 3 4 5 6 7
## 36187.16 38077.15 39967.14 44692.12 46582.12 53197.09 54142.09
## 8 9 10 11 12 13 14
## 56032.08 56032.08 60757.06 62647.05 63592.05 63592.05 64537.05
## 15 16 17 18 19 20 21
## 68317.03 72097.02 73987.01 75877.00 81546.98 82491.97 90051.94
## 22 23 24 25 26 27 28
## 92886.93 100446.90 103281.89 108006.87 110841.86 115566.84 116511.84
## 29 30
## 123126.81 125016.80
# Root-mean-square error between observed and fitted salaries.
rmse <- sqrt(mean((mydata$Salary - model_final)^2))
rmse
## [1] 5592.044
# Diagnostic plots produced by the plot method for the fitted lm object.
plot(model)




# Distribution of the residuals of the simple model.
hist(residuals(model))
