# Simple Linear Regression

# Salary Data

# Assignment 1

# Load the salary data set from disk into a data frame.
mydata <- read.csv(
  file = "C:\\Users\\RISHI RAHUL\\Desktop\\DS\\2 SLR\\Assignment\\Salary_Data.csv",
  header = TRUE
)

# Inspect the column names of the loaded data frame
names(mydata)
## [1] "YearsExperience" "Salary"
# Put the columns on the search path so that later statements can refer to
# Salary and YearsExperience directly.
attach(mydata)

# Summary: per-column minimum, quartiles, mean and maximum
summary(mydata)
##  YearsExperience      Salary      
##  Min.   : 1.100   Min.   : 37731  
##  1st Qu.: 3.200   1st Qu.: 56721  
##  Median : 4.700   Median : 65237  
##  Mean   : 5.313   Mean   : 76003  
##  3rd Qu.: 7.700   3rd Qu.:100545  
##  Max.   :10.500   Max.   :122391
# VAR and SD
# Columns are referenced through the data frame (mydata$...) so the results
# do not depend on attach() or on a same-named object shadowing the column.
var(mydata$Salary)
## [1] 751550960
sd(mydata$YearsExperience)
## [1] 2.837888
# Skewness and Kurtosis (both functions come from the e1071 package; its
# kurtosis() reports excess kurtosis, i.e. about 0 for a normal distribution)
library(e1071)
skewness(mydata$Salary)
## [1] 0.3194946
kurtosis(mydata$YearsExperience)
## [1] -1.17293
# Plotting the Complete Dataset
plot(mydata)

# Box plot of Salary, first vertical and then horizontal.
# The column is taken from mydata explicitly instead of relying on attach().
boxplot(mydata$Salary)

boxplot(mydata$Salary, horizontal = TRUE)

# with() resolves YearsExperience inside mydata while keeping the default
# title and axis label ("YearsExperience") that hist() derives from its
# argument expression.
with(mydata, hist(YearsExperience))

# Normality check by the plot: qqline() adds the reference line against which
# the points of the normal Q-Q plot are judged.
qqnorm(mydata$YearsExperience)
qqline(mydata$YearsExperience)

# Correlation coefficient value for Salary and YearsExperience
cor(mydata)
##                 YearsExperience    Salary
## YearsExperience       1.0000000 0.9782416
## Salary                0.9782416 1.0000000
# Simple model without using any transformation.
# data = mydata makes the fit self-contained: the formula variables are looked
# up in the data frame rather than through attach() and the search path.
model <- lm(Salary ~ YearsExperience, data = mydata)
summary(model) # R-squared value for the above model is 0.957
## 
## Call:
## lm(formula = Salary ~ YearsExperience, data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7958.0 -4088.5  -459.9  3372.6 11448.0 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25792.2     2273.1   11.35 5.51e-12 ***
## YearsExperience   9450.0      378.8   24.95  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5788 on 28 degrees of freedom
## Multiple R-squared:  0.957,  Adjusted R-squared:  0.9554 
## F-statistic: 622.5 on 1 and 28 DF,  p-value: < 2.2e-16
confint(model, level = 0.95) # 95% confidence intervals for the coefficients
##                     2.5 %   97.5 %
## (Intercept)     21136.061 30448.34
## YearsExperience  8674.119 10225.81
# Optional: per-observation prediction intervals
# predict(model, interval = "prediction")

# Logarithmic transformation: Salary regressed on log(YearsExperience).
# data = mydata keeps the fit independent of attach().
model_log <- lm(Salary ~ log(YearsExperience), data = mydata)
summary(model_log)  # R-squared value for the above model is 0.854
## 
## Call:
## lm(formula = Salary ~ log(YearsExperience), data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15392.6  -7523.0    559.7   6336.1  20629.8 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             14928       5156   2.895  0.00727 ** 
## log(YearsExperience)    40582       3172  12.792 3.25e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10660 on 28 degrees of freedom
## Multiple R-squared:  0.8539, Adjusted R-squared:  0.8487 
## F-statistic: 163.6 on 1 and 28 DF,  p-value: 3.25e-13
confint(model_log, level = 0.95) # 95% confidence intervals for the coefficients
##                          2.5 %   97.5 %
## (Intercept)           4365.921 25490.02
## log(YearsExperience) 34083.512 47080.46
# Optional: per-observation prediction intervals
# predict(model_log, interval = "prediction")

# Exponential model: log(Salary) regressed on YearsExperience.
# data = mydata keeps the fit independent of attach().
# Note: the response here is log(Salary), so the residuals, residual standard
# error and R-squared below are on the log scale and are not directly
# comparable with those of models fitted to Salary itself.
model_exp <- lm(log(Salary) ~ YearsExperience, data = mydata)
summary(model_exp) # R-squared value for the above model is 0.932
## 
## Call:
## lm(formula = log(Salary) ~ YearsExperience, data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18949 -0.06946 -0.01068  0.06932  0.19029 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     10.507402   0.038443  273.33   <2e-16 ***
## YearsExperience  0.125453   0.006406   19.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09789 on 28 degrees of freedom
## Multiple R-squared:  0.932,  Adjusted R-squared:  0.9295 
## F-statistic: 383.6 on 1 and 28 DF,  p-value: < 2.2e-16
# Simple model gives the best R-Squared value
# NOTE: the exponential model's R-squared is computed on log(Salary), so it is
# not on the same scale as the R-squared of the models fitted to Salary.
# predict() without newdata returns the in-sample fitted salaries.
model_final <- predict(model)
model_final
##         1         2         3         4         5         6         7 
##  36187.16  38077.15  39967.14  44692.12  46582.12  53197.09  54142.09 
##         8         9        10        11        12        13        14 
##  56032.08  56032.08  60757.06  62647.05  63592.05  63592.05  64537.05 
##        15        16        17        18        19        20        21 
##  68317.03  72097.02  73987.01  75877.00  81546.98  82491.97  90051.94 
##        22        23        24        25        26        27        28 
##  92886.93 100446.90 103281.89 108006.87 110841.86 115566.84 116511.84 
##        29        30 
## 123126.81 125016.80
# In-sample RMSE taken from the model's own residuals (observed - fitted).
# This stays aligned with the rows actually used in the fit and does not rely
# on an attach()ed Salary vector being on the search path.
rmse <- sqrt(mean(residuals(model)^2))
rmse
## [1] 5592.044
# Standard diagnostic plots for the fitted lm object
plot(model)

# Distribution of the residuals of the simple model
hist(residuals(model))