R Notebook

This practice assignment is based on the Udemy course Machine Learning from A to Z.

Part 1: Data preprocessing

# Load the example purchase data from GitHub and preview its first rows
data_url <- "https://raw.githubusercontent.com/tuyenhavan/Statistics/Machine-Learning-Data/Data.csv"
df <- read.csv(data_url)

head(df)

##   Country Age Salary Purchased
## 1  France  44  72000        No
## 2   Spain  27  48000       Yes
## 3 Germany  30  54000        No
## 4   Spain  38  61000        No
## 5 Germany  40     NA       Yes
## 6  France  35  58000       Yes

Dealing with missing data

# Replace every missing value with the mean of the observed values in its column

# Salary
df$Salary <- replace(df$Salary, is.na(df$Salary), mean(df$Salary, na.rm = TRUE))

# Age
df$Age <- replace(df$Age, is.na(df$Age), mean(df$Age, na.rm = TRUE))

head(df, 10)

##    Country      Age   Salary Purchased
## 1   France 44.00000 72000.00        No
## 2    Spain 27.00000 48000.00       Yes
## 3  Germany 30.00000 54000.00        No
## 4    Spain 38.00000 61000.00        No
## 5  Germany 40.00000 63777.78       Yes
## 6   France 35.00000 58000.00       Yes
## 7    Spain 38.77778 52000.00        No
## 8   France 48.00000 79000.00       Yes
## 9  Germany 50.00000 83000.00        No
## 10  France 37.00000 67000.00       Yes

Encode the categorical variables

# Encode Country as a factor labelled by position: France = 1, Germany = 2, Spain = 3
country_levels <- c("France", "Germany", "Spain")
df$Country <- factor(df$Country, levels = country_levels, labels = seq_along(country_levels))

head(df)

##   Country Age   Salary Purchased
## 1       1  44 72000.00        No
## 2       3  27 48000.00       Yes
## 3       2  30 54000.00        No
## 4       3  38 61000.00        No
## 5       2  40 63777.78       Yes
## 6       1  35 58000.00       Yes

# Encode the outcome as a factor: "No" becomes 0 and "Yes" becomes 1
df$Purchased <- factor(df$Purchased, levels = c("No", "Yes"), labels = 0:1)

head(df, 10)

##    Country      Age   Salary Purchased
## 1        1 44.00000 72000.00         0
## 2        3 27.00000 48000.00         1
## 3        2 30.00000 54000.00         0
## 4        3 38.00000 61000.00         0
## 5        2 40.00000 63777.78         1
## 6        1 35.00000 58000.00         1
## 7        3 38.77778 52000.00         0
## 8        1 48.00000 79000.00         1
## 9        2 50.00000 83000.00         0
## 10       1 37.00000 67000.00         1

Splitting data set into train and test sets

library(caTools)

## Warning: package 'caTools' was built under R version 3.2.5

# Fix the random seed so the split is reproducible
set.seed(123)

# Logical vector from sample.split(): TRUE marks rows kept for training (ratio 0.8)
split <- sample.split(df$Purchased, SplitRatio = 0.8)

train <- df[split, ] # training set

test <- df[!split, ] # test set

head(train) # Check the first few rows

##   Country      Age   Salary Purchased
## 1       1 44.00000 72000.00         0
## 2       3 27.00000 48000.00         1
## 3       2 30.00000 54000.00         0
## 4       3 38.00000 61000.00         0
## 5       2 40.00000 63777.78         1
## 7       3 38.77778 52000.00         0

Feature Scaling

Standardization of data

\[ X_{stand} = \frac{X - \mathrm{mean}(X)}{\mathrm{sd}(X)} \]

Normalization of a variable

\[ X_{norm} = \frac{X - \min(X)}{\max(X) - \min(X)} \]

# Apply feature scaling to the numeric columns (2 = Age, 3 = Salary) of the
# train and test sets.
# The centre and scale are estimated on the training set only and then reused
# for the test set. Scaling the test set with its own mean/sd would put the two
# sets on different scales and let test-set statistics leak into preprocessing.
num_cols <- c(2, 3)

train_scaled <- scale(train[, num_cols])

test[, num_cols] <- scale(
  test[, num_cols],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)

train[, num_cols] <- train_scaled

head(train, 10)

##    Country         Age     Salary Purchased
## 1        1  0.90101716  0.9392746         0
## 2        3 -1.58847494 -1.3371160         1
## 3        2 -1.14915281 -0.7680183         0
## 4        3  0.02237289 -0.1040711         0
## 5        2  0.31525431  0.1594000         1
## 7        3  0.13627122 -0.9577176         0
## 8        1  1.48678000  1.6032218         1
## 10       1 -0.12406783  0.4650265         1

Part 2: Regression Models

Simple Linear Regression

# Load the years-of-experience vs. salary data from GitHub and preview it
salary_url <- "https://raw.githubusercontent.com/tuyenhavan/Statistics/Machine-Learning-Data/Salary_Data.csv"
df1 <- read.csv(salary_url)

head(df1)

##   YearsExperience Salary
## 1             1.1  39343
## 2             1.3  46205
## 3             1.5  37731
## 4             2.0  43525
## 5             2.2  39891
## 6             2.9  56642

Building a simple linear regression using df1 dataset

# Split the salary data into training (70%) and test (30%) sets
library(caTools)
set.seed(123)

split <- sample.split(df1$Salary, SplitRatio = 0.7)

# Rows flagged TRUE go to the training set, the rest to the test set
training_set <- df1[split, ]
test_set <- df1[!split, ]

# Fit salary as a linear function of years of experience on the training set
model1 <- lm(Salary ~ YearsExperience, data = training_set)

# Coefficients, residual summary and goodness of fit
summary(model1)

## 
## Call:
## lm(formula = Salary ~ YearsExperience, data = training_set)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7193.8 -3833.2   732.8  3283.1  8885.6 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      25845.9     2594.3   9.962 5.59e-09 ***
## YearsExperience   9285.5      402.8  23.051 2.38e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5329 on 19 degrees of freedom
## Multiple R-squared:  0.9655, Adjusted R-squared:  0.9637 
## F-statistic: 531.3 on 1 and 19 DF,  p-value: 2.376e-15

# List the attributes of the fitted lm object: the names of its stored components and its class
attributes(model1)

## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"        
## 
## $class
## [1] "lm"

# Fitted values: the model's predictions for the rows of the training set

fitted <- predict(model1, newdata = training_set)

# Keep the predictions next to the observed training data
training_pred <- data.frame(training_set, fitted = fitted)

head(training_pred)

##    YearsExperience Salary   fitted
## 1              1.1  39343 36059.94
## 3              1.5  37731 39774.13
## 6              2.9  56642 52773.79
## 7              3.0  60150 53702.34
## 9              3.2  64445 55559.44
## 10             3.7  57189 60202.17

# Predict salaries for the held-out test set

y_pred <- predict(model1, newdata = test_set)

# Test-set predictions tagged "Predicted". stringsAsFactors = FALSE keeps Type1
# a character column on every R version (the default was TRUE before R 4.0), so
# it can be stacked with other character labels without factor coercion.
test_pred <- data.frame(
  YearsExperience = test_set$YearsExperience,
  Salary = y_pred,
  Type1 = "Predicted",
  stringsAsFactors = FALSE
)

# Training rows together with the model's fitted values
training_fit <- data.frame(training_set, model_fitted = model1$fitted.values)

# Create a new data frame, which includes both predicted and observed values
library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.5

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Tag the training rows as "Observed", then stack the test-set predictions underneath
observed_rows <- dplyr::mutate(training_fit, Type1 = "Observed")
training_model <- dplyr::bind_rows(observed_rows, test_pred)

head(training_model)

##   YearsExperience Salary model_fitted    Type1
## 1             1.1  39343     36059.94 Observed
## 2             1.5  37731     39774.13 Observed
## 3             2.9  56642     52773.79 Observed
## 4             3.0  60150     53702.34 Observed
## 5             3.2  64445     55559.44 Observed
## 6             3.7  57189     60202.17 Observed

# Visualizing the result

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.2.5

# Observed training salaries and predicted test salaries as points, plus the
# regression line drawn from the training-set fitted values.
# aes() refers to columns by bare name: the line layer inherits
# x = YearsExperience and maps y to the `fitted` column of training_pred,
# rather than pulling vectors out with `$`, which bypasses the layer's data.
ggplot(data = training_model, aes(x = YearsExperience, y = Salary)) +
  geom_point(aes(color = Type1)) +
  xlab("Year of Experience") +
  ylab("Salary (Dollars)") +
  ggtitle("Year of Experience vs Salary") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_line(data = training_pred, aes(y = fitted), color = 4) +
  labs(color = "Salary Data")

# Visualize the test-set observations against the regression line that was
# fitted on the training set.
# The line layer gets its own data frame whose columns match the plot's
# aesthetics, so aes() never has to reach for external vectors with `$`.
ggplot(data = test_set, aes(x = YearsExperience, y = Salary)) +
  geom_point(col = 4) +
  xlab("Year of Experience") +
  ylab("Salary (Dollars)") +
  ggtitle("Year of Experience vs Salary") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5)) +
  geom_line(
    data = data.frame(
      YearsExperience = training_set$YearsExperience,
      Salary = model1$fitted.values
    ),
    color = 2
  )

**Multiple Linear Regression**

Linear regression rests on several essential assumptions. Before fitting any linear regression, the following assumptions should be checked:

Linearity
Homoscedasticity
Multivariate normality
Independence of errors
Lack of multicollinearity

# Load the startups data from GitHub and preview its first rows
startup_url <- "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/50-Startups.csv"
startup <- read.csv(startup_url)

head(startup)

##   R.D.Spend Administration Marketing.Spend      State   Profit
## 1  165349.2      136897.80        471784.1   New York 192261.8
## 2  162597.7      151377.59        443898.5 California 191792.1
## 3  153441.5      101145.55        407934.5 California 191050.4
## 4  144372.4      118671.85        383199.6   New York 182902.0
## 5  142107.3       91391.77        366168.4 California 166187.9
## 6  131876.9       99814.71        362861.4   New York 156991.1

# Show the structure of the data: number of rows, column types and leading values
str(startup)

## 'data.frame':    50 obs. of  5 variables:
##  $ R.D.Spend      : num  165349 162598 153442 144372 142107 ...
##  $ Administration : num  136898 151378 101146 118672 91392 ...
##  $ Marketing.Spend: num  471784 443899 407935 383200 366168 ...
##  $ State          : Factor w/ 2 levels "California","New York": 2 1 1 2 1 2 1 2 2 1 ...
##  $ Profit         : num  192262 191792 191050 182902 166188 ...

# List the distinct values of the State column
unique(startup$State)

## [1] New York   California
## Levels: California New York

# Calculate the frequency of each state by tabulating the column itself.
# Calling unique() first leaves one entry per state, so every count would be 1.
# NOTE: the printed output below predates this fix; re-run to refresh it.
table(startup$State)

## 
## California   New York 
##          1          1

When building a multiple linear regression, there are several common approaches to choosing the predictors:

All-in: feed in every predictor
Forward selection
Backward elimination: repeatedly remove the predictor with the highest non-significant p-value
Bidirectional selection

With n predictors, the number of possible models (non-empty subsets of predictors) is:

\[ 2^n - 1 \quad \text{where } n \text{ is the number of predictors} \]

# Multiple linear regression

## Split the startups data into training (80%) and test (20%) sets

library(caTools)
set.seed(123)

split <- sample.split(startup$Profit, SplitRatio = 0.8)

# Rows flagged TRUE go to the training set, the rest to the test set
training_set <- startup[split, ]

test_set <- startup[!split, ]

# Building a multiple linear regression model with all possible predictors.
# The model is fitted on the training set only, so the held-out test rows
# never influence the estimated coefficients.

multi_model <- lm(Profit ~ ., data = training_set)

summary(multi_model)

## 
## Call:
## lm(formula = Profit ~ ., data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33635  -4423      0   6272  18586 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.003e+04  7.261e+03   6.890  5.3e-08 ***
## R.D.Spend        7.983e-01  5.421e-02  14.727  < 2e-16 ***
## Administration  -2.738e-02  5.687e-02  -0.482    0.633    
## Marketing.Spend  3.295e-02  2.011e-02   1.638    0.110    
## StateNew York   -1.152e+03  3.098e+03  -0.372    0.712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9747 on 35 degrees of freedom
## Multiple R-squared:  0.9501, Adjusted R-squared:  0.9444 
## F-statistic: 166.6 on 4 and 35 DF,  p-value: < 2.2e-16

# R.D.Spend is the only significant predictor, so refit with it alone

model_good <- lm(Profit ~ R.D.Spend, data = training_set)

summary(model_good)

## 
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34334  -4894   -340   6752  17147 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.902e+04  2.748e+03   17.84   <2e-16 ***
## R.D.Spend   8.563e-01  3.357e-02   25.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9836 on 38 degrees of freedom
## Multiple R-squared:  0.9448, Adjusted R-squared:  0.9434 
## F-statistic: 650.8 on 1 and 38 DF,  p-value: < 2.2e-16

# Predict profit for the test set and keep it next to the observed values

y_pred <- predict(model_good, newdata = test_set)

test_set <- cbind(test_set, Profit_predicted = y_pred)

head(test_set)

##    R.D.Spend Administration Marketing.Spend      State   Profit
## 4   144372.4      118671.85        383199.6   New York 182902.0
## 5   142107.3       91391.77        366168.4 California 166187.9
## 8   130298.1      145530.06        323876.7   New York 155752.6
## 11  101913.1      110594.11        229161.0 California 146122.0
## 16  114523.6      122616.84        261776.2   New York 129917.0
## 20   86419.7      153514.11             0.0   New York 122776.9
##    Profit_predicted
## 4          172647.9
## 5          170708.2
## 8          160595.5
## 11         136288.1
## 16         147087.1
## 20         123020.5

Stepwise selection (predictor selection)

# Backward elimination: start from the model with every predictor

model <- lm(Profit ~ ., data = training_set)

summary(model)

## 
## Call:
## lm(formula = Profit ~ ., data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33635  -4423      0   6272  18586 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.003e+04  7.261e+03   6.890  5.3e-08 ***
## R.D.Spend        7.983e-01  5.421e-02  14.727  < 2e-16 ***
## Administration  -2.738e-02  5.687e-02  -0.482    0.633    
## Marketing.Spend  3.295e-02  2.011e-02   1.638    0.110    
## StateNew York   -1.152e+03  3.098e+03  -0.372    0.712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9747 on 35 degrees of freedom
## Multiple R-squared:  0.9501, Adjusted R-squared:  0.9444 
## F-statistic: 166.6 on 4 and 35 DF,  p-value: < 2.2e-16

# State has the highest p-value and is not significant, so remove it

model1 <- lm(Profit ~ R.D.Spend + Administration + Marketing.Spend, data = training_set)

summary(model1)

## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend, 
##     data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33117  -4858    -36   6020  17957 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4.970e+04  7.120e+03   6.980 3.48e-08 ***
## R.D.Spend        7.983e-01  5.356e-02  14.905  < 2e-16 ***
## Administration  -2.895e-02  5.603e-02  -0.517    0.609    
## Marketing.Spend  3.283e-02  1.987e-02   1.652    0.107    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9629 on 36 degrees of freedom
## Multiple R-squared:  0.9499, Adjusted R-squared:  0.9457 
## F-statistic: 227.6 on 3 and 36 DF,  p-value: < 2.2e-16

# Administration now has the highest p-value, so remove it next
model2 <- lm(Profit ~ R.D.Spend + Marketing.Spend, data = training_set)

summary(model2)

## 
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33294  -4763   -354   6351  17693 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     4.638e+04  3.019e+03  15.364   <2e-16 ***
## R.D.Spend       7.879e-01  4.916e-02  16.026   <2e-16 ***
## Marketing.Spend 3.538e-02  1.905e-02   1.857   0.0713 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9533 on 37 degrees of freedom
## Multiple R-squared:  0.9495, Adjusted R-squared:  0.9468 
## F-statistic: 348.1 on 2 and 37 DF,  p-value: < 2.2e-16

# Marketing.Spend is still above the 0.05 level, so remove it as well
model3 <- lm(Profit ~ R.D.Spend, data = training_set)

summary(model3)

## 
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -34334  -4894   -340   6752  17147 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.902e+04  2.748e+03   17.84   <2e-16 ***
## R.D.Spend   8.563e-01  3.357e-02   25.51   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9836 on 38 degrees of freedom
## Multiple R-squared:  0.9448, Adjusted R-squared:  0.9434 
## F-statistic: 650.8 on 1 and 38 DF,  p-value: < 2.2e-16

# This model is statistically significant. Inspect the diagnostic plots to
# check whether the regression assumptions are satisfied.

# Draw the plots on a 2 x 2 grid; par() returns the previous settings, which
# are restored afterwards so later plots are not forced into the grid.
old_par <- par(mfrow = c(2, 2))

plot(model3)

par(old_par)