These exercises and practice notes are based on the Udemy course "Machine Learning A-Z".
Part 1: Data preprocessing
# Load the sample data set (Country, Age, Salary, Purchased) from GitHub
# and preview its first six rows.
df <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Machine-Learning-Data/Data.csv"
)
head(df, n = 6L)
## Country Age Salary Purchased
## 1 France 44 72000 No
## 2 Spain 27 48000 Yes
## 3 Germany 30 54000 No
## 4 Spain 38 61000 No
## 5 Germany 40 NA Yes
## 6 France 35 58000 Yes
Dealing with missing data
# Replace missing values in the numeric columns with the column mean.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change the meaning of `na.rm`.
impute_mean <- function(x) {
  # The mean is computed from the observed (non-missing) values only.
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
}
# Salary
df$Salary <- impute_mean(df$Salary)
# Age
df$Age <- impute_mean(df$Age)
head(df, 10)
## Country Age Salary Purchased
## 1 France 44.00000 72000.00 No
## 2 Spain 27.00000 48000.00 Yes
## 3 Germany 30.00000 54000.00 No
## 4 Spain 38.00000 61000.00 No
## 5 Germany 40.00000 63777.78 Yes
## 6 France 35.00000 58000.00 Yes
## 7 Spain 38.77778 52000.00 No
## 8 France 48.00000 79000.00 Yes
## 9 Germany 50.00000 83000.00 No
## 10 France 37.00000 67000.00 Yes
Encode the categorical variables
# Recode Country as a factor with numeric labels:
# 1 = France, 2 = Germany, 3 = Spain.
df$Country <- factor(
  x = df$Country,
  levels = c("France", "Germany", "Spain"),
  labels = 1:3
)
head(df, n = 6L)
## Country Age Salary Purchased
## 1 1 44 72000.00 No
## 2 3 27 48000.00 Yes
## 3 2 30 54000.00 No
## 4 3 38 61000.00 No
## 5 2 40 63777.78 Yes
## 6 1 35 58000.00 Yes
# Recode Purchased as a factor with numeric labels: 0 = No, 1 = Yes.
df$Purchased <- factor(
  x = df$Purchased,
  levels = c("No", "Yes"),
  labels = 0:1
)
head(df, n = 10)
## Country Age Salary Purchased
## 1 1 44.00000 72000.00 0
## 2 3 27.00000 48000.00 1
## 3 2 30.00000 54000.00 0
## 4 3 38.00000 61000.00 0
## 5 2 40.00000 63777.78 1
## 6 1 35.00000 58000.00 1
## 7 3 38.77778 52000.00 0
## 8 1 48.00000 79000.00 1
## 9 2 50.00000 83000.00 0
## 10 1 37.00000 67000.00 1
Splitting data set into train and test sets
library(caTools)
## Warning: package 'caTools' was built under R version 3.2.5
# Fix the random seed so the split is reproducible, then assign 80% of the
# rows to the training set and the remaining rows to the test set.
# `TRUE`/`FALSE` are used instead of `T`/`F`, which are plain variables
# that can be reassigned.
set.seed(123)
split <- sample.split(df$Purchased, SplitRatio = 0.8)
train <- subset(df, split == TRUE) # training set
test <- subset(df, split == FALSE) # test set
head(train) # Check the first few rows
## Country Age Salary Purchased
## 1 1 44.00000 72000.00 0
## 2 3 27.00000 48000.00 1
## 3 2 30.00000 54000.00 0
## 4 3 38.00000 61000.00 0
## 5 2 40.00000 63777.78 1
## 7 3 38.77778 52000.00 0
Feature Scaling
* Standardization of a variable
\[ X_{stand} = \frac{X - \mathrm{mean}(X)}{\mathrm{sd}(X)} \]
* Normalization of a variable
\[ X_{norm} = \frac{X - \min(X)}{\max(X) - \min(X)} \]
# Applying feature scaling (standardization) to the Age and Salary columns
# (columns 2 and 3) of the train and test sets.
# The centre (mean) and scale (sd) are estimated on the training set only and
# then reused for the test set, so both sets are expressed on the same scale
# and no information from the test set influences the transformation.
train_scaled <- scale(train[, c(2, 3)])
test[, c(2, 3)] <- scale(
  test[, c(2, 3)],
  center = attr(train_scaled, "scaled:center"),
  scale = attr(train_scaled, "scaled:scale")
)
train[, c(2, 3)] <- train_scaled
head(train, 10)
## Country Age Salary Purchased
## 1 1 0.90101716 0.9392746 0
## 2 3 -1.58847494 -1.3371160 1
## 3 2 -1.14915281 -0.7680183 0
## 4 3 0.02237289 -0.1040711 0
## 5 2 0.31525431 0.1594000 1
## 7 3 0.13627122 -0.9577176 0
## 8 1 1.48678000 1.6032218 1
## 10 1 -0.12406783 0.4650265 1
Part 2: Regression Models
Simple Linear Regression
# Load the salary data (YearsExperience, Salary) from GitHub and preview
# its first six rows.
df1 <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Machine-Learning-Data/Salary_Data.csv"
)
head(df1, n = 6L)
## YearsExperience Salary
## 1 1.1 39343
## 2 1.3 46205
## 3 1.5 37731
## 4 2.0 43525
## 5 2.2 39891
## 6 2.9 56642
Building a simple linear regression using df1 dataset
# Splitting data into training (70%) and test (30%) sets.
# `TRUE`/`FALSE` are used instead of `T`/`F`, which can be reassigned.
library(caTools)
set.seed(123)
split <- sample.split(df1$Salary, SplitRatio = 0.7)
# training set
training_set <- subset(df1, split == TRUE)
# test set
test_set <- subset(df1, split == FALSE)
# Build a simple linear regression of Salary on YearsExperience using the
# training set only
model1 <- lm(Salary ~ YearsExperience, data = training_set)
# Check the summary of the model
summary(model1)
##
## Call:
## lm(formula = Salary ~ YearsExperience, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7193.8 -3833.2 732.8 3283.1 8885.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25845.9 2594.3 9.962 5.59e-09 ***
## YearsExperience 9285.5 402.8 23.051 2.38e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5329 on 19 degrees of freedom
## Multiple R-squared: 0.9655, Adjusted R-squared: 0.9637
## F-statistic: 531.3 on 1 and 19 DF, p-value: 2.376e-15
# List the attributes of the fitted model object: the names of its
# components and its class.
attributes(model1)
## $names
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
##
## $class
## [1] "lm"
# Fitted values are the model's predictions for the rows it was trained on.
# Compute them and store them next to the observed salaries.
fitted <- predict(object = model1, newdata = training_set)
training_pred <- cbind(training_set, fitted = fitted)
head(training_pred, n = 6L)
## YearsExperience Salary fitted
## 1 1.1 39343 36059.94
## 3 1.5 37731 39774.13
## 6 2.9 56642 52773.79
## 7 3.0 60150 53702.34
## 9 3.2 64445 55559.44
## 10 3.7 57189 60202.17
# Predict salaries for the test set and tag those rows as "Predicted".
y_pred <- predict(object = model1, newdata = test_set)
test_pred <- data.frame(
  YearsExperience = test_set[["YearsExperience"]],
  Salary = y_pred,
  Type1 = "Predicted"
)
# Training rows together with the values the model fitted for them.
training_fit <- data.frame(
  training_set,
  model_fitted = model1[["fitted.values"]]
)
# Create a new data frame, which includes both predicted and observed values
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Label the training rows as "Observed" and stack the predicted test rows
# underneath them.
training_model <- bind_rows(
  dplyr::mutate(training_fit, Type1 = "Observed"),
  test_pred
)
head(training_model, n = 6L)
## YearsExperience Salary model_fitted Type1
## 1 1.1 39343 36059.94 Observed
## 2 1.5 37731 39774.13 Observed
## 3 2.9 56642 52773.79 Observed
## 4 3.0 60150 53702.34 Observed
## 5 3.2 64445 55559.44 Observed
## 6 3.7 57189 60202.17 Observed
# Visualizing the result
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# Observed training salaries and predicted test salaries, coloured by type,
# with the regression line drawn from the training-set fitted values.
# Columns are referenced by bare name inside aes(): ggplot2 discourages
# `data$column` there because it bypasses the data attached to the layer.
ggplot(data = training_model, aes(x = YearsExperience, y = Salary)) +
  geom_point(aes(color = Type1)) +
  geom_line(
    data = training_pred,
    aes(x = YearsExperience, y = fitted),
    color = 4
  ) +
  xlab("Year of Experience") +
  ylab("Salary (Dollars)") +
  ggtitle("Year of Experience vs Salary") +
  labs(color = "Salary Data") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
# Visualize the test set: observed test salaries as points, with the
# regression line fitted on the training set drawn on top.
# The line's data is passed as its own data frame so that aes() can use bare
# column names instead of `training_set$...` / `model1$...`, which ggplot2
# discourages inside aes().
ggplot(data = test_set, aes(x = YearsExperience, y = Salary)) +
  geom_point(col = 4) +
  geom_line(
    data = data.frame(
      YearsExperience = training_set$YearsExperience,
      Salary = model1$fitted.values
    ),
    color = 2
  ) +
  xlab("Year of Experience") +
  ylab("Salary (Dollars)") +
  ggtitle("Year of Experience vs Salary") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))
**Multiple Linear Regression**
Linear regression rests on several essential assumptions. Before fitting any linear regression model, the following assumptions should be checked:
Linearity
Homoscedasticity
Multivariate normality
Independence of errors
Lack of multicollinearity
# Load the 50-startups data set from GitHub and preview its first six rows.
startup <- read.csv(
  file = "https://raw.githubusercontent.com/tuyenhavan/Statistics/Dataset/50-Startups.csv"
)
head(startup, n = 6L)
## R.D.Spend Administration Marketing.Spend State Profit
## 1 165349.2 136897.80 471784.1 New York 192261.8
## 2 162597.7 151377.59 443898.5 California 191792.1
## 3 153441.5 101145.55 407934.5 California 191050.4
## 4 144372.4 118671.85 383199.6 New York 182902.0
## 5 142107.3 91391.77 366168.4 California 166187.9
## 6 131876.9 99814.71 362861.4 New York 156991.1
# Inspect the structure of the data frame: dimensions, column types and a
# preview of each column's values.
str(startup)
## 'data.frame': 50 obs. of 5 variables:
## $ R.D.Spend : num 165349 162598 153442 144372 142107 ...
## $ Administration : num 136898 151378 101146 118672 91392 ...
## $ Marketing.Spend: num 471784 443899 407935 383200 366168 ...
## $ State : Factor w/ 2 levels "California","New York": 2 1 1 2 1 2 1 2 2 1 ...
## $ Profit : num 192262 191792 191050 182902 166188 ...
# Show the distinct values of the State column.
unique(startup$State)
## [1] New York California
## Levels: California New York
# NOTE: because unique() is applied first, every distinct state is counted
# exactly once here; per-state frequencies would come from
# table(startup$State) instead.
table(unique(startup$State)) # Tabulates the distinct states (one entry each)
##
## California New York
## 1 1
When building a linear regression model, predictors are usually chosen with one of the following strategies:
All feed in
Forward selection
Backward elimination: remove predictors with insignificant p-values
Bidirectional selection
\[ 2^n - 1 \text{ possible models, where } n \text{ is the number of predictors} \]
# Multiple linear regression building
## Splitting data into training (80%) and test (20%) sets.
## `<-` is used for assignment and `TRUE`/`FALSE` instead of `T`/`F`,
## which are plain variables that can be reassigned.
library(caTools)
set.seed(123)
split <- sample.split(startup$Profit, SplitRatio = 0.8)
training_set <- subset(startup, split == TRUE)
test_set <- subset(startup, split == FALSE)
# Building a multiple linear regression model with all possible predictors.
# NOTE(review): this model is fitted on the full `startup` data, not on
# `training_set` -- confirm that this is intended; the later models all use
# `training_set`.
multi_model <- lm(Profit ~ ., data = startup)
summary(multi_model)
##
## Call:
## lm(formula = Profit ~ ., data = startup)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34163 -4312 113 6631 17916
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.042e+04 6.654e+03 7.577 1.43e-09 ***
## R.D.Spend 8.080e-01 4.575e-02 17.662 < 2e-16 ***
## Administration -2.362e-02 5.186e-02 -0.455 0.651
## Marketing.Spend 2.637e-02 1.668e-02 1.581 0.121
## StateNew York -1.332e+03 2.690e+03 -0.495 0.623
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9309 on 45 degrees of freedom
## Multiple R-squared: 0.951, Adjusted R-squared: 0.9467
## F-statistic: 218.4 on 4 and 45 DF, p-value: < 2.2e-16
# Only R.D.Spend is statistically significant in the full model, so refit
# with it as the sole predictor, using the training set.
model_good <- lm(formula = Profit ~ R.D.Spend, data = training_set)
summary(object = model_good)
##
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34334 -4894 -340 6752 17147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.902e+04 2.748e+03 17.84 <2e-16 ***
## R.D.Spend 8.563e-01 3.357e-02 25.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9836 on 38 degrees of freedom
## Multiple R-squared: 0.9448, Adjusted R-squared: 0.9434
## F-statistic: 650.8 on 1 and 38 DF, p-value: < 2.2e-16
# Predict profit for the test set and append the predictions as a new
# column next to the observed values.
y_pred <- predict(object = model_good, newdata = test_set)
test_set <- cbind(test_set, Profit_predicted = y_pred)
head(test_set, n = 6L)
## R.D.Spend Administration Marketing.Spend State Profit
## 4 144372.4 118671.85 383199.6 New York 182902.0
## 5 142107.3 91391.77 366168.4 California 166187.9
## 8 130298.1 145530.06 323876.7 New York 155752.6
## 11 101913.1 110594.11 229161.0 California 146122.0
## 16 114523.6 122616.84 261776.2 New York 129917.0
## 20 86419.7 153514.11 0.0 New York 122776.9
## Profit_predicted
## 4 172647.9
## 5 170708.2
## 8 160595.5
## 11 136288.1
## 16 147087.1
## 20 123020.5
Stepwise selection of predictors
# Backward elimination: start from the model with every predictor,
# fitted on the training set.
model <- lm(formula = Profit ~ ., data = training_set)
summary(object = model)
##
## Call:
## lm(formula = Profit ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33635 -4423 0 6272 18586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.003e+04 7.261e+03 6.890 5.3e-08 ***
## R.D.Spend 7.983e-01 5.421e-02 14.727 < 2e-16 ***
## Administration -2.738e-02 5.687e-02 -0.482 0.633
## Marketing.Spend 3.295e-02 2.011e-02 1.638 0.110
## StateNew York -1.152e+03 3.098e+03 -0.372 0.712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9747 on 35 degrees of freedom
## Multiple R-squared: 0.9501, Adjusted R-squared: 0.9444
## F-statistic: 166.6 on 4 and 35 DF, p-value: < 2.2e-16
# State has the highest p-value and is not significant, so remove it.
model1 <- lm(
  formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
  data = training_set
)
summary(object = model1)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend,
## data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33117 -4858 -36 6020 17957
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.970e+04 7.120e+03 6.980 3.48e-08 ***
## R.D.Spend 7.983e-01 5.356e-02 14.905 < 2e-16 ***
## Administration -2.895e-02 5.603e-02 -0.517 0.609
## Marketing.Spend 3.283e-02 1.987e-02 1.652 0.107
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9629 on 36 degrees of freedom
## Multiple R-squared: 0.9499, Adjusted R-squared: 0.9457
## F-statistic: 227.6 on 3 and 36 DF, p-value: < 2.2e-16
# Next elimination step: drop Administration from the predictors.
model2 <- lm(
  formula = Profit ~ R.D.Spend + Marketing.Spend,
  data = training_set
)
summary(object = model2)
##
## Call:
## lm(formula = Profit ~ R.D.Spend + Marketing.Spend, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33294 -4763 -354 6351 17693
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.638e+04 3.019e+03 15.364 <2e-16 ***
## R.D.Spend 7.879e-01 4.916e-02 16.026 <2e-16 ***
## Marketing.Spend 3.538e-02 1.905e-02 1.857 0.0713 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9533 on 37 degrees of freedom
## Multiple R-squared: 0.9495, Adjusted R-squared: 0.9468
## F-statistic: 348.1 on 2 and 37 DF, p-value: < 2.2e-16
# Final elimination step: drop Marketing.Spend, leaving R.D.Spend as the
# only predictor.
model3 <- lm(formula = Profit ~ R.D.Spend, data = training_set)
summary(object = model3)
##
## Call:
## lm(formula = Profit ~ R.D.Spend, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -34334 -4894 -340 6752 17147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.902e+04 2.748e+03 17.84 <2e-16 ***
## R.D.Spend 8.563e-01 3.357e-02 25.51 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9836 on 38 degrees of freedom
## Multiple R-squared: 0.9448, Adjusted R-squared: 0.9434
## F-statistic: 650.8 on 1 and 38 DF, p-value: < 2.2e-16
# This model is statistically significant. Plot the model diagnostics to
# check whether the regression assumptions are satisfied.
# The plots are arranged in a 2 x 2 grid; the previous graphics layout is
# saved and restored afterwards so later plots are not affected.
old_par <- par(mfrow = c(2, 2))
plot(model3)
par(old_par)