# The 50_Startups data set is used to build an artificial neural network model that predicts profit from each startup's R&D, administration and marketing expenditure and its state.
library(neuralnet) # regression
## Warning: package 'neuralnet' was built under R version 3.5.1
library(nnet) # classification
## Warning: package 'nnet' was built under R version 3.5.1
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 3.5.1
library(plyr)
## Warning: package 'plyr' was built under R version 3.5.1
# Read the data
# file.choose() opens an interactive file picker; View() needs an interactive
# session as well.
Startups <- read.csv(file.choose())
View(Startups)
class(Startups)
## [1] "data.frame"
# Encode State with the numeric codes given in the mapping below.
# revalue() only renames the values/levels; when State is a factor (the
# read.csv() default before R 4.0), calling as.numeric() directly on the result
# returns the internal level codes (1, 2, 3 in alphabetical level order) and
# silently ignores the mapping. Converting to character first makes the result
# follow the mapping whether State was read as a factor or as character.
# NOTE: recorded console output further down in this file (correlations and
# summary of State) was captured with the old 1..3 factor codes.
Startups$State <- as.numeric(as.character(revalue(
  Startups$State,
  c("New York" = "0", "California" = "1", "Florida" = "2")
)))
str(Startups)
## 'data.frame': 50 obs. of 5 variables:
## $ R.D.Spend : num 165349 162598 153442 144372 142107 ...
## $ Administration : num 136898 151378 101146 118672 91392 ...
## $ Marketing.Spend: num 471784 443899 407935 383200 366168 ...
## $ State : num 0 1 2 0 2 0 1 2 0 1 ...
## $ Profit : num 192262 191792 191050 182902 166188 ...
# Exploratory data analysis:
# Startups is already a data frame (see class() above), so no conversion is
# needed. with() evaluates each plot call inside the data frame, which keeps the
# bare column names as axis labels without attach()-ing Startups to the search
# path (attach() leaves masked copies of the columns behind).
with(Startups, plot(R.D.Spend, Profit))
with(Startups, plot(Administration, Profit))
with(Startups, plot(Marketing.Spend, Profit))
with(Startups, plot(State, Profit))
# Open a fresh graphics device for the scatter-plot matrix. dev.new() is the
# platform-independent replacement for windows(), which exists on Windows only.
dev.new()
# Scatter-plot matrix of the output (Profit) against the inputs
# (R.D.Spend, Administration, Marketing.Spend, State)
pairs(Startups)
# Correlation coefficients - strength and direction of each pairwise relation
cor(Startups)
## R.D.Spend Administration Marketing.Spend State
## R.D.Spend 1.0000000 0.24195525 0.72424813 0.10468511
## Administration 0.2419552 1.00000000 -0.03215388 0.01184720
## Marketing.Spend 0.7242481 -0.03215388 1.00000000 0.07766961
## State 0.1046851 0.01184720 0.07766961 1.00000000
## Profit 0.9729005 0.20071657 0.74776572 0.10179631
## Profit
## R.D.Spend 0.9729005
## Administration 0.2007166
## Marketing.Spend 0.7477657
## State 0.1017963
## Profit 1.0000000
summary(Startups) # The columns are on very different scales, so the data needs normalizing.
## R.D.Spend Administration Marketing.Spend State
## Min. : 0 Min. : 51283 Min. : 0 Min. :1
## 1st Qu.: 39936 1st Qu.:103731 1st Qu.:129300 1st Qu.:1
## Median : 73051 Median :122700 Median :212716 Median :2
## Mean : 73722 Mean :121345 Mean :211025 Mean :2
## 3rd Qu.:101603 3rd Qu.:144842 3rd Qu.:299469 3rd Qu.:3
## Max. :165349 Max. :182646 Max. :471784 Max. :3
## Profit
## Min. : 14681
## 1st Qu.: 90139
## Median :107978
## Mean :112013
## 3rd Qu.:139766
## Max. :192262
# Apply Normalization technique to the whole dataset :
# Min-max rescale a numeric vector to the [0, 1] interval.
#
# x: numeric vector.
# Returns a vector of the same length as x with min(x) mapped to 0 and max(x)
# mapped to 1. A constant vector (max == min) is mapped to all zeros instead of
# the NaN that 0 / 0 would produce, so a constant column cannot poison the
# model input. NA values in x propagate exactly as before (the whole result is
# NA), because min()/max() are called without na.rm.
normalize <- function(x) {
  lo <- min(x)
  span <- max(x) - lo
  if (!is.na(span) && span == 0) {
    # Zero range: every element equals lo, so x - lo is a vector of zeros.
    return(x - lo)
  }
  (x - lo) / span
}
# Rescale every column of Startups with normalize() and collect the results
# back into a data frame.
Startups_norm <- as.data.frame(lapply(Startups, FUN = normalize))
summary(Startups_norm$Profit) # Normalized form of profit
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.4249 0.5254 0.5481 0.7044 1.0000
# Column names are case-sensitive: the column is `Profit`; `Startups$profit`
# is NULL and summarised to "Length 0 / Class NULL".
summary(Startups$Profit) # Original profit value
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14681 90139 107978 112013 139766 192262
# Data partition: label each row 1 (training, probability 0.7) or
# 2 (test, probability 0.3) at random, then split on that label.
set.seed(123)
ind <- sample.int(
  2,
  size = nrow(Startups_norm),
  replace = TRUE,
  prob = c(0.7, 0.3)
)
Startups_train <- Startups_norm[ind == 1, ]
startups_test <- Startups_norm[ind == 2, ]
# Fit a neural network on the training data: Profit is the response and the
# three spend columns plus State are the inputs. No `hidden` argument is given,
# so the package default architecture is used; the recorded str() output below
# shows a 5x1 input weight matrix (one hidden unit), logistic activation,
# SSE error function and a linear output.
startups_model <- neuralnet(Profit~R.D.Spend+Administration
+Marketing.Spend+State,data = Startups_train)
# Inspect the components of the fitted "nn" object.
str(startups_model)
## List of 13
## $ call : language neuralnet(formula = Profit ~ R.D.Spend + Administration + Marketing.Spend + State, data = Startups_train)
## $ response : num [1:35, 1] 1 0.993 0.801 0.796 0.774 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:35] "1" "3" "6" "7" ...
## .. ..$ : chr "Profit"
## $ covariate : num [1:35, 1:4] 1 0.928 0.798 0.814 0.729 ...
## $ model.list :List of 2
## ..$ response : chr "Profit"
## ..$ variables: chr [1:4] "R.D.Spend" "Administration" "Marketing.Spend" "State"
## $ err.fct :function (x, y)
## ..- attr(*, "type")= chr "sse"
## $ act.fct :function (x)
## ..- attr(*, "type")= chr "logistic"
## $ linear.output : logi TRUE
## $ data :'data.frame': 35 obs. of 5 variables:
## ..$ R.D.Spend : num [1:35] 1 0.928 0.798 0.814 0.729 ...
## ..$ Administration : num [1:35] 0.652 0.38 0.369 0.73 0.742 ...
## ..$ Marketing.Spend: num [1:35] 1 0.865 0.769 0.271 0.66 ...
## ..$ State : num [1:35] 1 0.5 1 0 1 0 0 0.5 0 0.5 ...
## ..$ Profit : num [1:35] 1 0.993 0.801 0.796 0.774 ...
## $ net.result :List of 1
## ..$ : num [1:35, 1] 0.94 0.915 0.833 0.816 0.766 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:35] "1" "3" "6" "7" ...
## .. .. ..$ : NULL
## $ weights :List of 1
## ..$ :List of 2
## .. ..$ : num [1:5, 1] 1.3318 -2.9611 0.186 -0.2393 0.0588
## .. ..$ : num [1:2, 1] 1.11 -1.11
## $ startweights :List of 1
## ..$ :List of 2
## .. ..$ : num [1:5, 1] -1.687 0.838 0.153 -1.138 1.254
## .. ..$ : num [1:2, 1] 0.426 -0.295
## $ generalized.weights:List of 1
## ..$ : num [1:35, 1:4] 7.72 6.2 4.47 4.3 3.95 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:35] "1" "3" "6" "7" ...
## .. .. ..$ : NULL
## $ result.matrix : num [1:10, 1] 0.03571 0.00993 396 1.33177 -2.96109 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:10] "error" "reached.threshold" "steps" "Intercept.to.1layhid1" ...
## .. ..$ : chr "1"
## - attr(*, "class")= chr "nn"
# Draw the fitted network and list the components of the model object.
plot(startups_model, rep = "best")
summary(startups_model)
## Length Class Mode
## call 3 -none- call
## response 35 -none- numeric
## covariate 140 -none- numeric
## model.list 2 -none- list
## err.fct 1 -none- function
## act.fct 1 -none- function
## linear.output 1 -none- logical
## data 5 data.frame list
## net.result 1 -none- list
## weights 1 -none- list
## startweights 1 -none- list
## generalized.weights 1 -none- list
## result.matrix 10 -none- numeric
# plotnet() is drawn with zero margins and a serif font. par() returns the
# previous settings, which are restored afterwards so that later base-graphics
# plots are not drawn with zero margins (which would clip their axes).
old_par <- par(mar = numeric(4), family = "serif")
plotnet(startups_model, alpha = 0.6)
par(old_par)
# Evaluate the model on the held-out rows: feed the first four columns of the
# test set (the inputs) through the network and correlate the predictions with
# the actual normalized Profit.
set.seed(12323)
model_results <- compute(startups_model, startups_test[, 1:4])
predicted_profit <- model_results[["net.result"]]
# Predicted profit vs. actual profit of the test data
cor(predicted_profit, startups_test[["Profit"]])
## [,1]
## [1,] 0.9556347568
# The predictions are on the normalized scale; keep the bounds of the original
# Profit column so they can be mapped back to actual profit values.
str_min <- min(Startups[["Profit"]])
str_max <- max(Startups[["Profit"]])
# Invert min-max scaling.
#
# x:   values on the normalized scale.
# min: lower bound of the original scale (maps from 0).
# max: upper bound of the original scale (maps from 1).
# Returns x mapped linearly back onto [min, max].
unnormalize <- function(x, min, max) {
  span <- max - min
  span * x + min
}
# Map the normalized predictions back to the original profit scale and show the
# first few rows.
ActualProfit_pred <- unnormalize(
  x = predicted_profit,
  min = str_min,
  max = str_max
)
head(ActualProfit_pred)
## [,1]
## 2 181006.9920
## 4 169968.5527
## 5 170593.0556
## 8 159477.3361
## 11 137739.4836
## 16 146600.4791
# Try to improve model performance: refit the same formula on the same training
# data, this time with hidden = 2. The seed is fixed before fitting so that the
# run can be repeated.
set.seed(12345)
Startups_model2 <- neuralnet(Profit~R.D.Spend+Administration
+Marketing.Spend+State,data = Startups_train,
hidden = 2)
# Draw the fitted network and list the components of the model object.
plot(Startups_model2 ,rep = "best")
summary(Startups_model2)
## Length Class Mode
## call 4 -none- call
## response 35 -none- numeric
## covariate 140 -none- numeric
## model.list 2 -none- list
## err.fct 1 -none- function
## act.fct 1 -none- function
## linear.output 1 -none- logical
## data 5 data.frame list
## net.result 1 -none- list
## weights 1 -none- list
## startweights 1 -none- list
## generalized.weights 1 -none- list
## result.matrix 16 -none- numeric
# Evaluate the second model on the test inputs (first four columns) and
# correlate its predictions with the actual normalized Profit.
model_results2 <- compute(Startups_model2, startups_test[1:4])
predicted_Profit2 <- model_results2$net.result
cor(predicted_Profit2, startups_test$Profit)
## [,1]
## [1,] 0.9639338485
plot(predicted_Profit2, startups_test$Profit)
# plotnet() is drawn with zero margins and a serif font. par() returns the
# previous settings, which are restored afterwards so the changed graphics
# parameters do not leak into whatever is plotted next.
old_par <- par(mar = numeric(4), family = "serif")
plotnet(Startups_model2, alpha = 0.6)
par(old_par)
# The SSE (error) decreased and the number of training steps increased when the number of neurons in the hidden layer was increased.