This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
################################################################################
# INTRODUCTIION
# The purpose of this project is to apply nural networks to model customer
# behavior using the Online food dataset. We aim to predict weather a customer
# will place an order using various features.
# using various demograpic and behavioural features, while demonstrating advanced
# R skills in data wrangling, modeling, and interpretion.
################################################################################
################################################################################
# DATA OBSERVATION
################################################################################
#import our dataset
library(readr)
library(class)
## Warning: package 'class' was built under R version 4.4.3
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.4.3
library(nnet)
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 4.4.3
set.seed(534)
onlinefood <- read_csv("C:/Users/ijiol/OneDrive/Documents/R projects/R for Advanced Topics/Datasets/onlinefood.csv")
## Rows: 388 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Marital Status, Occupation, Monthly Income, Educational Qua...
## dbl (5): Age, Family size, latitude, longitude, Pin code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(onlinefood)
df<-onlinefood
#Preview the data
head(df)
## # A tibble: 6 × 12
## Age Gender `Marital Status` Occupation `Monthly Income`
## <dbl> <chr> <chr> <chr> <chr>
## 1 20 Female Single Student No Income
## 2 24 Female Single Student Below Rs.10000
## 3 22 Male Single Student Below Rs.10000
## 4 22 Female Single Student No Income
## 5 22 Male Single Student Below Rs.10000
## 6 27 Female Married Employee More than 50000
## # ℹ 7 more variables: `Educational Qualifications` <chr>, `Family size` <dbl>,
## # latitude <dbl>, longitude <dbl>, `Pin code` <dbl>, Output <chr>,
## # Feedback <chr>
summary(df)
## Age Gender Marital Status Occupation
## Min. :18.00 Length:388 Length:388 Length:388
## 1st Qu.:23.00 Class :character Class :character Class :character
## Median :24.00 Mode :character Mode :character Mode :character
## Mean :24.63
## 3rd Qu.:26.00
## Max. :33.00
## Monthly Income Educational Qualifications Family size latitude
## Length:388 Length:388 Min. :1.000 Min. :12.87
## Class :character Class :character 1st Qu.:2.000 1st Qu.:12.94
## Mode :character Mode :character Median :3.000 Median :12.98
## Mean :3.281 Mean :12.97
## 3rd Qu.:4.000 3rd Qu.:13.00
## Max. :6.000 Max. :13.10
## longitude Pin code Output Feedback
## Min. :77.48 Min. :560001 Length:388 Length:388
## 1st Qu.:77.57 1st Qu.:560011 Class :character Class :character
## Median :77.59 Median :560034 Mode :character Mode :character
## Mean :77.60 Mean :560040
## 3rd Qu.:77.63 3rd Qu.:560068
## Max. :77.76 Max. :560109
any(is.na(df))
## [1] FALSE
dim(df)
## [1] 388 12
#Understand why we got the error "Error in table(df) : attempt to make a table with >= 2^31 elements)
table(df$Feedback)
##
## Negative Positive
## 71 317
table(df$Output)
##
## No Yes
## 87 301
#Convert character columns to factors
df$Gender <- as.factor(df$Gender)
df$"Marital Status" <- as.factor(df$"Marital Status")
df$Occupation <- as.factor(df$Occupation)
df$"Monthly Income" <- as.factor(df$"Monthly Income")
df$"Educational Qualifications" <- as.factor(df$"Educational Qualifications")
df$Output <- as.factor(df$Output)
df$Feedback <- as.factor(df$Feedback)
str(df)
## spc_tbl_ [388 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:388] 20 24 22 22 22 27 22 24 23 23 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 1 2 1 1 1 ...
## $ Marital Status : Factor w/ 3 levels "Married","Prefer not to say",..: 3 3 3 3 3 1 3 3 3 3 ...
## $ Occupation : Factor w/ 4 levels "Employee","House wife",..: 4 4 4 4 4 1 4 4 4 4 ...
## $ Monthly Income : Factor w/ 5 levels "10001 to 25000",..: 5 3 3 5 3 4 5 5 5 5 ...
## $ Educational Qualifications: Factor w/ 5 levels "Graduate","Ph.D",..: 3 1 3 1 3 3 1 3 3 3 ...
## $ Family size : num [1:388] 4 3 3 6 4 2 3 3 2 4 ...
## $ latitude : num [1:388] 13 13 13 12.9 13 ...
## $ longitude : num [1:388] 77.6 77.6 77.7 77.6 77.6 ...
## $ Pin code : num [1:388] 560001 560009 560017 560019 560010 ...
## $ Output : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
## $ Feedback : Factor w/ 2 levels "Negative","Positive": 2 2 1 2 2 2 2 2 2 2 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Gender = col_character(),
## .. `Marital Status` = col_character(),
## .. Occupation = col_character(),
## .. `Monthly Income` = col_character(),
## .. `Educational Qualifications` = col_character(),
## .. `Family size` = col_double(),
## .. latitude = col_double(),
## .. longitude = col_double(),
## .. `Pin code` = col_double(),
## .. Output = col_character(),
## .. Feedback = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
#Splitting the data into test and train set
ind <- sample(2, nrow(df), replace = TRUE, prob = c(0.7, 0.3))
trainData <- df[ind == 1, ]
testData <- df[ind == 2, ]
#removes the target value
#trainData1 = trainData[,-11]
#testData1 = testData[,-11]
trainData1 = trainData
testData1 = testData
trainLabels <- trainData$Output
testLabels <- testData$Output
summary(trainLabels)
## No Yes
## 58 202
#normalize the dataset
normalize <- function(x) {
return ((x-min(x)) / (max(x) - min(x)))
}
numeric_col <- sapply(trainData1, is.numeric)
# Normalize numeric features in train and test
trainData1_norm <- trainData1
testData1_norm <- testData1
trainData1_norm[, numeric_col] <- as.data.frame(lapply(trainData1[, numeric_col], normalize))
testData1_norm[, numeric_col] <- as.data.frame(lapply(testData1[, numeric_col], normalize))
#Visualizing relationships using scatterplots
barplot(table(df$Output), col = "lightblue", main = "Distribution of Output", ylab = "Frequency")
cor(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])
## Age Family size latitude longitude Pin code
## Age 1.000000000 0.169981512 0.006594549 0.04749956 0.137294125
## Family size 0.169981512 1.000000000 -0.053536733 0.07012644 -0.009402396
## latitude 0.006594549 -0.053536733 1.000000000 -0.14423361 -0.201813497
## longitude 0.047499559 0.070126439 -0.144233613 1.00000000 0.156119479
## Pin code 0.137294125 -0.009402396 -0.201813497 0.15611948 1.000000000
pairs(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])
pairs.panels(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])
################################################################################
# Model Building & Results m###############################################################################
#Building the model
nnModel <- nnet(Output ~ ., data = trainData1_norm, size = 5, decay = 0.01, epochs = 350)
## # weights: 111
## initial value 146.167027
## iter 10 value 90.716827
## iter 20 value 71.202593
## iter 30 value 61.886022
## iter 40 value 57.692428
## iter 50 value 51.699518
## iter 60 value 47.613647
## iter 70 value 44.314818
## iter 80 value 41.141551
## iter 90 value 38.419751
## iter 100 value 35.131873
## final value 35.131873
## stopped after 100 iterations
summary(nnModel)
## a 20-5-1 network with 111 weights
## options were - entropy fitting decay=0.01
## b->h1 i1->h1 i2->h1 i3->h1 i4->h1 i5->h1 i6->h1 i7->h1 i8->h1 i9->h1
## -0.71 3.13 -0.87 -1.70 0.47 0.65 -0.88 4.55 -0.13 0.24
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1
## 3.57 2.68 0.97 0.86 2.48 2.47 -4.80 -1.67 4.80 -2.10
## i20->h1
## 2.53
## b->h2 i1->h2 i2->h2 i3->h2 i4->h2 i5->h2 i6->h2 i7->h2 i8->h2 i9->h2
## 0.61 1.09 1.11 1.64 5.94 -3.00 -0.07 -4.27 0.55 -8.01
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2
## -3.78 -5.00 -0.94 -3.85 -3.48 3.45 7.41 1.36 -5.72 8.11
## i20->h2
## 0.51
## b->h3 i1->h3 i2->h3 i3->h3 i4->h3 i5->h3 i6->h3 i7->h3 i8->h3 i9->h3
## -0.06 0.71 -5.85 0.47 -2.28 0.01 -0.39 0.80 -0.19 -0.63
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3
## 1.24 -3.74 -2.34 1.81 -0.12 -0.21 -1.03 1.88 -2.04 -3.26
## i20->h3
## 0.09
## b->h4 i1->h4 i2->h4 i3->h4 i4->h4 i5->h4 i6->h4 i7->h4 i8->h4 i9->h4
## 1.23 -5.35 -0.13 0.10 0.33 0.79 -0.57 -0.26 3.04 3.15
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4
## -2.78 -1.88 3.21 0.18 -0.23 -0.79 -2.81 -0.76 -3.66 4.03
## i20->h4
## 7.05
## b->h5 i1->h5 i2->h5 i3->h5 i4->h5 i5->h5 i6->h5 i7->h5 i8->h5 i9->h5
## 0.12 2.07 -0.48 -5.32 -3.72 0.88 0.88 3.24 -2.45 6.72
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5
## 1.95 3.14 1.58 3.80 -1.72 -2.33 -9.19 0.65 1.46 -4.18
## i20->h5
## 1.52
## b->o h1->o h2->o h3->o h4->o h5->o
## -0.36 15.01 -12.63 -2.67 9.31 -15.47
plotnet(nnModel)
#predict with the model
predict <- predict(nnModel, newdata = testData1_norm, type = "class")
predict
## [1] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes"
## [13] "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes"
## [25] "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes"
## [37] "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes"
## [49] "Yes" "No" "No" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes"
## [61] "No" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes"
## [73] "Yes" "Yes" "No" "No" "No" "No" "No" "Yes" "No" "No" "Yes" "Yes"
## [85] "Yes" "No" "No" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "No"
## [97] "Yes" "Yes" "Yes" "No" "No" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [109] "Yes" "Yes" "No" "Yes" "Yes" "Yes" "Yes" "No" "Yes" "Yes" "Yes" "No"
## [121] "Yes" "No" "No" "No" "Yes" "Yes" "Yes" "Yes"
# Confusion matrix
length(testLabels)
## [1] 128
length(testData1)
## [1] 12
length(predict)
## [1] 128
confus_Mat <- table(predict, testLabels)
confus_Mat
## testLabels
## predict No Yes
## No 21 11
## Yes 8 88
# Accuracy calculation
accuracy <- sum(diag(confus_Mat)) / sum(confus_Mat)
# Print the accuracy
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy: 0.8515625"
################################################################################
# Improving the Model
################################################################################
# Load caret for model tuning
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
##
## The following objects are masked from 'package:psych':
##
## %+%, alpha
##
## Loading required package: lattice
# Define cross-validation method
control <- trainControl(method = "cv", number = 5)
# Create tuning grid
tune_grid <- expand.grid(size = c(3, 5, 7, 10), decay = c(0.01, 0.001, 0.0001))
# Train model with tuning
set.seed(123)
tuned_model <- train(Output ~ ., data = trainData1_norm,
method = "nnet",
trControl = control,
tuneGrid = tune_grid,
linout = FALSE,
trace = FALSE,
epochs = 500)
# View best model
print(tuned_model)
## Neural Network
##
## 260 samples
## 11 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 207, 209, 208, 208, 208
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 3 1e-04 0.8425083 0.5140427
## 3 1e-03 0.8231239 0.4886297
## 3 1e-02 0.8389581 0.5298187
## 5 1e-04 0.8115128 0.4542445
## 5 1e-03 0.7924969 0.3950488
## 5 1e-02 0.8311178 0.5138384
## 7 1e-04 0.8115911 0.4349432
## 7 1e-03 0.8003429 0.4673695
## 7 1e-02 0.7962036 0.4311516
## 10 1e-04 0.7963459 0.4135841
## 10 1e-03 0.7959859 0.4428946
## 10 1e-02 0.8192052 0.5026792
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 1e-04.
plot(tuned_model)
# Predict on test set using best model
pred_tuned <- predict(tuned_model, newdata = testData1_norm)
# Confusion matrix
conf_matrix <- confusionMatrix(pred_tuned, testLabels)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 17 1
## Yes 12 98
##
## Accuracy : 0.8984
## 95% CI : (0.8326, 0.9448)
## No Information Rate : 0.7734
## P-Value [Acc > NIR] : 0.0002039
##
## Kappa : 0.6653
##
## Mcnemar's Test P-Value : 0.0055457
##
## Sensitivity : 0.5862
## Specificity : 0.9899
## Pos Pred Value : 0.9444
## Neg Pred Value : 0.8909
## Prevalence : 0.2266
## Detection Rate : 0.1328
## Detection Prevalence : 0.1406
## Balanced Accuracy : 0.7881
##
## 'Positive' Class : No
##
# Accuracy
print(paste("Tuned Accuracy:", round(conf_matrix$overall['Accuracy'], 4)))
## [1] "Tuned Accuracy: 0.8984"
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.