R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

################################################################################
#                            INTRODUCTION
#  The purpose of this project is to apply neural networks to model customer
#  behavior using the Online Food dataset. We aim to predict whether a customer
#  will place an order from various demographic and behavioural features, while
#  demonstrating advanced R skills in data wrangling, modeling, and
#  interpretation.
################################################################################


################################################################################
#                            DATA OBSERVATION
################################################################################
# Load required packages and import the dataset
library(readr)
library(class)
## Warning: package 'class' was built under R version 4.4.3
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.4.3
library(nnet)
library(psych)
## Warning: package 'psych' was built under R version 4.4.3
library(NeuralNetTools)
## Warning: package 'NeuralNetTools' was built under R version 4.4.3
set.seed(534)
onlinefood <- read_csv("C:/Users/ijiol/OneDrive/Documents/R projects/R for Advanced Topics/Datasets/onlinefood.csv")
## Rows: 388 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Marital Status, Occupation, Monthly Income, Educational Qua...
## dbl (5): Age, Family size, latitude, longitude, Pin code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(onlinefood)
df<-onlinefood

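# The absolute path above ties the import to one machine; a relative path from
# the project root is more portable (sketch, assuming the CSV sits in a
# "Datasets" folder next to the .Rmd; not run here):
# onlinefood <- read_csv("Datasets/onlinefood.csv", show_col_types = FALSE)
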
#Preview the data
head(df)
## # A tibble: 6 × 12
##     Age Gender `Marital Status` Occupation `Monthly Income`
##   <dbl> <chr>  <chr>            <chr>      <chr>           
## 1    20 Female Single           Student    No Income       
## 2    24 Female Single           Student    Below Rs.10000  
## 3    22 Male   Single           Student    Below Rs.10000  
## 4    22 Female Single           Student    No Income       
## 5    22 Male   Single           Student    Below Rs.10000  
## 6    27 Female Married          Employee   More than 50000 
## # ℹ 7 more variables: `Educational Qualifications` <chr>, `Family size` <dbl>,
## #   latitude <dbl>, longitude <dbl>, `Pin code` <dbl>, Output <chr>,
## #   Feedback <chr>
summary(df)
##       Age           Gender          Marital Status      Occupation       
##  Min.   :18.00   Length:388         Length:388         Length:388        
##  1st Qu.:23.00   Class :character   Class :character   Class :character  
##  Median :24.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :24.63                                                           
##  3rd Qu.:26.00                                                           
##  Max.   :33.00                                                           
##  Monthly Income     Educational Qualifications  Family size       latitude    
##  Length:388         Length:388                 Min.   :1.000   Min.   :12.87  
##  Class :character   Class :character           1st Qu.:2.000   1st Qu.:12.94  
##  Mode  :character   Mode  :character           Median :3.000   Median :12.98  
##                                                Mean   :3.281   Mean   :12.97  
##                                                3rd Qu.:4.000   3rd Qu.:13.00  
##                                                Max.   :6.000   Max.   :13.10  
##    longitude        Pin code         Output            Feedback        
##  Min.   :77.48   Min.   :560001   Length:388         Length:388        
##  1st Qu.:77.57   1st Qu.:560011   Class :character   Class :character  
##  Median :77.59   Median :560034   Mode  :character   Mode  :character  
##  Mean   :77.60   Mean   :560040                                        
##  3rd Qu.:77.63   3rd Qu.:560068                                        
##  Max.   :77.76   Max.   :560109
any(is.na(df))
## [1] FALSE
dim(df)
## [1] 388  12
# table(df) on the whole data frame fails ("attempt to make a table with >= 2^31
# elements"), so tabulate the categorical columns of interest individually
table(df$Feedback)
## 
## Negative Positive 
##       71      317
table(df$Output)
## 
##  No Yes 
##  87 301
#Convert character columns to factors
df$Gender <- as.factor(df$Gender)
df$"Marital Status" <- as.factor(df$"Marital Status")
df$Occupation <- as.factor(df$Occupation)
df$"Monthly Income" <- as.factor(df$"Monthly Income")
df$"Educational Qualifications" <- as.factor(df$"Educational Qualifications")
df$Output <- as.factor(df$Output)
df$Feedback <- as.factor(df$Feedback)
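
# Equivalent, more compact conversion of every character column at once
# (sketch, not run here since the explicit calls above already converted them):
# char_cols <- sapply(df, is.character)
# df[char_cols] <- lapply(df[char_cols], as.factor)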

str(df)
## spc_tbl_ [388 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Age                       : num [1:388] 20 24 22 22 22 27 22 24 23 23 ...
##  $ Gender                    : Factor w/ 2 levels "Female","Male": 1 1 2 1 2 1 2 1 1 1 ...
##  $ Marital Status            : Factor w/ 3 levels "Married","Prefer not to say",..: 3 3 3 3 3 1 3 3 3 3 ...
##  $ Occupation                : Factor w/ 4 levels "Employee","House wife",..: 4 4 4 4 4 1 4 4 4 4 ...
##  $ Monthly Income            : Factor w/ 5 levels "10001 to 25000",..: 5 3 3 5 3 4 5 5 5 5 ...
##  $ Educational Qualifications: Factor w/ 5 levels "Graduate","Ph.D",..: 3 1 3 1 3 3 1 3 3 3 ...
##  $ Family size               : num [1:388] 4 3 3 6 4 2 3 3 2 4 ...
##  $ latitude                  : num [1:388] 13 13 13 12.9 13 ...
##  $ longitude                 : num [1:388] 77.6 77.6 77.7 77.6 77.6 ...
##  $ Pin code                  : num [1:388] 560001 560009 560017 560019 560010 ...
##  $ Output                    : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Feedback                  : Factor w/ 2 levels "Negative","Positive": 2 2 1 2 2 2 2 2 2 2 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   Age = col_double(),
##   ..   Gender = col_character(),
##   ..   `Marital Status` = col_character(),
##   ..   Occupation = col_character(),
##   ..   `Monthly Income` = col_character(),
##   ..   `Educational Qualifications` = col_character(),
##   ..   `Family size` = col_double(),
##   ..   latitude = col_double(),
##   ..   longitude = col_double(),
##   ..   `Pin code` = col_double(),
##   ..   Output = col_character(),
##   ..   Feedback = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Split the data into training (~70%) and test (~30%) sets
ind <- sample(2, nrow(df), replace = TRUE, prob = c(0.7, 0.3))

trainData <- df[ind == 1, ]
testData <- df[ind == 2, ]

# Keep all columns for modelling (the commented lines below would instead drop
# the target, column 11)
#trainData1 = trainData[,-11]
#testData1 = testData[,-11]
trainData1 = trainData
testData1 = testData

trainLabels <- trainData$Output
testLabels <- testData$Output

summary(trainLabels)
##  No Yes 
##  58 202
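
# The 70/30 split above is random and not stratified; with an imbalanced target,
# a stratified split keeps the No/Yes ratio similar in both sets (sketch, not
# run here; uses caret::createDataPartition, with caret loaded later on):
# idx <- caret::createDataPartition(df$Output, p = 0.7, list = FALSE)
# trainData <- df[idx, ]
# testData  <- df[-idx, ]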
# Min-max normalization function (rescales a numeric vector to the 0-1 range)
normalize <- function(x) {
  return ((x-min(x)) / (max(x) - min(x)))
}

numeric_col <- sapply(trainData1, is.numeric)
# Normalize numeric features in train and test
trainData1_norm <- trainData1
testData1_norm <- testData1


trainData1_norm[, numeric_col] <- as.data.frame(lapply(trainData1[, numeric_col], normalize))
testData1_norm[, numeric_col] <- as.data.frame(lapply(testData1[, numeric_col], normalize))
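
# Note: each set above is rescaled with its own min/max. A stricter approach
# reuses the training minima/maxima on the test set so no test-set information
# leaks into preprocessing (sketch, not run here):
# train_min <- sapply(trainData1[, numeric_col], min)
# train_max <- sapply(trainData1[, numeric_col], max)
# testData1_norm[, numeric_col] <- as.data.frame(
#   Map(function(x, mn, mx) (x - mn) / (mx - mn),
#       testData1[, numeric_col], train_min, train_max))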


# Visualize the target distribution, correlations, and pairwise scatterplots of
# the numeric features

barplot(table(df$Output), col = "lightblue", main = "Distribution of Output", ylab = "Frequency")

cor(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])
##                     Age  Family size     latitude   longitude     Pin code
## Age         1.000000000  0.169981512  0.006594549  0.04749956  0.137294125
## Family size 0.169981512  1.000000000 -0.053536733  0.07012644 -0.009402396
## latitude    0.006594549 -0.053536733  1.000000000 -0.14423361 -0.201813497
## longitude   0.047499559  0.070126439 -0.144233613  1.00000000  0.156119479
## Pin code    0.137294125 -0.009402396 -0.201813497  0.15611948  1.000000000
pairs(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])

pairs.panels(df[c("Age", "Family size", "latitude", "longitude", "Pin code")])

################################################################################
#                            MODEL BUILDING & RESULTS
################################################################################

# Build the neural network (single hidden layer with 5 units, weight decay 0.01).
# Note: nnet limits training with maxit (default 100), not epochs, so the epochs
# argument below is ignored and optimisation stops after 100 iterations.
nnModel <- nnet(Output ~ ., data = trainData1_norm, size = 5, decay = 0.01, epochs = 350)
## # weights:  111
## initial  value 146.167027 
## iter  10 value 90.716827
## iter  20 value 71.202593
## iter  30 value 61.886022
## iter  40 value 57.692428
## iter  50 value 51.699518
## iter  60 value 47.613647
## iter  70 value 44.314818
## iter  80 value 41.141551
## iter  90 value 38.419751
## iter 100 value 35.131873
## final  value 35.131873 
## stopped after 100 iterations
summary(nnModel)
## a 20-5-1 network with 111 weights
## options were - entropy fitting  decay=0.01
##   b->h1  i1->h1  i2->h1  i3->h1  i4->h1  i5->h1  i6->h1  i7->h1  i8->h1  i9->h1 
##   -0.71    3.13   -0.87   -1.70    0.47    0.65   -0.88    4.55   -0.13    0.24 
## i10->h1 i11->h1 i12->h1 i13->h1 i14->h1 i15->h1 i16->h1 i17->h1 i18->h1 i19->h1 
##    3.57    2.68    0.97    0.86    2.48    2.47   -4.80   -1.67    4.80   -2.10 
## i20->h1 
##    2.53 
##   b->h2  i1->h2  i2->h2  i3->h2  i4->h2  i5->h2  i6->h2  i7->h2  i8->h2  i9->h2 
##    0.61    1.09    1.11    1.64    5.94   -3.00   -0.07   -4.27    0.55   -8.01 
## i10->h2 i11->h2 i12->h2 i13->h2 i14->h2 i15->h2 i16->h2 i17->h2 i18->h2 i19->h2 
##   -3.78   -5.00   -0.94   -3.85   -3.48    3.45    7.41    1.36   -5.72    8.11 
## i20->h2 
##    0.51 
##   b->h3  i1->h3  i2->h3  i3->h3  i4->h3  i5->h3  i6->h3  i7->h3  i8->h3  i9->h3 
##   -0.06    0.71   -5.85    0.47   -2.28    0.01   -0.39    0.80   -0.19   -0.63 
## i10->h3 i11->h3 i12->h3 i13->h3 i14->h3 i15->h3 i16->h3 i17->h3 i18->h3 i19->h3 
##    1.24   -3.74   -2.34    1.81   -0.12   -0.21   -1.03    1.88   -2.04   -3.26 
## i20->h3 
##    0.09 
##   b->h4  i1->h4  i2->h4  i3->h4  i4->h4  i5->h4  i6->h4  i7->h4  i8->h4  i9->h4 
##    1.23   -5.35   -0.13    0.10    0.33    0.79   -0.57   -0.26    3.04    3.15 
## i10->h4 i11->h4 i12->h4 i13->h4 i14->h4 i15->h4 i16->h4 i17->h4 i18->h4 i19->h4 
##   -2.78   -1.88    3.21    0.18   -0.23   -0.79   -2.81   -0.76   -3.66    4.03 
## i20->h4 
##    7.05 
##   b->h5  i1->h5  i2->h5  i3->h5  i4->h5  i5->h5  i6->h5  i7->h5  i8->h5  i9->h5 
##    0.12    2.07   -0.48   -5.32   -3.72    0.88    0.88    3.24   -2.45    6.72 
## i10->h5 i11->h5 i12->h5 i13->h5 i14->h5 i15->h5 i16->h5 i17->h5 i18->h5 i19->h5 
##    1.95    3.14    1.58    3.80   -1.72   -2.33   -9.19    0.65    1.46   -4.18 
## i20->h5 
##    1.52 
##   b->o  h1->o  h2->o  h3->o  h4->o  h5->o 
##  -0.36  15.01 -12.63  -2.67   9.31 -15.47
plotnet(nnModel)
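
# NeuralNetTools can also summarise variable importance; garson() reports the
# relative importance of each input via Garson's algorithm (sketch):
garson(nnModel)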

# Predict classes for the test set (note: this overwrites base::predict in the
# workspace; a name such as nn_pred would be safer)
predict <- predict(nnModel, newdata = testData1_norm, type = "class")
predict
##   [1] "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "Yes"
##  [13] "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes"
##  [25] "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes"
##  [37] "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes"
##  [49] "Yes" "No"  "No"  "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "Yes"
##  [61] "No"  "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes"
##  [73] "Yes" "Yes" "No"  "No"  "No"  "No"  "No"  "Yes" "No"  "No"  "Yes" "Yes"
##  [85] "Yes" "No"  "No"  "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "Yes" "No" 
##  [97] "Yes" "Yes" "Yes" "No"  "No"  "Yes" "Yes" "Yes" "Yes" "Yes" "Yes" "Yes"
## [109] "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "Yes" "No"  "Yes" "Yes" "Yes" "No" 
## [121] "Yes" "No"  "No"  "No"  "Yes" "Yes" "Yes" "Yes"
# Sanity-check that predictions align with the test labels, then build the
# confusion matrix (note: length() on a data frame counts columns, not rows)
length(testLabels)
## [1] 128
length(testData1)
## [1] 12
length(predict)
## [1] 128
confus_Mat <- table(predict, testLabels)
confus_Mat
##        testLabels
## predict No Yes
##     No  21  11
##     Yes  8  88
# Accuracy calculation
accuracy <- sum(diag(confus_Mat)) / sum(confus_Mat)

# Print the accuracy
print(paste("Accuracy: ", accuracy))
## [1] "Accuracy:  0.8515625"
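# Accuracy alone can flatter a model when the target is imbalanced (about 78%
# of customers are "Yes"), so the class-wise recall rates from the same table
# are worth checking (sketch; rows of confus_Mat are predictions, columns are
# the true labels):
recall_no  <- confus_Mat["No", "No"]   / sum(confus_Mat[, "No"])
recall_yes <- confus_Mat["Yes", "Yes"] / sum(confus_Mat[, "Yes"])
c(No = recall_no, Yes = recall_yes)
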
################################################################################
#                            IMPROVING THE MODEL
################################################################################

# Load caret for model tuning
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## 
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## 
## Loading required package: lattice
# Define cross-validation method
control <- trainControl(method = "cv", number = 5)

# Create tuning grid
tune_grid <- expand.grid(size = c(3, 5, 7, 10), decay = c(0.01, 0.001, 0.0001))

# Train the model across the tuning grid (as with nnet above, iterations are
# controlled by maxit rather than epochs, so the epochs argument has no effect)
set.seed(123)
tuned_model <- train(Output ~ ., data = trainData1_norm, 
                     method = "nnet", 
                     trControl = control,
                     tuneGrid = tune_grid,
                     linout = FALSE,
                     trace = FALSE,
                     epochs = 500)

# View best model
print(tuned_model)
## Neural Network 
## 
## 260 samples
##  11 predictor
##   2 classes: 'No', 'Yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 207, 209, 208, 208, 208 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa    
##    3    1e-04  0.8425083  0.5140427
##    3    1e-03  0.8231239  0.4886297
##    3    1e-02  0.8389581  0.5298187
##    5    1e-04  0.8115128  0.4542445
##    5    1e-03  0.7924969  0.3950488
##    5    1e-02  0.8311178  0.5138384
##    7    1e-04  0.8115911  0.4349432
##    7    1e-03  0.8003429  0.4673695
##    7    1e-02  0.7962036  0.4311516
##   10    1e-04  0.7963459  0.4135841
##   10    1e-03  0.7959859  0.4428946
##   10    1e-02  0.8192052  0.5026792
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 1e-04.
plot(tuned_model)
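
# The selected hyper-parameters and the refitted final network are stored on
# the caret object itself (sketch):
tuned_model$bestTune    # size and decay chosen by cross-validation
tuned_model$finalModel  # the nnet model refit on the full training set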

# Predict on test set using best model
pred_tuned <- predict(tuned_model, newdata = testData1_norm)

# Confusion matrix
conf_matrix <- confusionMatrix(pred_tuned, testLabels)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction No Yes
##        No  17   1
##        Yes 12  98
##                                           
##                Accuracy : 0.8984          
##                  95% CI : (0.8326, 0.9448)
##     No Information Rate : 0.7734          
##     P-Value [Acc > NIR] : 0.0002039       
##                                           
##                   Kappa : 0.6653          
##                                           
##  Mcnemar's Test P-Value : 0.0055457       
##                                           
##             Sensitivity : 0.5862          
##             Specificity : 0.9899          
##          Pos Pred Value : 0.9444          
##          Neg Pred Value : 0.8909          
##              Prevalence : 0.2266          
##          Detection Rate : 0.1328          
##    Detection Prevalence : 0.1406          
##       Balanced Accuracy : 0.7881          
##                                           
##        'Positive' Class : No              
## 
# Accuracy
print(paste("Tuned Accuracy:", round(conf_matrix$overall['Accuracy'], 4)))
## [1] "Tuned Accuracy: 0.8984"
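
# The tuned model can also return class probabilities, which is useful if a
# decision threshold other than 0.5 is preferred (sketch):
prob_tuned <- predict(tuned_model, newdata = testData1_norm, type = "prob")
head(prob_tuned)   # one column per class ("No", "Yes") of predicted probabilities
table(ifelse(prob_tuned$Yes > 0.5, "Yes", "No"))   # matches pred_tuned at the default cutoff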
