Data Wrangling (Data Preprocessing)

Setup

# Load the necessary packages required to produce the report. 
library(knitr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(magrittr)

# Video Presentation Link : https://rmit-arc.instructuremedia.com/embed/04cc62f2-1edd-4708-bbba-413dbbaefff4

Data generation

# Data generation, provide your R codes here.

# Generate the customer number and account number(1.1/1.2)
cust_num <- seq(1,100)

seed_set <- set.seed(1709)
acc_num <- sample(10000:10999,100, replace = FALSE)

# Generate Purchases made data (1.3)
purchases_made <- sample(1:300,100, replace = TRUE)

# Generate Average Items Purchase (1.4)
ave_item <- sample(1:40,100,replace = TRUE)

# Generate most purchased item (1.5)
items <- c("milk","sugar","coffee","coke","apple","chicken","beef","pork",
           "taco","fish","chips","cake","bread","ice cream","dog food",
           "toilet paper","magazine","toothpaste","cookies","lettuce","apple",
           "tomatoes","spinach","broccoli","rice","pasta","quinoa")

most_purchased_item <- sample(items,100,replace = TRUE)

# Create first data frame using generated data
synthetic1 <- data.frame(cust_num,acc_num,purchases_made,ave_item,most_purchased_item)

## Create missing values for dataframe 1
synthetic1 %<>%
  mutate(purchases_made = case_when(purchases_made >= 15 ~ purchases_made))

# Generate ave spend (2.3)- correlation to ave item
synthetic2 <- data.frame(cust_num,acc_num,ave_item)
synthetic2 %<>%
  mutate(ave_spend = (ave_item * 10.78) + (ave_item*3.2))

# Generate rewards account status with missing values (2.4)
set.seed(170)
synthetic2["Rewards Account"] = FALSE
for (i in seq(1,100)){
  ii <- sample(0:100,1)
  if (ii >= 45){
    synthetic2[i,"Rewards Account"] = TRUE
  }
  if (ii>90){
      synthetic2[i,"Rewards Account"] = NaN
  }
}
  
# Generate date joined (2.5)
date_joined <- sample(seq(as.Date("2017/01/01"), as.Date("2023/01/01"), by = "day"),100)
synthetic2<-data.frame(synthetic2,date_joined)

head(synthetic1)

head(synthetic2)

# These two data frames contain the following variables

# cust_num - This is the customer number in order of who signed up - Nominal variable
# acc_num - This is a unique identifier used to perform actions on accounts- Categorical variable
# purchases_made - This is the number of orders the customer has made - Nominal variable
# ave_item - This is the number of items a customer purchases in one transaction - Nominal variable
# most_purchased_item - This is the customers most frequently purchased item - Categorical variable 
# ave_spend - This is the customers average spend per transaction - Nominal variable
# Rewards Account - This is whether or not the customer has signed up for special offers and rewards denoted by a 0 or 1 - Factor Variable
# date_joined - This is the day the customer activated their account - Date variable

Merging data sets

# Merge your synthetic data sets, provide R codes here.
# Here we will group by the two variables "cust_num" and "acc_num" before using left_join to merge the two sets

merged_synth <- synthetic1 %<>% left_join(synthetic2, by = c("cust_num","acc_num"))
head(merged_synth)

Checking structure of combined data

# Check structure of combined data and perform all necessary data type conversions, provide R codes here.
str(merged_synth)

## 'data.frame':    100 obs. of  9 variables:
##  $ cust_num           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ acc_num            : int  10080 10650 10194 10057 10551 10755 10626 10164 10410 10975 ...
##  $ purchases_made     : int  246 38 223 39 41 95 172 185 133 275 ...
##  $ ave_item.x         : int  22 9 31 15 36 34 17 12 28 29 ...
##  $ most_purchased_item: chr  "ice cream" "pork" "pork" "pork" ...
##  $ ave_item.y         : int  22 9 31 15 36 34 17 12 28 29 ...
##  $ ave_spend          : num  308 126 433 210 503 ...
##  $ Rewards.Account    : num  0 1 1 0 0 1 1 1 0 0 ...
##  $ date_joined        : Date, format: "2021-11-25" "2019-03-12" ...

# Here the only changed we need to make is convert the boolean variable into a logical variable 
merged_synth$Rewards.Account<-  as.logical(merged_synth$Rewards.Account)
str(merged_synth)

## 'data.frame':    100 obs. of  9 variables:
##  $ cust_num           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ acc_num            : int  10080 10650 10194 10057 10551 10755 10626 10164 10410 10975 ...
##  $ purchases_made     : int  246 38 223 39 41 95 172 185 133 275 ...
##  $ ave_item.x         : int  22 9 31 15 36 34 17 12 28 29 ...
##  $ most_purchased_item: chr  "ice cream" "pork" "pork" "pork" ...
##  $ ave_item.y         : int  22 9 31 15 36 34 17 12 28 29 ...
##  $ ave_spend          : num  308 126 433 210 503 ...
##  $ Rewards.Account    : logi  FALSE TRUE TRUE FALSE FALSE TRUE ...
##  $ date_joined        : Date, format: "2021-11-25" "2019-03-12" ...

# This data frame has 100 observations for 9 variables containg a mixture of integer, numeric, character and factor variables, as well as a dated variable

Generate summary statistics

# Generate summary statistics, provide R codes here.

# Group the data by the variable "acc_num"
grouped_synth <- merged_synth %>% group_by(acc_num)

# Define the quantiles to be able to attain the q1 and q3 values
quartiles = quantile(grouped_synth$ave_spend, c(0.25,0.75))

# Find summary statistics for mean, median, q1, q3 and std deviation
grouped_synth %>%
summarise(Mean = mean(ave_spend) , 
          Median = median(ave_spend),
          q1 = quartiles[1],
          q3 = quartiles[2],
          std_dev = sd(merged_synth$ave_spend)
)

Scanning data

# Scan variables for missing values, provide R codes here.

# Create an empty data frame to store the missing values in
missing_value_df <- data.frame()

# Use two for loops to search each value in the data frame. If any value is missing it will add the row to the empty data frame
for (i in 1:ncol(merged_synth)){
  for (j in 1:nrow(merged_synth)){
    if (is.na(merged_synth[j,i])){
     missing_value_df<- rbind(missing_value_df,merged_synth[j,])
      
    }
    
  }
}

# Here we can see that there are N/A values in the columns for "purchases_made" and "Rewards.Account"

# Firstly we will make the N/A in the "purchases_made" column to be median values, so that the row is filled but data is not altered durastically
for (i in 1:nrow(merged_synth)){
  if (is.na(merged_synth[i,3])){
    merged_synth[i,3] = median(merged_synth$purchases_made, na.rm = TRUE)
  }
}

# Secondly we will delete any rows without values in the "Rewards.Account" as it will not affect any of our calculations and there is no substitute that would be adequate

for (i in 1:nrow(merged_synth)){
  if (is.na(merged_synth[i,8])){
    merged_synth<- merged_synth[-i,]  
    }
}

# Now after running the below loop again we can check that no missing values have been let through the data set
for (i in 1:ncol(merged_synth)){
  for (j in 1:nrow(merged_synth)){
    if (is.na(merged_synth[j,i])){
     missing_value_df<- rbind(missing_value_df,merged_synth[j,])
      
    }
    
  }
}

Data Wrangling (Data Preprocessing)

Mid-term assessment

Brayden Smith S3717835

18/09/2023

Setup

Data generation

Merging data sets

Checking structure of combined data

Generate summary statistics

Scanning data